Compute the normalized cross correlation

Dans la cellule qui suit, j'ai écrit le code en supposant que les données proviennent d'un jeu de données (fichier CSV).

In [None]:
"""
Calcul corrélation croisée normalisée entre les données d'appareils et une fonction pour regrouper
les appareils corrélés en "super appareils"

"""


# Import librairies
import numpy as np
import pandas as pd


def compute_ncc(signal1, signal2):
    """Compute the zero-lag normalized cross-correlation (NCC) of two signals.

    Each signal is centered on its own mean; the sum of the element-wise
    products of the centered signals is then divided by the product of
    their Euclidean norms.

    Args:
        signal1: Sequence of numbers (list, tuple, NumPy array, ...).
        signal2: Sequence of numbers, expected to have the same length as
            ``signal1``.

    Returns:
        The NCC value, or ``nan`` when it is undefined, i.e. when either
        signal is empty or constant (zero norm after centering).
    """
    values1 = np.asarray(signal1)
    values2 = np.asarray(signal2)
    if values1.size == 0 or values2.size == 0:
        # The mean of an empty signal is undefined, and so is the NCC.
        return np.nan

    centered1 = values1 - values1.mean()
    centered2 = values2 - values2.mean()
    denominator = np.linalg.norm(centered1) * np.linalg.norm(centered2)
    if denominator == 0:
        # A constant signal has zero norm once centered; dividing would be
        # 0/0 and emit a RuntimeWarning, so report "undefined" explicitly.
        return np.nan
    return np.sum(centered1 * centered2) / denominator

def extend_to_same_length(signal1, signal2):
    """Pad the shorter signal with its own mean so both have equal length.

    Args:
        signal1: First signal (sequence of numbers).
        signal2: Second signal (sequence of numbers).

    Returns:
        A ``(signal1, signal2)`` tuple in the same order as the arguments.
        When the lengths already match, both inputs are returned unchanged.
        Otherwise the longer signal is returned unchanged and the shorter
        one is returned as a new NumPy array, extended at its end with its
        mean value.
    """
    len1, len2 = len(signal1), len(signal2)
    if len1 == len2:
        return signal1, signal2

    first_is_shorter = len1 < len2
    shorter = np.asarray(signal1 if first_is_shorter else signal2)
    if not np.issubdtype(shorter.dtype, np.inexact):
        # np.pad casts the fill value to the array dtype, so an integer
        # signal would be padded with a truncated mean (1.5 -> 1).
        shorter = shorter.astype(float)

    padded = np.pad(
        shorter,
        (0, abs(len1 - len2)),
        'constant',
        constant_values=shorter.mean(),
    )
    if first_is_shorter:
        return padded, signal2
    return signal1, padded

def cluster_devices(device_signals, ncc_threshold):
    """Cluster devices into "super device" groups based on NCC.

    Two devices are linked when the NCC of their signals (after the shorter
    one is extended with ``extend_to_same_length``) is greater than or equal
    to ``ncc_threshold``. Each group is one connected component of the
    resulting graph, so devices may share a group through intermediate
    devices without being directly correlated.

    Args:
        device_signals: Sized sequence of signals, one per device.
        ncc_threshold: Minimum NCC (inclusive) for two devices to be linked.

    Returns:
        A list of groups, each a list of device indices (positions in
        ``device_signals``). Linked groups come first, in order of their
        lowest index; devices with no link follow as single-element groups
        in ascending index order.
    """
    from collections import deque
    from itertools import combinations

    # NCC is symmetric, so each unordered pair is evaluated only once.
    device_graph = {}
    for (i, sig1), (j, sig2) in combinations(enumerate(device_signals), 2):
        sig1_ext, sig2_ext = extend_to_same_length(sig1, sig2)
        if compute_ncc(sig1_ext, sig2_ext) >= ncc_threshold:
            device_graph.setdefault(i, []).append(j)
            device_graph.setdefault(j, []).append(i)

    # Breadth-first search to collect each connected component.
    super_device_groups = []
    visited = set()
    for node in device_graph:
        if node in visited:
            continue
        group = [node]
        queue = deque(group)
        visited.add(node)
        while queue:
            curr = queue.popleft()
            for neighbor in device_graph[curr]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    group.append(neighbor)
                    queue.append(neighbor)
        super_device_groups.append(group)

    # Devices without any link form their own single-element group.
    for node in range(len(device_signals)):
        if node not in visited:
            super_device_groups.append([node])

    return super_device_groups

def read_device_signals(csv_file):
    """Read the device signals stored column-wise in a CSV file.

    Every column except ``'number_of_states'`` is treated as one signal;
    missing values are dropped from each column independently.

    Args:
        csv_file: Path or file-like object passed to ``pandas.read_csv``.

    Returns:
        A list with one array of values per signal column, in column order.
    """
    table = pd.read_csv(csv_file)
    signals = []
    for column_name in table.columns:
        if column_name == 'number_of_states':
            continue
        signals.append(table[column_name].dropna().values)
    return signals

# Usage: load the signals from a CSV file, cluster them and print the groups.
# Input file read by read_device_signals.
csv_file = 'device_signals.csv'  
# NCC threshold passed to cluster_devices.
ncc_threshold = 0.8

device_signals = read_device_signals(csv_file)
super_device_groups = cluster_devices(device_signals, ncc_threshold)
print("Super device groups:", super_device_groups)


Dans ce cas-ci, j'ai utilisé des données d'exemple arbitraires (codées en dur) pour exécuter le code.

In [51]:
"""
Calcul corrélation croisée normalisée entre les données d'appareils et une fonction pour regrouper
les appareils corrélés en "super appareils"

"""


# Import librairies
import numpy as np
import pandas as pd


def compute_ncc(signal1, signal2):
    """Compute the zero-lag normalized cross-correlation (NCC) of two signals.

    Each signal is centered on its own mean; the sum of the element-wise
    products of the centered signals is then divided by the product of
    their Euclidean norms.

    Args:
        signal1: Sequence of numbers (list, tuple, NumPy array, ...).
        signal2: Sequence of numbers, expected to have the same length as
            ``signal1``.

    Returns:
        The NCC value, or ``nan`` when it is undefined, i.e. when either
        signal is empty or constant (zero norm after centering).
    """
    values1 = np.asarray(signal1)
    values2 = np.asarray(signal2)
    if values1.size == 0 or values2.size == 0:
        # The mean of an empty signal is undefined, and so is the NCC.
        return np.nan

    centered1 = values1 - values1.mean()
    centered2 = values2 - values2.mean()
    denominator = np.linalg.norm(centered1) * np.linalg.norm(centered2)
    if denominator == 0:
        # A constant signal has zero norm once centered; dividing would be
        # 0/0 and emit a RuntimeWarning, so report "undefined" explicitly.
        return np.nan
    return np.sum(centered1 * centered2) / denominator

def extend_to_same_length(signal1, signal2):
    """Pad the shorter signal with its own mean so both have equal length.

    Args:
        signal1: First signal (sequence of numbers).
        signal2: Second signal (sequence of numbers).

    Returns:
        A ``(signal1, signal2)`` tuple in the same order as the arguments.
        When the lengths already match, both inputs are returned unchanged.
        Otherwise the longer signal is returned unchanged and the shorter
        one is returned as a new NumPy array, extended at its end with its
        mean value.
    """
    len1, len2 = len(signal1), len(signal2)
    if len1 == len2:
        return signal1, signal2

    first_is_shorter = len1 < len2
    shorter = np.asarray(signal1 if first_is_shorter else signal2)
    if not np.issubdtype(shorter.dtype, np.inexact):
        # np.pad casts the fill value to the array dtype, so an integer
        # signal would be padded with a truncated mean (1.5 -> 1).
        shorter = shorter.astype(float)

    padded = np.pad(
        shorter,
        (0, abs(len1 - len2)),
        'constant',
        constant_values=shorter.mean(),
    )
    if first_is_shorter:
        return padded, signal2
    return signal1, padded
    
def cluster_devices(device_signals, ncc_threshold):
    """Greedily pair correlated devices into "super device" groups.

    Devices are scanned in index order. Each device not yet assigned is
    paired with the first other unassigned device whose NCC with it is
    strictly greater than ``ncc_threshold``; if there is none, the device
    forms a group on its own. A group therefore holds at most two devices.

    Signals of different lengths are first brought to a common length with
    ``extend_to_same_length``.

    Args:
        device_signals: Sequence of signals, one per device.
        ncc_threshold: NCC value a pair must exceed to be grouped.

    Returns:
        A list of groups, each a list of one or two device indices
        (positions in ``device_signals``), in order of creation.
    """
    super_device_groups = []
    visited = set()

    for i, sig1 in enumerate(device_signals):
        if i in visited:
            continue

        # Look for the first still-unassigned device correlated with i.
        partner = None
        for j, sig2 in enumerate(device_signals):
            if j == i or j in visited:
                continue
            sig1_ext, sig2_ext = extend_to_same_length(sig1, sig2)
            if compute_ncc(sig1_ext, sig2_ext) > ncc_threshold:
                partner = j
                break

        # No sufficiently correlated device: the device stays alone.
        group = [i] if partner is None else [i, partner]
        super_device_groups.append(group)
        visited.update(group)

    return super_device_groups

def compute_ncc_matrix(signals):
    """Build the symmetric matrix of pairwise NCC values between signals.

    Args:
        signals: Sized sequence of signals, one per device.

    Returns:
        A square float array where entry ``[i, j]`` is the NCC between
        signals ``i`` and ``j``. The diagonal is left as NaN because a
        device is not compared with itself.
    """
    n_signals = len(signals)
    matrix = np.full((n_signals, n_signals), np.nan)
    for i, sig1 in enumerate(signals):
        for j in range(i + 1, n_signals):
            sig1_ext, sig2_ext = extend_to_same_length(sig1, signals[j])
            # NCC is symmetric: compute once, fill both triangles.
            matrix[i, j] = matrix[j, i] = compute_ncc(sig1_ext, sig2_ext)
    return matrix


# Example usage with hard-coded sample signals
device_signals = [
    [0.1, 0.2, 0.3, 0.4],
    [0.3, 0.4, 0.1, 0.2],
    [0.5, 0.6, 0.7, 0.8],
    [0.4, 0.3, 0.2, 0.1],
    [0.8, 1.5, 0.5, 0.9]
]
ncc_threshold = 0.8
super_device_groups = cluster_devices(device_signals, ncc_threshold)
print("Super device groups:", super_device_groups)

print("=" *52)

device_names = ["device1", "device2", "device3", "device4", "device5"]

# Compute the pairwise NCC matrix
ncc_matrix = compute_ncc_matrix(device_signals)

# Store it in a DataFrame labelled with the device names
df_ncc = pd.DataFrame(ncc_matrix, index=device_names, columns=device_names)

# Display the DataFrame
print(df_ncc.round(3))

# Save to a CSV file
output_csv = 'results_ncc.csv'
df_ncc.to_csv(output_csv, float_format='%.3f', na_rep='NaN')

print("=" *52)

print("NCC results have been saved to:", output_csv)


Super device groups: [[0, 2], [1, 4], [3]]
         device1  device2  device3  device4  device5
device1      NaN   -0.600    1.000   -1.000   -0.216
device2   -0.600      NaN   -0.600    0.600    0.893
device3    1.000   -0.600      NaN   -1.000   -0.216
device4   -1.000    0.600   -1.000      NaN    0.216
device5   -0.216    0.893   -0.216    0.216      NaN
NCC results have been saved to: results_ncc.csv
