In [None]:
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import glob
from scipy.signal import butter, filtfilt
from scipy.signal import resample_poly
import librosa
from scipy.io import loadmat
import warnings
from scipy.io import savemat


**FASE DI PREPROCESSING E REORDERING DEI FILE IN COLONNE**

***

In [None]:
# Reduce the SEED dataset: for every recording keep one third of the trials,
# the first third of the channels and the first half of the samples.

input_path = '/kaggle/input/seed-dataset/Preprocessed_EEG'
output_path = '/kaggle/working/reduced_dataset'

# Make sure the destination folder exists before anything is written.
os.makedirs(output_path, exist_ok=True)

# Reduction factors.
channel_reduction_ratio = 3  # keep num_channels // 3 channels
sample_reduction_ratio = 2   # keep num_samples // 2 samples

# Every .mat file except the label file is a recording to reduce.
mat_files = []
for entry in os.listdir(input_path):
    if entry.endswith('.mat') and entry != 'label.mat':
        mat_files.append(entry)
print(f"Trovati {len(mat_files)} file .mat nella directory.")

for file_name in mat_files:
    file_path = os.path.join(input_path, file_name)
    print(f"Processando il file: {file_name}")

    mat_data = loadmat(file_path)

    # Trial entries are the ndarray values whose key is not a loadmat
    # metadata key (those start with a double underscore).
    trial_keys = []
    for key in sorted(mat_data.keys()):
        if key.startswith('__'):
            continue
        if isinstance(mat_data[key], np.ndarray):
            trial_keys.append(key)
    print(f"Trial trovati in {file_name}: {trial_keys}")

    if len(trial_keys) == 0:
        print(f"Nessun trial trovato in {file_name}, salto il file.")
        continue

    # Re-seeding for every file makes the selected third reproducible.
    random.seed(42)
    selected_keys = random.sample(trial_keys, len(trial_keys) // 3)

    for key in selected_keys:  # only the sampled trials are exported
        data = mat_data[key]

        # Leading block of channels (rows) and samples (columns).
        reduced_channels = data.shape[0] // channel_reduction_ratio
        reduced_samples = data.shape[1] // sample_reduction_ratio
        reduced_data = data[:reduced_channels, :reduced_samples]

        # One .npz per (recording, trial) pair.
        output_file = os.path.join(output_path, f"{file_name.replace('.mat', '')}_{key}.npz")
        np.savez(output_file, data=reduced_data)

print(f"Dataset ridotto salvato in: {output_path}")


In [None]:
# Preprocessing of the reduced SEED dataset: input/output locations.

# Folder holding the reduced .npz files, and the list of files found in it.
folder_path = '/kaggle/working/reduced_dataset'
file_paths = glob.glob(f"{folder_path}/*.npz")
# Folder that receives the preprocessed .npy files; created if missing.
folder1_path = '/kaggle/working/preprocessed'
os.makedirs(folder1_path, exist_ok=True)

# NOTE(review): this silences every warning for the rest of the kernel
# session, not only for this cell -- confirm that is intended.
warnings.filterwarnings("ignore")

# Butterworth band-pass filter design
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Both cut-off frequencies are divided by the Nyquist frequency (``fs / 2``)
    before being handed to ``scipy.signal.butter``.

    Args:
        lowcut: lower cut-off frequency, in the same unit as ``fs``.
        highcut: upper cut-off frequency, in the same unit as ``fs``.
        fs: sampling frequency.
        order: filter order passed to ``butter``.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``butter``.
    """
    half_fs = 0.5 * fs
    band = [lowcut / half_fs, highcut / half_fs]
    return butter(order, band, btype='band')

# Plot a raw channel above its cleaned version
def plot_signals(original_signal, cleaned_signal, channel_idx, file_name):
    """Show the original and the cleaned signal of one channel in two stacked panels.

    Args:
        original_signal: samples of the channel before cleaning.
        cleaned_signal: samples of the same channel after cleaning.
        channel_idx: channel index, used in the panel titles only.
        file_name: recording name, used in the panel titles only.
    """
    # (signal, legend label, line colour, tag shown in the title) per panel.
    panels = (
        (original_signal, 'Originale', 'blue', 'Originale'),
        (cleaned_signal, 'Pulito (±500 µV)', 'red', 'Pulito'),
    )

    plt.figure(figsize=(15, 10))
    for position, (signal, legend_label, colour, title_tag) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.plot(signal, label=legend_label, color=colour)
        plt.title(f"{file_name} - Canale {channel_idx} ({title_tag})")
        plt.xlabel("Campioni")
        plt.ylabel("Ampiezza")
        plt.legend()

    plt.tight_layout()
    plt.show()


# Clean every reduced SEED recording and store the result as a .npy file.
# A failure on one file is reported and the loop moves on to the next one.
for file_path in file_paths:
    try:
        file_name = os.path.basename(file_path).replace('.npz', '')
        print(f"Processando il file: {file_name}")

        # np.load on a .npz archive returns a lazy NpzFile that keeps the file
        # open; the context manager closes it once all arrays have been read.
        with np.load(file_path) as npz_data:
            for key in npz_data.keys():
                data = npz_data[key]
                print(f"Elaborazione della chiave: {key} - Forma: {data.shape}")

                # Amplitude rejection for all channels at once: every sample
                # whose magnitude exceeds 500 is replaced by 0. The copy keeps
                # the array read from the archive untouched.
                processed_data = data.copy()
                processed_data[np.abs(processed_data) > 500] = 0
                print(f"Dati processati per la chiave {key}: forma {processed_data.shape}")

                output_path = os.path.join(folder1_path, f"processed_{file_name}_{key}.npy")
                np.save(output_path, processed_data)
                print(f"Dati salvati in formato NumPy: {output_path}")

    except Exception as e:
        # Deliberate best effort: one unreadable file must not stop the batch.
        print(f"Errore durante il preprocessing del file {file_path}: {e}")


In [None]:
#Preprocessamento dataset dataverse
# Butterworth band-pass filter design (this cell defines the helper again so it
# can run on its own)
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Return the ``(b, a)`` coefficients of a Butterworth band-pass filter.

    Args:
        lowcut: lower cut-off frequency, in the same unit as ``fs``.
        highcut: upper cut-off frequency, in the same unit as ``fs``.
        fs: sampling frequency; the cut-offs are normalised by ``fs / 2``.
        order: filter order passed to ``scipy.signal.butter``.
    """
    nyq = 0.5 * fs
    normalised_band = [edge / nyq for edge in (lowcut, highcut)]
    return butter(order, normalised_band, btype='band')

# Zero-phase Butterworth band-pass filtering
def butter_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` forwards and backwards (zero phase).

    The filter is designed and applied as second-order sections. With a
    low edge very close to 0 relative to ``fs`` (this notebook uses 0.3 Hz at
    200 Hz) the transfer-function ``(b, a)`` form of a band-pass of this order
    is numerically sensitive; SciPy recommends the SOS form for filtering.

    Args:
        data: signal to filter; filtering runs along the last axis.
        lowcut: lower cut-off frequency, in the same unit as ``fs``.
        highcut: upper cut-off frequency, in the same unit as ``fs``.
        fs: sampling frequency.
        order: Butterworth order passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal, same shape as ``data``.

    Raises:
        ValueError: raised by SciPy when the cut-offs are not inside
            ``(0, fs / 2)`` or the signal is too short for edge padding.
    """
    # Local import: the notebook's import cell only brings in butter/filtfilt.
    from scipy.signal import sosfiltfilt

    nyquist = 0.5 * fs
    sos = butter(order, [lowcut / nyquist, highcut / nyquist], btype='band', output='sos')
    return sosfiltfilt(sos, data)

# Resampling helper built on librosa
def librosa_resample(data, original_fs, target_fs):
    """Resample ``data`` from ``original_fs`` to ``target_fs`` with librosa.

    NaN and infinite samples are first replaced by finite values through
    ``np.nan_to_num``; the resampling uses the ``'soxr_hq'`` resampler.
    """
    finite_signal = np.nan_to_num(data)
    resample_options = dict(orig_sr=original_fs, target_sr=target_fs, res_type='soxr_hq')
    return librosa.resample(finite_signal, **resample_options)


# Input folder with the Dataverse .txt recordings and output folder for the
# preprocessed .npy files.
folder_path = '/kaggle/input/dataverse-files/dataverse_files'
folder1_path = '/kaggle/working/preprocessed'
# Create the output folder here as well, so this cell does not rely on the
# SEED preprocessing cell having been executed first.
os.makedirs(folder1_path, exist_ok=True)


file_paths = glob.glob(os.path.join(folder_path, '*.txt'))

# Sampling frequencies: the recordings are resampled from original_fs to target_fs.
target_fs = 200  # desired sampling frequency
original_fs = 256
# Butterworth band-pass edges.
lowcut = 0.3
highcut = 80.0

# Signal duration in seconds (taken from the paper, which uses 20 s recordings).
# NOTE(review): not referenced by the code visible in this notebook.
signal_duration = 20

# Preprocess every Dataverse recording: resample, band-pass filter, reject
# out-of-range samples, then save one (channels x samples) array per file.
for file_path in file_paths:

    file_name = os.path.basename(file_path).replace('.txt', '')
    print(f"Processando il file: {file_name}")

    # Tab-separated text without header; each column is handled as one channel.
    data = pd.read_csv(file_path, sep='\t', header=None)

    cleaned_signals = []
    for channel_idx in range(data.shape[1]):
        # Column as a NumPy array, with NaN/inf replaced by finite values.
        original_signal = np.nan_to_num(data.iloc[:, channel_idx].values)

        # Resample to the target rate, then band-pass at that rate.
        filtered_signal = butter_filter(
            librosa_resample(original_signal, original_fs, target_fs),
            lowcut,
            highcut,
            target_fs,
        )

        # Samples beyond ±500 are marked missing, filled by interpolation,
        # and whatever is still missing afterwards becomes 0.
        outlier_mask = np.abs(filtered_signal) > 500
        cleaned_signal = filtered_signal.copy()
        cleaned_signal[outlier_mask] = np.nan
        cleaned_signal = pd.Series(cleaned_signal).interpolate().fillna(0).values
        cleaned_signals.append(cleaned_signal)

    # Stack the cleaned channels into a single array.
    cleaned_data = np.array(cleaned_signals)
    print(f"Dati processati forma {cleaned_data.shape}")
    output_path = os.path.join(folder1_path, f"preprocessing_{file_name}.npy")
    np.save(output_path, cleaned_data)
    print(f"File con preprocessing salvato come npy: {output_path}")

In [None]:
# ------------------- Helpers to save and load models -------------------
# Folder where the training loop writes its checkpoints; created up front.
save_path = "/kaggle/working/models"
os.makedirs(save_path, exist_ok=True)

def save_model(model, path):
    """Write ``model.state_dict()`` to ``path`` and report the location on stdout."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Modello salvato in {path}")


# Load weights, stripping the "module." prefix from the keys when present
def load_model(model, path, device="cuda"):
    """Load a state dict saved at ``path`` into ``model`` (in place).

    When every key of the stored state dict starts with ``"module."``, that
    leading prefix is removed before loading. Only the leading occurrence is
    removed, so keys that merely contain the text (for example
    ``feat_module.weight``) are left intact.

    Args:
        model: module that receives the weights.
        path: checkpoint file written with ``torch.save``.
        device: ``map_location`` passed to ``torch.load``.

    Raises:
        RuntimeError: from ``load_state_dict`` when the keys do not match ``model``.
    """
    prefix = "module."
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(path, map_location=device)
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    print(f"Modello caricato da {path}")

**FASE DI TRAINING DELL'ENCODER RICORRENTE E CONVOLUZIONALE**

********

In [None]:
#AUGMENTATION E CHUNK DATI
def min_max_amplitude_scale(data, scale_min=0.5, scale_max=2):
    """Multiply ``data`` by one factor drawn uniformly from ``[scale_min, scale_max]``."""
    return data * random.uniform(scale_min, scale_max)

def time_shift(data, shift_min=-50, shift_max=50):
    """Circularly shift ``data`` by a random integer offset in ``[shift_min, shift_max]``."""
    offset = random.randint(shift_min, shift_max)
    return np.roll(data, shift=offset)

def dc_shift(data, shift_min=-10, shift_max=10):
    """Add one constant offset, drawn uniformly from ``[shift_min, shift_max]``, to ``data``."""
    return data + random.uniform(shift_min, shift_max)

def zero_masking(data, mask_min=0, mask_max=150):
    """Zero out one randomly placed contiguous window of ``data``.

    The window length is drawn from ``[mask_min, mask_max]`` and capped at
    ``len(data)``, so a signal shorter than the drawn length is zeroed
    entirely instead of raising ``ValueError`` when the start is drawn.

    Note: ``data`` is modified in place and also returned; pass a copy when
    the original must be preserved.

    Args:
        data: 1-D signal supporting ``len`` and slice assignment.
        mask_min: smallest window length (inclusive).
        mask_max: largest window length (inclusive).

    Returns:
        ``data`` itself, with the window set to 0.
    """
    n_samples = len(data)
    mask_size = min(random.randint(mask_min, mask_max), n_samples)
    start_idx = random.randint(0, n_samples - mask_size)
    data[start_idx:start_idx + mask_size] = 0
    return data

def add_gaussian_noise(data, sigma_min=0, sigma_max=0.2):
    """Add zero-mean Gaussian noise to ``data``.

    The standard deviation is drawn uniformly from ``[sigma_min, sigma_max]``
    with the ``random`` module; the noise itself (``len(data)`` values) comes
    from ``np.random``.
    """
    noise_std = random.uniform(sigma_min, sigma_max)
    return data + np.random.normal(loc=0, scale=noise_std, size=len(data))

def apply_random_transformations_class(channel_data):
    """Apply one randomly chosen augmentation to a copy of ``channel_data``.

    The augmentation is picked among amplitude scaling, time shift, DC shift,
    zero masking and additive Gaussian noise; the input itself is left untouched.
    """
    candidates = (min_max_amplitude_scale, time_shift, dc_shift, zero_masking, add_gaussian_noise)
    augment = random.choice(candidates)
    return augment(channel_data.copy())
    
def apply_random_transformations(channel_data):
    """Build two differently augmented views of ``channel_data``.

    Two distinct augmentations are sampled among amplitude scaling, time
    shift, DC shift, zero masking and additive Gaussian noise; each one is
    applied to its own copy of the input.

    Returns:
        A ``(view_1, view_2)`` tuple.
    """
    candidates = (min_max_amplitude_scale, time_shift, dc_shift, zero_masking, add_gaussian_noise)
    first_augment, second_augment = random.sample(candidates, 2)
    view_1 = first_augment(channel_data.copy())
    view_2 = second_augment(channel_data.copy())
    return view_1, view_2
    
    
def chunk_data(data, chunk_size=4000):
    """Split ``data`` into consecutive, non-overlapping chunks of ``chunk_size`` items.

    A trailing remainder shorter than ``chunk_size`` is dropped.

    Returns:
        A list of slices of ``data``, each of length ``chunk_size``.
    """
    usable_length = (len(data) // chunk_size) * chunk_size
    return [data[start:start + chunk_size] for start in range(0, usable_length, chunk_size)]


In [None]:
class ConvolutionalEncoder(nn.Module):
    """Convolutional encoder applied along the time axis of a signal.

    Three parallel 1-D convolutions (kernel sizes 128, 64 and 16) are
    concatenated, mixed by a dense layer, refined by ``repeat_blocks``
    residual convolutional blocks and projected to ``output_dim`` channels.
    Every convolution is preceded by a reflection padding of
    ``kernel_size - 1`` samples in total, so the sequence length is preserved.

    Args:
        input_channels: number of channels of the input signal.
        output_dim: number of output features per time step.
        repeat_blocks: number of residual convolutional blocks.
    """

    def __init__(self, input_channels=1, output_dim=4, repeat_blocks=4):
        super(ConvolutionalEncoder, self).__init__()
        
        # Parallel convolutional paths (100 + 100 + 50 output channels)
        self.conv1d_128 = nn.Sequential(
            nn.ReflectionPad1d((63, 64)),
            nn.Conv1d(input_channels, 100, kernel_size=128, stride=1)
        )
        self.conv1d_64 = nn.Sequential(
            nn.ReflectionPad1d((31, 32)),
            nn.Conv1d(input_channels, 100, kernel_size=64, stride=1)
        )
        self.conv1d_16 = nn.Sequential(
            nn.ReflectionPad1d((7, 8)),
            nn.Conv1d(input_channels, 50, kernel_size=16, stride=1)
        )

        # Dense layer that mixes the 250 concatenated channels
        self.concat_dense = nn.Linear(100 + 100 + 50, 250)

        # `repeat_blocks` blocks (4 by default): ReLU -> BatchNorm -> padded conv
        self.repeat_blocks = nn.ModuleList([
            nn.Sequential(
                nn.ReLU(),
                nn.BatchNorm1d(250),
                nn.ReflectionPad1d((31, 32)),
                nn.Conv1d(250, 250, kernel_size=64, stride=1)
            ) for _ in range(repeat_blocks)
        ])

        # Final block: ReLU -> BatchNorm -> padded conv down to output_dim channels
        self.final_relu = nn.ReLU()
        self.final_bn = nn.BatchNorm1d(250)
        self.final_conv = nn.Sequential(
            nn.ReflectionPad1d((31, 32)),
            nn.Conv1d(250, output_dim, kernel_size=64, stride=1)
        )
        
    def forward(self, x):
        """Encode ``x`` of shape [batch_size, sequence_length, channels].

        Returns a tensor of shape [batch_size, sequence_length, output_dim].
        """
        # Conv1d expects channels first: [batch_size, channels, sequence_length]
        x = x.permute(0, 2, 1)
        
        # Parallel convolutional paths
        x1 = self.conv1d_128(x)
        x2 = self.conv1d_64(x)
        x3 = self.conv1d_16(x)

        # Concatenate paths along the channel axis
        x_cat = torch.cat([x1, x2, x3], dim=1)  # [batch_size, 250, sequence_length]
        
        # Dense layer acts on the channel axis, hence the permute before and after
        x_dense = self.concat_dense(x_cat.permute(0, 2, 1)).permute(0, 2, 1)

        # Repeated blocks
        x_repeated = x_dense
        for block in self.repeat_blocks:
            x_repeated = x_repeated + block(x_repeated)  # Residual connection

        # Final block
        x_final = self.final_relu(x_repeated)
        x_final = self.final_bn(x_final)
        x_final = self.final_conv(x_final)
        # Back to time-major layout
        x_final = x_final.permute(0, 2, 1)
        return x_final  # [batch_size, sequence_length, output_dim]

In [None]:
class RecurrentEncoder(nn.Module):
    """Recurrent encoder built from a cascade of GRUs of decreasing width.

    The input runs through GRUs with 256, 128 and 64 hidden units (a
    ReLU-activated linear layer narrows the features between them). The three
    GRU outputs are concatenated, mixed into ``hidden_dim`` features, refined
    by ``repeat_n`` residual LayerNorm+GRU blocks and projected to
    ``output_dim`` features per time step.

    Args:
        input_dim: number of input features per time step.
        output_dim: number of output features per time step.
        hidden_dim: width of the mixed representation and of the residual blocks.
        repeat_n: number of residual LayerNorm+GRU blocks.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128, repeat_n=2):
        super(RecurrentEncoder, self).__init__()

        self.gru_256 = nn.GRU(input_dim, 256, batch_first=True)
        self.downsample_256_128 = nn.Linear(256, 128)
        self.gru_128 = nn.GRU(128, 128, batch_first=True)
        self.downsample_128_64 = nn.Linear(128, 64)
        self.gru_64 = nn.GRU(64, 64, batch_first=True)

        # NOTE(review): these two layers are not used by forward(). They are
        # kept so parameter initialisation order and checkpoint keys do not
        # change -- confirm against the reference architecture whether the
        # upsampled features were meant to feed the concatenation.
        self.upsample_64_128 = nn.Linear(64, 128)
        self.upsample_128_256 = nn.Linear(128, 256)

        self.concat_dense = nn.Linear(256 + 128 + 64, hidden_dim)

        # The GRU is the last element of each Sequential, so calling a block
        # returns the GRU's (output, hidden_state) tuple.
        self.rru = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(hidden_dim),
                nn.GRU(hidden_dim, hidden_dim, batch_first=True)
            ) for _ in range(repeat_n)
        ])

        self.output_dense = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Encode ``x`` of shape [batch, seq_len, input_dim] into [batch, seq_len, output_dim]."""
        x_256, _ = self.gru_256(x)
        x_128, _ = self.gru_128(F.relu(self.downsample_256_128(x_256)))
        x_64, _ = self.gru_64(F.relu(self.downsample_128_64(x_128)))

        # The upsampled tensors that used to be computed here were never read,
        # so dropping them leaves the output unchanged.
        x_concat = torch.cat([x_256, x_128, x_64], dim=-1)
        x_hidden = F.relu(self.concat_dense(x_concat))

        for rru_layer in self.rru:
            residual, _ = rru_layer(x_hidden)
            x_hidden = x_hidden + residual

        return self.output_dense(x_hidden)


In [None]:
class Projector(nn.Module):
    """Projection head that turns an encoded sequence into one vector per sample.

    Three stacked stages (linear + ReLU followed by a bidirectional LSTM) of
    width 256, 128 and 64 process the sequence. The final forward and backward
    hidden states of each stage are concatenated (256 + 128 + 64 features) and
    mapped to ``output_dim`` features by two dense layers.

    Args:
        input_dim: number of features per time step of the input.
        output_dim: size of the projected vector.
    """

    def __init__(self, input_dim, output_dim=32):
        super().__init__()

        self.downsample_1 = nn.Linear(input_dim, 256)
        self.bilstm_256 = nn.LSTM(256, 128, batch_first=True, bidirectional=True)

        self.downsample_2 = nn.Linear(256, 128)
        self.bilstm_128 = nn.LSTM(128, 64, batch_first=True, bidirectional=True)

        self.downsample_3 = nn.Linear(128, 64)
        self.bilstm_64 = nn.LSTM(64, 32, batch_first=True, bidirectional=True)

        self.concat_dense_1 = nn.Linear(256 + 128 + 64, 128)
        self.concat_dense_2 = nn.Linear(128, output_dim)

    @staticmethod
    def _final_states(hidden):
        """Concatenate the final hidden states of the two LSTM directions."""
        return torch.cat([hidden[0], hidden[1]], dim=-1)

    def forward(self, x):
        """Project ``x`` of shape [batch, seq_len, input_dim] to [batch, output_dim]."""
        seq_256, (h_256, _) = self.bilstm_256(F.relu(self.downsample_1(x)))
        seq_128, (h_128, _) = self.bilstm_128(F.relu(self.downsample_2(seq_256)))
        _, (h_64, _) = self.bilstm_64(F.relu(self.downsample_3(seq_128)))

        # One summary vector per stage, widest stage first.
        summary = torch.cat(
            [self._final_states(hidden) for hidden in (h_256, h_128, h_64)],
            dim=-1,
        )

        return self.concat_dense_2(F.relu(self.concat_dense_1(summary)))
        
class NTXentLoss(nn.Module):
    """Temperature-scaled contrastive loss with explicit negatives.

    For every anchor ``z_i[k]`` the positive is ``z_j[k]`` and the negatives
    are all rows of ``z_neg``. The per-anchor loss is
    ``-log(exp(s_pos) / (exp(s_pos) + sum(exp(s_neg))))`` where ``s`` is the
    cosine similarity divided by the temperature. It is evaluated with
    ``logsumexp`` so that small temperatures do not overflow ``exp``.

    Args:
        temperature: divisor applied to the cosine similarities.
    """

    def __init__(self, temperature=0.05):
        super(NTXentLoss, self).__init__()
        self.temperature = temperature

    def forward(self, z_i, z_j, z_neg):
        """Return the mean loss over the batch.

        Args:
            z_i: anchors, shape [batch, dim].
            z_j: positives, shape [batch, dim], row-aligned with ``z_i``.
            z_neg: negatives, shape [n_neg, dim].
        """
        z_i = F.normalize(z_i, p=2, dim=-1)
        z_j = F.normalize(z_j, p=2, dim=-1)
        z_neg = F.normalize(z_neg, p=2, dim=-1)

        # Positive similarity: row-wise dot product. This equals the diagonal
        # of z_i @ z_j.T without building the full matrix.
        sim_pos = (z_i * z_j).sum(dim=-1, keepdim=True) / self.temperature  # [batch, 1]
        # Negative similarities: every anchor against every negative.
        sim_neg = torch.matmul(z_i, z_neg.T) / self.temperature  # [batch, n_neg]

        # -log(e^pos / (e^pos + sum e^neg)) == logsumexp([pos, neg]) - pos
        logits = torch.cat([sim_pos, sim_neg], dim=1)
        loss = torch.logsumexp(logits, dim=1) - sim_pos.squeeze(1)
        return loss.mean()

In [None]:
#------------------ SEED ENCODER ---------------
import matplotlib.pyplot as plt

# Plot the two augmented views of a chunk next to its negative chunk
def plot_transformed_chunks(data1, data2, negative, idx):
    """Draw the two transformed chunks and the negative chunk side by side.

    Args:
        data1: first transformed chunk (tensor).
        data2: second transformed chunk (tensor).
        negative: negative chunk (tensor).
        idx: chunk index, used in the panel titles only.
    """
    # (tensor, label, line colour) per panel; the label is reused in the title.
    panels = (
        (data1, 'Trasformazione 1', 'blue'),
        (data2, 'Trasformazione 2', 'green'),
        (negative, 'Chunk Negativo', 'red'),
    )

    plt.figure(figsize=(15, 5))
    for position, (chunk, label, colour) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.plot(chunk.squeeze().cpu().numpy(), label=label, color=colour)
        plt.title(f'{label} - Chunk {idx}')
        plt.xlabel('Campioni')
        plt.ylabel('Ampiezza')
        plt.legend()

    plt.tight_layout()
    plt.show()

class CompleteContrastiveDataset(Dataset):
    """Dataset of signal chunks for contrastive training.

    Every file in ``file_paths`` is loaded with ``np.load`` and each of its
    rows (channels) is split into chunks of ``chunk_size`` samples. An item is
    a triple ``(view_1, view_2, negative)``: two augmented views of the chunk
    and one unmodified chunk drawn from a different source file.

    Args:
        file_paths: paths of the NumPy files to load.
        chunk_size: number of samples per chunk.
    """

    def __init__(self, file_paths, chunk_size=4000):
        self.all_chunks = []  # every chunk, in loading order
        self.chunk_sources = []  # source file of each chunk (parallel to all_chunks)
        # source file -> ascending list of [start, end) index ranges that its
        # chunks occupy in all_chunks; lets a negative be drawn in O(1)
        # instead of scanning chunk_sources for every item.
        self._source_ranges = {}

        for file_path in file_paths:
            data = np.load(file_path)
            first_idx = len(self.all_chunks)

            # Split every channel into chunks and remember where they came from.
            for channel_idx in range(data.shape[0]):
                chunks = chunk_data(data[channel_idx], chunk_size)
                self.all_chunks.extend(chunks)
                self.chunk_sources.extend([file_path] * len(chunks))

            last_idx = len(self.all_chunks)
            if last_idx > first_idx:
                self._source_ranges.setdefault(file_path, []).append((first_idx, last_idx))

        print(f"Dataset creato con {len(self.all_chunks)} chunk totali.")

    def __len__(self):
        return len(self.all_chunks)

    def _sample_negative_index(self, source_file):
        """Draw uniformly the index of a chunk that does not come from ``source_file``.

        Raises:
            IndexError: when no chunk from another source file exists.
        """
        excluded = self._source_ranges.get(source_file, ())
        n_candidates = len(self.all_chunks) - sum(end - start for start, end in excluded)
        if n_candidates <= 0:
            raise IndexError(
                f"No negative chunk available: every chunk comes from {source_file!r}"
            )

        # Draw a position among the allowed chunks, then skip over the
        # excluded ranges to turn it into an index of all_chunks.
        idx = random.randrange(n_candidates)
        for start, end in excluded:
            if idx < start:
                break
            idx += end - start
        return idx

    def __getitem__(self, idx):
        chunk = self.all_chunks[idx]
        source_file = self.chunk_sources[idx]

        # Positive pair: two differently augmented views of the same chunk.
        transformed_1, transformed_2 = apply_random_transformations(chunk)

        # Negative: a chunk taken from a different source file.
        negative_chunk = self.all_chunks[self._sample_negative_index(source_file)]

        # Add a trailing feature axis: [chunk_size] -> [chunk_size, 1].
        data1 = torch.tensor(transformed_1, dtype=torch.float32).unsqueeze(-1)
        data2 = torch.tensor(transformed_2, dtype=torch.float32).unsqueeze(-1)
        negative = torch.tensor(negative_chunk, dtype=torch.float32).unsqueeze(-1)

        return data1, data2, negative



# ------------------- Training function -------------------
def train_contrastive_learning(model_type, file_paths, encoder, projector, optimizer, epochs=10, batch_size=10, device="cuda"):
    """Train ``encoder`` and ``projector`` with the contrastive NT-Xent loss.

    Builds a ``CompleteContrastiveDataset`` over ``file_paths``, wraps both
    models in ``nn.DataParallel``, and after every epoch saves both state
    dicts under the module-level ``save_path`` folder.

    Args:
        model_type: label used in the checkpoint file names.
        file_paths: NumPy files handed to ``CompleteContrastiveDataset``.
        encoder: encoder module to train.
        projector: projection head applied to the encoder output.
        optimizer: optimizer stepped once per batch; it is expected to have
            been built over the parameters of ``encoder`` and ``projector``
            (as the run cells of this notebook do).
        epochs: number of passes over the dataset.
        batch_size: batch size; incomplete final batches are dropped.
        device: device the models and the batches are moved to.
    """
    loss_fn = NTXentLoss()

    
    dataset = CompleteContrastiveDataset(file_paths, chunk_size=4000)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    
    # Parallelise over the available GPUs
    # NOTE(review): the state dicts saved below come from the DataParallel
    # wrappers, so their keys presumably carry a "module." prefix -- load_model
    # strips it.
    encoder = nn.DataParallel(encoder)
    projector = nn.DataParallel(projector)
    encoder.to(device)
    projector.to(device)

    for epoch in range(epochs):
        print(f"=== Epoca {epoch + 1}/{epochs} ===")
        total_loss = 0

        for batch_idx, (data1, data2, neg_data) in enumerate(dataloader):
            data1, data2, neg_data= (
                data1.to(device),
                data2.to(device),
                neg_data.to(device),
            )

            # Forward pass through encoder and projector for both views and the negatives
            z1 = projector(encoder(data1))
            z2 = projector(encoder(data2))
            z_neg = projector(encoder(neg_data))

            # Loss computation
            loss = loss_fn(z1, z2, z_neg)

            # Standard optimisation step: reset gradients, backpropagate, update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            print(f"Batch {batch_idx + 1}/{len(dataloader)}, Perdita: {loss.item():.4f}")
            # Debug aid (disabled): plot_transformed_chunks(data1[batch_idx], data2[batch_idx], neg_data[batch_idx], idx=batch_idx)

        # total_loss is the sum of the batch losses of the epoch (not the mean)
        print(f"Perdita totale per epoca {epoch + 1}: {total_loss:.4f}")
        save_model(encoder, os.path.join(save_path, f"{model_type}_epoch_{epoch + 1}.pth"))
        save_model(projector, os.path.join(save_path, f"projector_{model_type}_epoch_{epoch + 1}.pth"))

In [None]:
# Run the contrastive training of the recurrent encoder

if __name__ == "__main__":
    # Every .npy file of the preprocessed folder takes part in training.
    folder_path = "/kaggle/working/preprocessed"
    npy_names = [name for name in os.listdir(folder_path) if name.endswith('.npy')]
    file_paths = [os.path.join(folder_path, name) for name in npy_names]

    encoder = RecurrentEncoder(input_dim=1, output_dim=4)
    projector = Projector(input_dim=4, output_dim=32)

    # A single optimizer updates encoder and projector together.
    trainable_params = [*encoder.parameters(), *projector.parameters()]
    optimizer = torch.optim.Adam(trainable_params, lr=1e-4)
    train_contrastive_learning(
        "Recurrent",
        file_paths,
        encoder,
        projector,
        optimizer,
        epochs=30,
        batch_size=40,
        device="cuda",
    )

In [None]:
# Run the contrastive training of the convolutional encoder
if __name__ == "__main__":
    # Every .npy file of the preprocessed folder takes part in training.
    folder_path = "/kaggle/working/preprocessed"
    npy_names = [name for name in os.listdir(folder_path) if name.endswith('.npy')]
    file_paths = [os.path.join(folder_path, name) for name in npy_names]

    encoder = ConvolutionalEncoder(input_channels=1, output_dim=4)
    projector = Projector(input_dim=4, output_dim=32)

    # A single optimizer updates encoder and projector together.
    trainable_params = [*encoder.parameters(), *projector.parameters()]
    optimizer = torch.optim.Adam(trainable_params, lr=1e-4)

    train_contrastive_learning(
        "Convolutional",
        file_paths,
        encoder,
        projector,
        optimizer,
        epochs=30,
        batch_size=40,
        device="cuda",
    )