## AIM: store synchrony features in a single file suitable for training GCNs

The synchrony features stored as .pkl files per subject will be stored in a single .pkl file for easy use for training the GCNs. This is done at this point in the pipeline, considering we will only select the subsampled participants after feature selection.

In [1]:
import pandas as pd
import numpy as np

In [2]:
import pickle
def load_file(file):
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [4]:
# import stat & conn features without feature selection
df_stat_conn_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_conn_features.pkl')
df_stat_conn_features = df_stat_conn_features.dropna(subset=['diagnosis'])

# obtain participant IDs
df_participants = df_stat_conn_features[df_stat_conn_features['epoch'] == 1]
subject_ids = df_participants['ID'].tolist()
print(subject_ids)
print(len(subject_ids))

['sub-87964717', 'sub-87965301', 'sub-87966337', 'sub-87966473', 'sub-87966517', 'sub-87966789', 'sub-87967057', 'sub-87967061', 'sub-87967325', 'sub-87967417', 'sub-87967509', 'sub-87967729', 'sub-87967773', 'sub-87967781', 'sub-87967869', 'sub-87967957', 'sub-87968045', 'sub-87968229', 'sub-87968405', 'sub-87968449', 'sub-87968541', 'sub-87968677', 'sub-87968765', 'sub-87968901', 'sub-87968945', 'sub-87969125', 'sub-87969349', 'sub-87969529', 'sub-87969573', 'sub-87969665', 'sub-87969805', 'sub-87969849', 'sub-87969937', 'sub-87969985', 'sub-87970297', 'sub-87970345', 'sub-87970389', 'sub-87970705', 'sub-87970881', 'sub-87970969', 'sub-87971021', 'sub-87971109', 'sub-87971197', 'sub-87971241', 'sub-87971373', 'sub-87974617', 'sub-87974621', 'sub-87974665', 'sub-87974709', 'sub-87974841', 'sub-87974973', 'sub-87976193', 'sub-87976369', 'sub-87976413', 'sub-87976457', 'sub-87976461', 'sub-87976505', 'sub-87976641', 'sub-87976773', 'sub-87976817', 'sub-87976953', 'sub-87977045', 'sub-87

In [5]:
# obtain synchrony features for each participant and store in list
from joblib import Parallel, delayed # for parallel processing
import os


def process_file(file):
    if '.pkl' in file: # filter pickle files
        filepath = os.path.join(subdir, file) # path to eeg file

        # load file
        feature = load_file(filepath)
        ID = file.split('_')[1]
        if 'EC' in file:
            subject_bands_syncro_matrix = np.zeros((12, 5, 9, 9))
            for i, band in enumerate(feature['syncros_eeg']):
                for epoch, matrix in enumerate(feature['syncros_eeg'][band]):
                    subject_bands_syncro_matrix[epoch, i] = matrix
            return ('EC_syncro', subject_bands_syncro_matrix, ID)

        if 'EO' in file:
            subject_bands_syncro_matrix = np.zeros((12, 5, 9, 9))
            for i, band in enumerate(feature['syncros_eeg']):
                for epoch, matrix in enumerate(feature['syncros_eeg'][band]):
                    subject_bands_syncro_matrix[epoch, i] = matrix
            return ('EO_syncro', subject_bands_syncro_matrix, ID)
        
feature_dir = r"D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\connectivity_features"

sample_ids = subject_ids # list of participants to include
count = 0
total_files = 0
for _, dirs, files in os.walk(feature_dir):
    total_files += len([file for file in files if '.pkl' in file and 'EC' in file and any(sample_id in file for sample_id in sample_ids)]) # filter participants to include])

subjects_ec_syncro_matrices = []
subjects_eo_syncro_matrices = []
subjects_ec_ids = []
subjects_eo_ids = []

for subdir, dirs, files in os.walk(feature_dir): # iterate through all files
    results = Parallel(n_jobs=-1)(delayed(process_file)(file) for file in files if any(sample_id in file for sample_id in sample_ids)) # filter participants to include)
    for result in results:
        if result is not None:
            if result[0] == 'EC_syncro':
                subjects_ec_syncro_matrices.append(result[1])
                subjects_ec_ids.append(result[2])
            elif result[0] == 'EO_syncro':
                subjects_eo_syncro_matrices.append(result[1])
                subjects_eo_ids.append(result[2])

    print(f'\rProgress: {count}/{total_files} files processed.', end = '')
    count += 1

Progress: 661/224 files processed.

In [6]:
synchrony_feature_dict = {'EC_syncro' : subjects_ec_syncro_matrices, 'EO_syncro' : subjects_eo_syncro_matrices, 'EC_IDs' : subjects_ec_ids, 'EO_IDs' : subjects_eo_ids}
with open(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\synchrony_feature_dict.pkl', 'wb') as handle:
    pickle.dump(synchrony_feature_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)