# AIM: extract functional connectivity features

Most of the code has been taken from a notebook provided by Federico Zamberlan, but has been adjusted for the TDBRAIN data, and grouping among channels

1. load preprocessed EEG data
2. obtain synchrony between channels per freq band, with phases obtained with Hilbert transform
3. aggregate synchrony between channels to channel groups per freq band
4. store as .pkl file

In [1]:
import numpy as np
import pandas as pd
import os
import mne
from mne.datasets import fetch_fsaverage
from mne.minimum_norm import make_inverse_operator
from mne.minimum_norm import apply_inverse_epochs

from mne.filter import filter_data
from scipy.signal import hilbert

from tqdm import tqdm

from itertools import combinations as combs_without
from itertools import combinations_with_replacement as combs_with

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# prevent extensive logging
mne.set_log_level('WARNING')

In [2]:
import pickle

def save_file(data, folder, file):
    with open(folder+file+".pkl", 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_file(file):
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [3]:
# load participant data
df_participants = pd.read_pickle('D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TDBRAIN_participants_V2_data\df_participants.pkl')
print(f'all participants: {df_participants.shape}')
df_participants.sample(5)

all participants: (714, 12)


Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,age,gender,sessID,nrSessions,EC,EO,diagnosis
1140,sub-88063129,DISCOVERY,ADHD,UNKNOWN,,16.62,0,1,1,True,True,ADHD
169,sub-87967781,DISCOVERY,SMC,UNKNOWN,,61.51,0,1,2,True,True,SMC
816,sub-88037609,DISCOVERY,ADHD,UNKNOWN,,45.74,0,1,1,True,True,ADHD
1327,sub-88077393,DISCOVERY,ADHD,UNKNOWN,,14.69,1,1,1,True,True,ADHD
712,sub-88026593,DISCOVERY,MDD,UNKNOWN,,43.71,0,1,1,True,True,MDD


In [4]:
## Set montage based on channel names and locations provided in Van Dijk et al., (2022) (Copied from Anne van Duijvenbode)

ch_types = ['eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg',\
           'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', \
           'eog', 'eog', 'eog', 'eog', 'ecg', 'eog', 'emg']

ch_names = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4', 'T7', 'C3', 'Cz', 'C4', 'T8', 'CP3', \
            'CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'Oz', 'O2', 'VPVA', 'VNVB', 'HPHL', 'HNHR', 'Erbs', \
            'OrbOcc', 'Mass']

dict_eeg_channels =  {ch_names[i]: ch_types[i] for i in range(len(ch_types))}

dict_ch_pos = {'Fp1' : [-26.81, 84.06, -10.56],
               'Fp2' : [29.41, 83.74, -10.04],
               'F7'  : [-66.99, 41.69, -15.96],
               'F3'  : [-48.05, 51.87, 39.87],
               'Fz'  : [0.90, 57.01, 66.36],
               'F4'  : [50.38, 51.84, 41.33],
               'F8'  : [68.71, 41.16, -15.31],
               'FC3' : [-58.83, 21.02, 54.82],
               'FCz' : [0.57, 24.63, 87.63],
               'FC4' : [60.29, 21.16, 55.58], 
               'T7'  : [-83.36, -16.52, -12.65], 
               'C3'  : [-65.57, -13.25, 64.98],
               'Cz'  : [0.23, -11.28, 99.81],
               'C4'  : [66.50, -12.80, 65.11],
               'T8'  : [84.44, -16.65, -11.79], 
               'CP3' : [-65.51, -48.48, 68.57],
               'CPz' : [-0.42, -48.77, 98.37], 
               'CP4' : [65.03, -48.35, 68.57], 
               'P7': [-71.46, -75.17, -3.70], 
               'P3'  : [-55.07, -80.11, 59.44], 
               'Pz'  : [-0.87, -82.23, 82.43],
               'P4'  : [53.51, -80.13, 59.40], 
               'P8' : [71.10, -75.17, -3.69], 
               'O1'  : [-28.98, -114.52, 9.67],  
               'Oz'  : [-1.41, -117.79, 15.84],
               'O2'  : [26.89, -114.68, 9.45]
              }

dict_ch_pos_m = {'Fp1' : [-0.2681, 0.8406, -0.1056],
               'Fp2' : [0.2941, 0.8374, -0.1004],
               'F7'  : [-0.6699, 0.4169, -0.1596],
               'F3'  : [-0.4805, 0.5187, 0.3987],
               'Fz'  : [0.0090, 0.5701, 0.6636],
               'F4'  : [0.5038, 0.5184, 0.4133],
               'F8'  : [0.6871, 0.4116, -0.1531],
               'FC3' : [-0.5883, 0.2102, 0.5482],
               'FCz' : [0.0057, 0.2463, 0.8763],
               'FC4' : [0.6029, 0.2116, 0.5558], 
               'T7'  : [-0.8336, -0.1652, -0.1265], 
               'C3'  : [-0.6557, -0.1325, 0.6498],
               'Cz'  : [0.0023, -0.1128, 0.9981],
               'C4'  : [0.6650, -0.1280, 0.6511],
               'T8'  : [0.8444, -0.1665, -0.1179], 
               'CP3' : [-0.6551, -0.4848, 0.6857],
               'CPz' : [-0.042, -0.4877, 0.9837], 
               'CP4' : [0.6503, -0.4835, 0.6857], 
               'P7'  : [-0.7146, -0.7517, -0.0370], 
               'P3'  : [-0.5507, -0.8011, 0.5944], 
               'Pz'  : [-0.0087, -0.8223, 0.8243],
               'P4'  : [0.5351, -0.8013, 0.5940], 
               'P8'  : [0.7110, -0.7517, -0.0369], 
               'O1'  : [-0.2898, -1.1452, 0.0967],  
               'Oz'  : [-0.0141, -1.1779, 0.1584],
               'O2'  : [0.2689, -1.1468, 0.0945]
              }

dict_ch_pos_array = {'Fp1' : np.array([-0.02681, 0.08406, -0.01056]),
               'Fp2' : np.array([0.02941, 0.08374, -0.01004]),
               'F7'  : np.array([-0.06699, 0.04169, -0.01596]),
               'F3'  : np.array([-0.04805, 0.05187, 0.03987]),
               'Fz'  : np.array([0.00090, 0.05701, 0.06636]),
               'F4'  : np.array([0.05038, 0.05184, 0.04133]),
               'F8'  : np.array([0.06871, 0.04116, -0.01531]),
               'FC3' : np.array([-0.05883, 0.02102, 0.05482]),
               'FCz' : np.array([0.00057, 0.02463, 0.08763]),
               'FC4' : np.array([0.06029, 0.02116, 0.05558]), 
               'T7'  : np.array([-0.08336, -0.01652, -0.01265]), 
               'C3'  : np.array([-0.06557, -0.01325, 0.06498]),
               'Cz'  : np.array([0.000023, -0.01128, 0.09981]),
               'C4'  : np.array([0.06650, -0.01280, 0.06511]),
               'T8'  : np.array([0.08444, -0.01665, -0.01179]), 
               'CP3' : np.array([-0.06551, -0.04848, 0.06857]),
               'CPz' : np.array([-0.0042, -0.04877, 0.09837]), 
               'CP4' : np.array([0.06503, -0.04835, 0.06857]), 
               'P7'  : np.array([-0.07146, -0.07517, -0.00370]), 
               'P3'  : np.array([-0.05507, -0.08011, 0.05944]), 
               'Pz'  : np.array([-0.00087, -0.08223, 0.08243]),
               'P4'  : np.array([0.05351, -0.08013, 0.05940]), 
               'P8'  : np.array([0.07110, -0.07517, -0.00369]), 
               'O1'  : np.array([-0.02898, -0.11452, 0.00967]),  
               'Oz'  : np.array([-0.00141, -0.11779, 0.01584]),
               'O2'  : np.array([0.02689, -0.11468, 0.00945])
              }


# channel groupings (left/mid/right)
l_frontal = ['F3', 'FC3']
m_frontal = ['Fz', 'FCz']
r_frontal = ['F4', 'FC4']
l_central = ['C3', 'CP3']
m_central = ['Cz', 'CPz']
r_central = ['C4', 'CP4']
l_posterior = ['P3', 'O1'] 
m_posterior = ['Pz', 'Oz'] 
r_posterior = ['P4', 'O2'] 
channel_groups = {
    'l_frontal': l_frontal,
    'm_frontal': m_frontal,
    'r_frontal': r_frontal,
    'l_central': l_central,
    'm_central': m_central,
    'r_central': r_central,
    'l_posterior': l_posterior,
    'm_posterior': m_posterior,
    'r_posterior': r_posterior
}

## Create montage
montage = mne.channels.make_dig_montage(ch_pos = dict_ch_pos_array, coord_frame = 'head')

# Create info object for MNE
info = mne.create_info(ch_names=ch_names, ch_types=ch_types, sfreq=500)
info.set_montage(montage=montage, on_missing= 'raise')
print(info)

<Info | 8 non-empty values
 bads: []
 ch_names: Fp1, Fp2, F7, F3, Fz, F4, F8, FC3, FCz, FC4, T7, C3, Cz, C4, T8, ...
 chs: 26 EEG, 5 EOG, 1 ECG, 1 EMG
 custom_ref_applied: False
 dig: 29 items (3 Cardinal, 26 EEG)
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: unspecified
 nchan: 33
 projs: []
 sfreq: 500.0 Hz
>


### Extract functional connectivity features and store as .pkl file per EEG recording

In [5]:
# define all functions for synchrony computation

freq_bands = {
    "delta": [1, 4],
    "theta": [4, 8],
    "alpha": [8, 13],
    "beta":  [13, 30],
    "gamma": [30, 45] # TDBRAIN data could go up to ~100 Hz
    }

band_list = list(freq_bands.keys())

# function to filter signal
def filtered(signal, band, sfreq=500):
  l_freq = freq_bands[band][0]
  h_freq = freq_bands[band][1]
  filtered_signal = filter_data(data=signal, sfreq=sfreq, l_freq=l_freq, h_freq=h_freq, verbose=False, filter_length="auto")
  return filtered_signal

# function to find the difference between two angles (phase synchrony)
def diff_ang(theta1, theta2, full_p=2*np.pi, abso=True):
  half_p = 0.5 * full_p
  fmod1 = np.fmod(theta2 - theta1 + half_p, full_p)
  fmod2 = np.fmod(fmod1 + full_p, full_p) - half_p
  if abso==True:
    return abs(fmod2) #abs(np.fmod(np.fmod(theta2 - theta1 + half_p, full_p) + full_p, full_p) - half_p)
  else:
    return fmod2

# function to hilbert transform and create phase matrix
def hilbert_transform(band_signals, trim=False):
  signal_num = len(band_signals)
  samples = band_signals[0].shape[0]
  #envelope_mat = np.zeros((signal_num, samples))
  phase_mat = np.zeros((signal_num, samples)) # [n_channels, samples]

  for i, filtered_signal in enumerate(band_signals):
    analytic_signal = hilbert(filtered_signal)
    #envelope = np.abs(analytic_signal)
    inst_phase = np.angle(analytic_signal)

    #envelope_mat[i,:] = envelope
    phase_mat[i,:] = inst_phase

  if trim != False:
    #envelope_mat = envelope_mat[:,trim:(samples-trim)]
    phase_mat = phase_mat[:,trim:(samples-trim)]
  
  # print(f'{signal_num = }')
  # print(f'{samples = }')
  # # print(f'{analytic_signal = }')
  # print(f'{analytic_signal.shape = }')
  # # print(f'{inst_phase = }')
  # print(f'{inst_phase.shape = }')
  # print(f'{phase_mat.shape = }')

  return phase_mat #, envelope_mat

# function to calculate synchrony
def calculate_syncro(phase_mat):
  max_diff = np.pi * phase_mat.shape[1]
  size = phase_mat.shape[0]
  syncro_mat = np.zeros((size,size))
  for i, j in combs_without(range(size), 2):
        signal1 = phase_mat[i,:]
        signal2 = phase_mat[j,:]
        value = 1 - (diff_ang(signal1,signal2).sum()/max_diff)
        syncro_mat[i,j] = value
        syncro_mat[j,i] = value
  #eigen = np.diag(np.linalg.eigh(syncro_mat)[1])
  return syncro_mat #, eigen

# function to select the signals of the relevant channels per epoch 
def eeg_pre(epochs, epoch, num_channels=26):
    eeg_signals = []
    for channel in range(num_channels):
      signal = epochs[epoch][channel]
      eeg_signals.append(np.hstack(signal))
    return np.asarray(eeg_signals)

def stc_pre(epoch_data, labels):
    stc_signals = []
    for label in labels:
        if not label.name.startswith("unknown"):#'Background'):
            try:
              label_data = epoch_data.in_label(label).data
              stc_signals.append(label_data.mean(axis=0))
            except:
              pass
    return np.asarray(stc_signals)


In [6]:
from copy import deepcopy

markers_list1 = ["phases_eeg", "phases_stc"]
markers_list2 = ["syncros_eeg", "syncros_stc"]

band_dict = {band:[] for band in deepcopy(band_list)}

subject_dict1 = {marker:deepcopy(band_dict) for marker in deepcopy(markers_list1)}
subject_dict2 = {marker:deepcopy(band_dict) for marker in deepcopy(markers_list2)}

In [7]:
# function to aggregate synchrony values over channel groups
def aggregate_syncro(syncro_mat, channel_groups):
    n_channel_groups = len(channel_groups)
    syncro_agg = np.empty((n_channel_groups, n_channel_groups))
    
    for i, group_channels_i in enumerate(channel_groups.values()):
        for j, group_channels_j in enumerate(channel_groups.values()):
            group_indices_i = [ch_names.index(ch) for ch in group_channels_i]
            group_indices_j = [ch_names.index(ch) for ch in group_channels_j]
            syncro_agg[i, j] = syncro_mat[np.ix_(group_indices_i, group_indices_j)].mean()
            print(group_indices_i)
            print(group_indices_j)
            print(syncro_mat[np.ix_(group_indices_i, group_indices_j)])
            print(syncro_mat[np.ix_(group_indices_i, group_indices_j)].mean())
            print()
    
    np.fill_diagonal(syncro_agg, 0)  # set the diagonal to zero
    
    return syncro_agg

In [8]:
# function for feature extraction pipeline (changed from Federico's code for the TDBRAIN data)
def do_the_math(file_name):
 
    # needs specific info object, because has one less channel
    info = mne.create_info(ch_names=ch_names[:32], ch_types=ch_types[:32], sfreq=500)
    info.set_montage(montage=montage, on_missing= 'raise')

    preprocessed_eeg = np.load(file_name, allow_pickle = True)
    raw = mne.io.RawArray(np.squeeze(preprocessed_eeg['data']), info)

    # epoch the data
    epochs = mne.make_fixed_length_epochs(raw, duration = 9.95, overlap = 0)


    epochs_data = epochs.get_data()[:, :26, :] # select only the EEG channels
    (num_epochs, num_channels, num_samples) = epochs_data.shape

    fname = file_name[file_name.rfind("\\")+1:]
    subject_id = str(fname.split('_')[0])
    if 'EC' in fname:
        condition = "EC"
    if 'EO' in fname:
        condition = "EO"

    output_folder = f"D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\connectivity_features/{subject_id}/"
    os.makedirs(output_folder, exist_ok=True)

    subject_data1 = deepcopy(subject_dict1)
    subject_data2 = deepcopy(subject_dict2)

    # calculate synchrony per band per channel group
    for band in band_list:
        for epoch in range(num_epochs):
            signals_eeg = eeg_pre(epochs_data, epoch, num_channels=26)
            filtered_eeg = filtered(signals_eeg, band)
            phases_eeg = hilbert_transform(filtered_eeg, trim=100)
            syncro_eeg = calculate_syncro(phases_eeg)
            syncro_grouped = aggregate_syncro(syncro_eeg, channel_groups)

            subject_data2["syncros_eeg"][band].append(syncro_grouped)

    # save feature as pickle file
    save_file(subject_data2, output_folder, f"syncro_{subject_id}_{condition}")


In [9]:
# calculate connectivity features for each file with parallel processing
from joblib import Parallel, delayed # parallel processing

# directory containing preprocessed EEG data
eeg_dir = "D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TDBRAIN-dataset-derivatives\derivatives\preprocessed"

sample_ids = df_participants['participants_ID'].tolist() # list of participants to include
# sample_ids = ['sub-87966293', 'sub-87966337'] # for testing


def process_file(filepath):
    do_the_math(filepath)
    return 1

# obtain filepaths to all EEG files
filepaths = []
for subdir, dirs, files in os.walk(eeg_dir): # iterate through all files
    for file in files:
        if any(sample_id in file for sample_id in sample_ids): # filter participants to include
            if 'ses-1' in file and '.npy' in file and 'BAD' not in file: # filter first session, .npy files, and non-bad files
                filepath = os.path.join(subdir, file) # path to eeg file
                filepaths.append(filepath)

results = Parallel(n_jobs=-1)(delayed(process_file)(filepath) for filepath in tqdm(filepaths))

100%|██████████| 1314/1314 [1:28:06<00:00,  4.02s/it]


In [10]:
# load and check a saved file
test_file = load_file(r"D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\connectivity_features\sub-87966293\syncro_sub-87966293_EO.pkl")
test_file['syncros_eeg']['delta']

[array([[0.        , 0.8503579 , 0.78608274, 0.78722656, 0.78208813,
         0.77757915, 0.65855471, 0.62766359, 0.63832266],
        [0.8503579 , 0.        , 0.81392287, 0.75859103, 0.8195816 ,
         0.81499485, 0.65975911, 0.66207855, 0.68053983],
        [0.78608274, 0.81392287, 0.        , 0.69371829, 0.75973158,
         0.78951994, 0.61183203, 0.6215903 , 0.64104392],
        [0.78722656, 0.75859103, 0.69371829, 0.        , 0.82299453,
         0.75899395, 0.79554955, 0.73602415, 0.7160771 ],
        [0.78208813, 0.8195816 , 0.75973158, 0.82299453, 0.        ,
         0.86111742, 0.77604269, 0.79159384, 0.78972517],
        [0.77757915, 0.81499485, 0.78951994, 0.75899395, 0.86111742,
         0.        , 0.72073683, 0.7409888 , 0.77464917],
        [0.65855471, 0.65975911, 0.61183203, 0.79554955, 0.77604269,
         0.72073683, 0.        , 0.79952098, 0.77205017],
        [0.62766359, 0.66207855, 0.6215903 , 0.73602415, 0.79159384,
         0.7409888 , 0.79952098, 0.       

### Create a dataframe containing all connectivity features

In [18]:
# directory with the extracted synchrony features
eeg_dir = r"D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\connectivity_features"
sample_ids = df_participants['participants_ID'].tolist() # list of participants to include
# sample_ids = ['sub-87966293', 'sub-87966337'] # for testing


def process_file(filepath, df_ec_features, df_eo_features):
    fname = filepath[filepath.rfind("\\")+1:]
    subject_id = str(fname.split('_')[1])
    if 'EC' in fname:
        condition = "EC"
    if 'EO' in fname:
        condition = "EO"
    diagnosis = df_participants.loc[df_participants['participants_ID'] == subject_id, 'diagnosis'].values[0]
    # print(subject_id, condition, diagnosis)

    data = load_file(filepath)
    syncro_data = data['syncros_eeg']

    # add data to empty dictionary
    feature_dict = {}
    n_epochs = len(syncro_data[band_list[0]])
    # print(n_epochs)
    feature_dict['ID'] = [subject_id] * n_epochs
    feature_dict['diagnosis'] = [diagnosis] * n_epochs
    feature_dict['epoch'] = list(range(1, n_epochs + 1))

    # create feature list to create column names for dictionary to hold data of epochs 
    feature_name_list = []
    for band in band_list:
        seen_channels = []
        for i, channel_group_i in enumerate(channel_groups):
            seen_channels.append(channel_group_i)
            for j, channel_group_j in enumerate(channel_groups):
                if channel_group_j not in seen_channels and i != j:
                    feature_name_list.append(f'{condition}_{band}_{channel_group_i}-{channel_group_j}')

    for band in syncro_data:
        epoch_dict = {feature:[] for feature in deepcopy(feature_name_list) if band in feature}
        for epoch, syncro_mat in enumerate(syncro_data[band]):
            seen_channels = []
            for i, channel_group_i in enumerate(channel_groups):
                seen_channels.append(channel_group_i)
                for j, channel_group_j in enumerate(channel_groups):
                    if channel_group_j not in seen_channels and i != j:
                        # add data to empty dictionary
                        epoch_dict[f'{condition}_{band}_{channel_group_i}-{channel_group_j}'].append(syncro_mat[i, j])
                        feature_dict.update(epoch_dict)
                   
    feature_dict['ID'] = [subject_id] * len(syncro_data[band])
    feature_dict['diagnosis'] = [diagnosis] * len(syncro_data[band])
    feature_dict['epoch'] = list(range(1, len(syncro_data[band]) + 1))

    # add to dataframe
    if condition == 'EC':
        df_ec_features = pd.concat([df_ec_features, pd.DataFrame(feature_dict)], ignore_index = True)
    if condition == 'EO':
        df_eo_features = pd.concat([df_eo_features, pd.DataFrame(feature_dict)], ignore_index = True)
    
    return df_ec_features, df_eo_features


# counter for progress
count = 1
if count == 1:
    total_files = 0
    for _, dirs, files in os.walk(eeg_dir):
        total_files += len([file for file in files if any(sample_id in file for sample_id in sample_ids) and 'syncro' in file and '.pkl' in file])

df_ec_features = pd.DataFrame() # create empty dataframe to store EC features
df_eo_features = pd.DataFrame() # create empty dataframe to store EO features
filepaths = []
for subdir, dirs, files in os.walk(eeg_dir): # iterate through all files
    for file in files:
        if any(sample_id in file for sample_id in sample_ids): # filter participants to include
            if 'syncro' in file and '.pkl' in file: # filter syncro and .pkl files
                filepath = os.path.join(subdir, file) # path to eeg file
                df_ec_features, df_eo_features = process_file(filepath, df_ec_features, df_eo_features)
                print(f'\rProgress: {count}/{total_files} files processed.', end = '')
                count += 1

df_features = pd.merge(df_eo_features, df_ec_features.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
del df_ec_features, df_eo_features # remove dataframes to free up memory                
print(f'\n{df_features.shape = }')
df_features.sample(5)

Progress: 1314/1314 files processed.
df_features.shape = (7932, 363)


Unnamed: 0,ID,diagnosis,epoch,EO_delta_l_frontal-m_frontal,EO_delta_l_frontal-r_frontal,EO_delta_l_frontal-l_central,EO_delta_l_frontal-m_central,EO_delta_l_frontal-r_central,EO_delta_l_frontal-l_posterior,EO_delta_l_frontal-m_posterior,...,EC_gamma_m_central-r_central,EC_gamma_m_central-l_posterior,EC_gamma_m_central-m_posterior,EC_gamma_m_central-r_posterior,EC_gamma_r_central-l_posterior,EC_gamma_r_central-m_posterior,EC_gamma_r_central-r_posterior,EC_gamma_l_posterior-m_posterior,EC_gamma_l_posterior-r_posterior,EC_gamma_m_posterior-r_posterior
6888,sub-88068525,MDD,1,0.886049,0.876777,0.819304,0.793732,0.758697,0.670228,0.6436,...,0.827219,0.777005,0.785517,0.754674,0.761538,0.791709,0.800391,0.833568,0.787086,0.826723
3347,sub-88034013,ADHD,12,0.83725,0.770517,0.790519,0.778051,0.71568,0.687565,0.675203,...,0.877253,0.811816,0.839027,0.832524,0.772029,0.812954,0.848547,0.849591,0.796317,0.848596
2322,sub-88020425,MDD,7,0.927755,0.887522,0.910294,0.879699,0.855416,0.814851,0.8014,...,0.946014,0.91879,0.932315,0.919845,0.904449,0.924988,0.927298,0.924025,0.908881,0.932037
7564,sub-88074381,ADHD,5,0.874791,0.854618,0.776897,0.809888,0.740861,0.600656,0.62171,...,0.83096,0.770558,0.800546,0.77952,0.748143,0.780312,0.802794,0.829312,0.776209,0.825828
6586,sub-88066145,MDD,11,0.864812,0.828436,0.85311,0.814645,0.803137,0.678018,0.693877,...,0.914688,0.877296,0.874423,0.8185,0.883861,0.895496,0.862871,0.907344,0.841842,0.880591


In [19]:
# create ratio (EC/EO) features
df_ec = df_features.loc[:, df_features.columns.str.contains('EC')]
df_eo = df_features.loc[:, df_features.columns.str.contains('EO')]

column_names = df_eo.columns.to_list()
df_ec.columns = range(df_ec.shape[1])
df_eo.columns = range(df_eo.shape[1])
# calculate ratio of EC/EO for each feature
df_ratio = df_ec / df_eo

# restore columns names but with ratio_ prefix
df_ratio.columns = [column_names[i].replace('EO', 'ratio') for i in range(len(column_names))]
df_ratio.sample(3)

Unnamed: 0,ratio_delta_l_frontal-m_frontal,ratio_delta_l_frontal-r_frontal,ratio_delta_l_frontal-l_central,ratio_delta_l_frontal-m_central,ratio_delta_l_frontal-r_central,ratio_delta_l_frontal-l_posterior,ratio_delta_l_frontal-m_posterior,ratio_delta_l_frontal-r_posterior,ratio_delta_m_frontal-r_frontal,ratio_delta_m_frontal-l_central,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
1295,1.009983,0.994312,1.070046,1.000272,0.983765,0.938668,1.039474,0.942272,1.019072,1.064573,...,1.021784,1.011874,1.013025,1.00502,1.016624,1.006303,0.985483,1.018734,1.010968,1.019104
2220,1.069666,1.109823,1.049057,1.079236,1.129659,1.024509,1.138365,1.145214,1.050427,1.102828,...,0.983942,1.01729,1.005772,0.985359,1.017076,1.003737,0.996025,1.025285,1.017162,1.005865
1696,1.012179,1.028766,0.924881,0.957307,0.880049,0.813857,0.882461,0.830279,1.024306,0.985703,...,1.091446,1.11035,1.094862,1.089223,1.122009,1.102202,1.075561,1.081862,1.104512,1.075967


In [20]:
# merge ratio dataframe with original dataframe
df_features = pd.concat([df_features, df_ratio], axis=1)
df_features.sample(3)

Unnamed: 0,ID,diagnosis,epoch,EO_delta_l_frontal-m_frontal,EO_delta_l_frontal-r_frontal,EO_delta_l_frontal-l_central,EO_delta_l_frontal-m_central,EO_delta_l_frontal-r_central,EO_delta_l_frontal-l_posterior,EO_delta_l_frontal-m_posterior,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
606,sub-87968765,SMC,7,0.858887,0.78402,0.829243,0.778981,0.740918,0.670399,0.644674,...,0.985191,0.994182,0.99506,0.992045,1.008769,1.005667,0.997368,1.001248,1.009325,1.008516
4103,sub-88042749,MDD,12,0.894579,0.852163,0.875532,0.849918,0.810583,0.69084,0.673072,...,0.967061,0.987298,0.984994,0.981328,0.978512,0.990572,0.988017,0.996573,0.992312,0.997355
1847,sub-88012817,MDD,12,0.832346,0.764863,0.800727,0.764846,0.736229,0.556513,0.603084,...,1.064709,1.08477,1.064958,1.057093,1.075019,1.046496,1.048316,1.05239,1.051946,1.028905


In [21]:
df_features.isna().sum() # check for missing values -> decent amount of missing EC data (97 entries = ~8 participants)

ID                                      0
diagnosis                              13
epoch                                   0
EO_delta_l_frontal-m_frontal           13
EO_delta_l_frontal-r_frontal           13
                                       ..
ratio_gamma_r_central-m_posterior      97
ratio_gamma_r_central-r_posterior      97
ratio_gamma_l_posterior-m_posterior    97
ratio_gamma_l_posterior-r_posterior    97
ratio_gamma_m_posterior-r_posterior    97
Length: 543, dtype: int64

In [16]:
# save dataframe to pickle file
df_features.to_pickle('D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features/df_connectivity_features.pkl')

In [17]:
df_features

Unnamed: 0,ID,diagnosis,epoch,EO_delta_l_frontal-m_frontal,EO_delta_l_frontal-r_frontal,EO_delta_l_frontal-l_central,EO_delta_l_frontal-m_central,EO_delta_l_frontal-r_central,EO_delta_l_frontal-l_posterior,EO_delta_l_frontal-m_posterior,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
0,sub-87963725,SMC,1,0.864044,0.757696,0.810825,0.796821,0.736051,0.635325,0.646439,...,0.987047,0.935265,0.958719,0.943470,0.948988,0.981956,0.969441,0.963332,0.943981,0.983951
1,sub-87963725,SMC,2,0.880821,0.825303,0.857770,0.830105,0.778684,0.686311,0.662882,...,0.960409,0.959395,0.964052,0.937388,0.953482,0.973745,0.977959,0.961693,0.940079,0.966568
2,sub-87963725,SMC,3,0.877045,0.827763,0.804888,0.852058,0.794875,0.645527,0.661672,...,1.002437,0.977380,0.969282,0.972632,0.978614,0.974202,0.978073,0.984490,0.975489,0.986522
3,sub-87963725,SMC,4,0.865571,0.786939,0.830247,0.806606,0.750172,0.665883,0.653910,...,0.979541,0.972893,0.988174,0.978970,0.957255,0.970079,0.968738,0.977252,0.970154,0.980370
4,sub-87963725,SMC,5,0.869574,0.810017,0.839588,0.773421,0.743626,0.621132,0.589783,...,1.007364,0.989828,0.994718,0.984907,0.982894,0.984347,0.986222,0.986331,0.970366,0.971274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,sub-88077613,MDD,8,0.806101,0.763588,0.783199,0.724834,0.706524,0.626017,0.630330,...,1.009507,1.000838,1.016128,0.996059,0.993736,1.001347,0.986211,0.979298,0.973552,0.988670
7928,sub-88077613,MDD,9,0.815583,0.775279,0.759034,0.718217,0.696985,0.598557,0.593299,...,1.000415,0.962845,0.977125,0.988844,0.961408,0.968646,0.968562,0.967467,0.977181,0.987045
7929,sub-88077613,MDD,10,0.807190,0.744876,0.776538,0.723549,0.689921,0.653040,0.648972,...,1.009720,1.013700,1.003651,0.983053,1.037019,1.025742,1.001666,1.010475,1.021163,1.011636
7930,sub-88077613,MDD,11,0.819031,0.773774,0.784298,0.725190,0.700155,0.628703,0.632361,...,0.979193,0.964150,0.964483,0.956359,0.961439,0.973681,0.979818,0.972201,0.970626,0.996708
