### AIM: extract statistical time-frequency features and store in dataframe

[mean, std, median, skew, kurt] of power per channel per frequency band

In [1]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import mne
import os
import matplotlib.pyplot as plt
from mne.time_frequency import tfr_multitaper

%matplotlib inline

# prevent extensive logging
mne.set_log_level('WARNING')

### Loading in participants data
and excluding replication data

In [2]:
df_participants = pd.read_pickle('D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TDBRAIN_participants_V2_data\df_participants.pkl')
print(f'all participants: {df_participants.shape}')
df_participants.sample(5)

all participants: (714, 12)


Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,age,gender,sessID,nrSessions,EC,EO,diagnosis
320,sub-88005849,DISCOVERY,MDD,MDD,MDD-rTMS,42.25,0,1,1,True,True,MDD
1202,sub-88068213,DISCOVERY,MDD,UNKNOWN,,61.35,0,1,1,True,True,MDD
1236,sub-88070825,DISCOVERY,ADHD,ADHD,ADHD_NF,6.4,0,1,1,True,True,ADHD
1015,sub-88051337,DISCOVERY,ADHD,ADHD,ADHD_NF,23.47,1,1,1,True,True,ADHD
323,sub-88005941,DISCOVERY,MDD,MDD,MDD-rTMS,21.02,0,1,1,True,True,MDD


### Creating montage and info object for PSD calculation with MNE

In [3]:
## Set montage based on channel names and locations provided in Van Dijk et al., (2022) (Copied from Anne van Duijvenbode)

ch_types = ['eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg',\
           'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', \
           'eog', 'eog', 'eog', 'eog', 'ecg', 'eog', 'emg']

ch_names = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4', 'T7', 'C3', 'Cz', 'C4', 'T8', 'CP3', \
            'CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'Oz', 'O2', 'VPVA', 'VNVB', 'HPHL', 'HNHR', 'Erbs', \
            'OrbOcc', 'Mass']

dict_eeg_channels =  {ch_names[i]: ch_types[i] for i in range(len(ch_types))}

dict_ch_pos = {'Fp1' : [-26.81, 84.06, -10.56],
               'Fp2' : [29.41, 83.74, -10.04],
               'F7'  : [-66.99, 41.69, -15.96],
               'F3'  : [-48.05, 51.87, 39.87],
               'Fz'  : [0.90, 57.01, 66.36],
               'F4'  : [50.38, 51.84, 41.33],
               'F8'  : [68.71, 41.16, -15.31],
               'FC3' : [-58.83, 21.02, 54.82],
               'FCz' : [0.57, 24.63, 87.63],
               'FC4' : [60.29, 21.16, 55.58], 
               'T7'  : [-83.36, -16.52, -12.65], 
               'C3'  : [-65.57, -13.25, 64.98],
               'Cz'  : [0.23, -11.28, 99.81],
               'C4'  : [66.50, -12.80, 65.11],
               'T8'  : [84.44, -16.65, -11.79], 
               'CP3' : [-65.51, -48.48, 68.57],
               'CPz' : [-0.42, -48.77, 98.37], 
               'CP4' : [65.03, -48.35, 68.57], 
               'P7': [-71.46, -75.17, -3.70], 
               'P3'  : [-55.07, -80.11, 59.44], 
               'Pz'  : [-0.87, -82.23, 82.43],
               'P4'  : [53.51, -80.13, 59.40], 
               'P8' : [71.10, -75.17, -3.69], 
               'O1'  : [-28.98, -114.52, 9.67],  
               'Oz'  : [-1.41, -117.79, 15.84],
               'O2'  : [26.89, -114.68, 9.45]
              }

dict_ch_pos_m = {'Fp1' : [-0.2681, 0.8406, -0.1056],
               'Fp2' : [0.2941, 0.8374, -0.1004],
               'F7'  : [-0.6699, 0.4169, -0.1596],
               'F3'  : [-0.4805, 0.5187, 0.3987],
               'Fz'  : [0.0090, 0.5701, 0.6636],
               'F4'  : [0.5038, 0.5184, 0.4133],
               'F8'  : [0.6871, 0.4116, -0.1531],
               'FC3' : [-0.5883, 0.2102, 0.5482],
               'FCz' : [0.0057, 0.2463, 0.8763],
               'FC4' : [0.6029, 0.2116, 0.5558], 
               'T7'  : [-0.8336, -0.1652, -0.1265], 
               'C3'  : [-0.6557, -0.1325, 0.6498],
               'Cz'  : [0.0023, -0.1128, 0.9981],
               'C4'  : [0.6650, -0.1280, 0.6511],
               'T8'  : [0.8444, -0.1665, -0.1179], 
               'CP3' : [-0.6551, -0.4848, 0.6857],
               'CPz' : [-0.042, -0.4877, 0.9837], 
               'CP4' : [0.6503, -0.4835, 0.6857], 
               'P7'  : [-0.7146, -0.7517, -0.0370], 
               'P3'  : [-0.5507, -0.8011, 0.5944], 
               'Pz'  : [-0.0087, -0.8223, 0.8243],
               'P4'  : [0.5351, -0.8013, 0.5940], 
               'P8'  : [0.7110, -0.7517, -0.0369], 
               'O1'  : [-0.2898, -1.1452, 0.0967],  
               'Oz'  : [-0.0141, -1.1779, 0.1584],
               'O2'  : [0.2689, -1.1468, 0.0945]
              }

dict_ch_pos_array = {'Fp1' : np.array([-0.02681, 0.08406, -0.01056]),
               'Fp2' : np.array([0.02941, 0.08374, -0.01004]),
               'F7'  : np.array([-0.06699, 0.04169, -0.01596]),
               'F3'  : np.array([-0.04805, 0.05187, 0.03987]),
               'Fz'  : np.array([0.00090, 0.05701, 0.06636]),
               'F4'  : np.array([0.05038, 0.05184, 0.04133]),
               'F8'  : np.array([0.06871, 0.04116, -0.01531]),
               'FC3' : np.array([-0.05883, 0.02102, 0.05482]),
               'FCz' : np.array([0.00057, 0.02463, 0.08763]),
               'FC4' : np.array([0.06029, 0.02116, 0.05558]), 
               'T7'  : np.array([-0.08336, -0.01652, -0.01265]), 
               'C3'  : np.array([-0.06557, -0.01325, 0.06498]),
               'Cz'  : np.array([0.000023, -0.01128, 0.09981]),
               'C4'  : np.array([0.06650, -0.01280, 0.06511]),
               'T8'  : np.array([0.08444, -0.01665, -0.01179]), 
               'CP3' : np.array([-0.06551, -0.04848, 0.06857]),
               'CPz' : np.array([-0.0042, -0.04877, 0.09837]), 
               'CP4' : np.array([0.06503, -0.04835, 0.06857]), 
               'P7'  : np.array([-0.07146, -0.07517, -0.00370]), 
               'P3'  : np.array([-0.05507, -0.08011, 0.05944]), 
               'Pz'  : np.array([-0.00087, -0.08223, 0.08243]),
               'P4'  : np.array([0.05351, -0.08013, 0.05940]), 
               'P8'  : np.array([0.07110, -0.07517, -0.00369]), 
               'O1'  : np.array([-0.02898, -0.11452, 0.00967]),  
               'Oz'  : np.array([-0.00141, -0.11779, 0.01584]),
               'O2'  : np.array([0.02689, -0.11468, 0.00945])
              }

# channel groupings (left/mid/right)
l_frontal = ['F3', 'FC3']
m_frontal = ['Fz', 'FCz']
r_frontal = ['F4', 'FC4']
l_central = ['C3', 'CP3']
m_central = ['Cz', 'CPz']
r_central = ['C4', 'CP4']
l_posterior = ['P3', 'O1'] 
m_posterior = ['Pz', 'Oz'] 
r_posterior = ['P4', 'O2'] 
channel_groups = {
    'l_frontal': l_frontal,
    'm_frontal': m_frontal,
    'r_frontal': r_frontal,
    'l_central': l_central,
    'm_central': m_central,
    'r_central': r_central,
    'l_posterior': l_posterior,
    'm_posterior': m_posterior,
    'r_posterior': r_posterior
}

# define (5) frequencies of interest for TFR per frequency band
delta = np.array([1, 1.5, 2, 2.5, 3]) # starting at one because of high-pass filter
theta = np.array([4, 4.75, 5.5, 6.25, 7])
alpha = np.array([8, 9, 10, 11, 12])
beta = np.array([13, 17.25, 21.5, 25.75, 30])
gamma = np.array([42, 54, 66, 78, 90]) 
bands = {'delta': delta, 'theta': theta, 'alpha': alpha, 'beta': beta, 'gamma': gamma}


## Create montage
montage = mne.channels.make_dig_montage(ch_pos = dict_ch_pos_array, coord_frame = 'head')

# Create info object for MNE
info = mne.create_info(ch_names=ch_names, ch_types=ch_types, sfreq=500)
info.set_montage(montage=montage, on_missing= 'raise')
print(info)

<Info | 8 non-empty values
 bads: []
 ch_names: Fp1, Fp2, F7, F3, Fz, F4, F8, FC3, FCz, FC4, T7, C3, Cz, C4, T8, ...
 chs: 26 EEG, 5 EOG, 1 ECG, 1 EMG
 custom_ref_applied: False
 dig: 29 items (3 Cardinal, 26 EEG)
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: unspecified
 nchan: 33
 projs: []
 sfreq: 500.0 Hz
>


### Feature extraction and storing in df

In [11]:
import scipy
# calculate variance in power per freq band and per channel group for each file and store in dataframe
eeg_dir = "D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TDBRAIN-dataset-derivatives\derivatives\preprocessed"

#exlude_dirs = ['preprocessed', 'results_manuscript', 'adhd_sample'] # exclude these directories
sample_ids = df_participants['participants_ID'].tolist() # list of participants to include
#sample_ids = ['sub-87966293', 'sub-87966337']

df_ec_features = pd.DataFrame() # create empty dataframe to store EC features
df_eo_features = pd.DataFrame() # create empty dataframe to store EO features

# counter for progress
count = 1
if count == 1:
    total_files = 0
    for _, dirs, files in os.walk(eeg_dir):
        #dirs[:] = [d for d in dirs if d not in exlude_dirs] # exclude directories
        total_files += len([file for file in files if any(sample_id in file for sample_id in sample_ids) and '.npy' in file and 'ses-1' in file and 'BAD' not in file])

for subdir, dirs, files in os.walk(eeg_dir): # iterate through all files
    #dirs[:] = [d for d in dirs if d not in exlude_dirs] # exclude directories
    for file in files:
        if any(sample_id in file for sample_id in sample_ids): # filter participants to include
            if 'ses-1' in file and '.npy' in file and 'BAD' not in file: # filter first session, .npy files, and non-bad files
                filepath = os.path.join(subdir, file) # path to eeg file

                # needs specific info object, because has one less channel
                info = mne.create_info(ch_names=ch_names[:32], ch_types=ch_types[:32], sfreq=500)
                info.set_montage(montage=montage, on_missing= 'raise')

                preprocessed_eeg = np.load(filepath, allow_pickle = True)
                raw = mne.io.RawArray(np.squeeze(preprocessed_eeg['data']), info)

                # epoch the data
                epochs = mne.make_fixed_length_epochs(raw, duration = 9.95, overlap = 0)

                if 'EC' in file:
                    cond = 'EC'
                if 'EO' in file:
                    cond = 'EO'

                # determine age, gender, and diagnosis of participant corresponding to file
                age = df_participants.loc[df_participants['participants_ID'] == file.split('_')[0], 'age'].values[0]
                gender = df_participants.loc[df_participants['participants_ID'] == file.split('_')[0], 'gender'].values[0]
                diagnosis = df_participants.loc[df_participants['participants_ID'] == file.split('_')[0], 'diagnosis'].values[0]
                

                # add data to empty dictionary
                feature_dict = {}
                feature_dict['ID'] = [file.split('_')[0]] * epochs.get_data().shape[0]
                feature_dict['age'] = [age] * epochs.get_data().shape[0]
                feature_dict['gender'] = [gender] * epochs.get_data().shape[0]
                feature_dict['diagnosis'] = [diagnosis] * epochs.get_data().shape[0]
                #feature_dict['EO/EC'] = [cond] * epochs.get_data().shape[0]
                feature_dict['epoch'] = list(range(1, epochs.get_data().shape[0] + 1))
                
                # calculate TFR per band and channel group using multitaper convolution
                for band in bands:
                    for group in channel_groups:
                        tfr_mt = tfr_multitaper(
                            epochs,
                            freqs=bands[band],
                            n_cycles=(bands[band] / 2),
                            time_bandwidth=4, # just random for now
                            use_fft=True,
                            return_itc=False,
                            average=False,
                            decim=8, # decim reduces sampling rate of the tf decomposition by the defined factor
                            n_jobs=-1,
                            picks=channel_groups[group])
                        tfr_mt.apply_baseline((None, None), mode='logratio') # baseline correction, (None, None) = whole epoch

                        # compute average spectral spower per epoch per channel group
                        tfr_mt_std = np.std(tfr_mt.data, axis = 3) # calculate stdev over time
                        tfr_mt_std_mean = np.mean(tfr_mt_std, axis = (1, 2)) # calculate mean of stdev over channels and frequencies

                        # compute average spectral power per epoch per channel group
                        tfr_mt_mean = np.mean(tfr_mt.data, axis = 3)
                        tfr_mt_mean_mean = np.mean(tfr_mt_mean, axis = (1, 2))

                        # compute median spectral power per epoch per channel group
                        tfr_mt_mean_median = np.median(tfr_mt_mean, axis = (1, 2))

                        # compute skewness of spectral power per epoch per channel group
                        tfr_mt_skew = scipy.stats.skew(tfr_mt.data, axis = 3)
                        tfr_mt_skew_mean = np.mean(tfr_mt_skew, axis = (1, 2))

                        # compute kurtosis of spectral power per epoch per channel group
                        tfr_mt_kurt = scipy.stats.kurtosis(tfr_mt.data, axis = 3)
                        tfr_mt_kurt_mean = np.mean(tfr_mt_kurt, axis = (1, 2))


                        # add to dictionary
                        feature_dict[f'{cond}_{group}_{band}_std'] = tfr_mt_std_mean
                        feature_dict[f'{cond}_{group}_{band}_mean'] = tfr_mt_mean_mean
                        feature_dict[f'{cond}_{group}_{band}_median'] = tfr_mt_mean_median
                        feature_dict[f'{cond}_{group}_{band}_skew'] = tfr_mt_skew_mean
                        feature_dict[f'{cond}_{group}_{band}_kurt'] = tfr_mt_kurt_mean
                
                # add to dataframe
                if cond == 'EC':
                    df_ec_features = pd.concat([df_ec_features, pd.DataFrame(feature_dict)], ignore_index = True)
                if cond == 'EO':
                    df_eo_features = pd.concat([df_eo_features, pd.DataFrame(feature_dict)], ignore_index = True)

                print(f'\rProgress: {count}/{total_files} files processed.', end = '')
                count += 1

# merge EO and EC dataframes
print('\n')
print('Missing diagnoses in EO instances:', df_eo_features['diagnosis'].isnull().sum())
print('Missing diagnoses in EC instances:', df_ec_features['diagnosis'].isnull().sum())
df_features = pd.merge(df_eo_features, df_ec_features.drop(columns=['age', 'gender']),  how='outer', on=['ID', 'epoch', 'diagnosis'])
del df_ec_features, df_eo_features # remove dataframes to free up memory                
print(f'\n{df_features.shape = }')
df_features.sample(3)

Progress: 1314/1314 files processed.

Missing diagnoses in EO instances: 0
Missing diagnoses in EC instances: 0

df_features.shape = (7932, 455)


Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,...,EC_m_posterior_gamma_std,EC_m_posterior_gamma_mean,EC_m_posterior_gamma_median,EC_m_posterior_gamma_skew,EC_m_posterior_gamma_kurt,EC_r_posterior_gamma_std,EC_r_posterior_gamma_mean,EC_r_posterior_gamma_median,EC_r_posterior_gamma_skew,EC_r_posterior_gamma_kurt
6529,sub-88065381,23.19,0.0,OCD,2,0.41206,-0.161034,-0.1549,-0.621431,0.036863,...,0.261856,-0.072332,-0.069313,-0.558995,0.395631,0.282359,-0.081828,-0.082172,-0.715479,0.93491
3517,sub-88035677,38.0,1.0,MDD,2,0.412318,-0.162964,-0.163541,-0.546545,-0.045449,...,0.278461,-0.07804,-0.075883,-0.751975,1.543519,0.295808,-0.088065,-0.084556,-0.671414,0.792248
912,sub-87970749,56.53,1.0,SMC,1,0.373868,-0.149438,-0.143552,-0.358079,0.443684,...,0.279414,-0.078648,-0.086065,-0.6555,0.252153,0.280238,-0.078568,-0.074939,-0.710396,0.519011


In [12]:
df_features.isnull().sum()

ID                               0
age                            169
gender                          13
diagnosis                        0
epoch                            0
                              ... 
EC_r_posterior_gamma_std        84
EC_r_posterior_gamma_mean       84
EC_r_posterior_gamma_median     84
EC_r_posterior_gamma_skew       84
EC_r_posterior_gamma_kurt       84
Length: 455, dtype: int64

In [14]:
df_ec = df_features.loc[:, df_features.columns.str.contains('EC')]
df_eo = df_features.loc[:, df_features.columns.str.contains('EO')]

column_names = df_eo.columns.to_list()
df_ec.columns = range(df_ec.shape[1])
df_eo.columns = range(df_eo.shape[1])
# calculate ratio of EC/EO for each feature
df_ratio = df_ec / df_eo

# restore columns names but with ratio_ prefix
df_ratio.columns = [column_names[i].replace('EO', 'ratio') for i in range(len(column_names))]
df_ratio.sample(3)

Unnamed: 0,ratio_l_frontal_delta_std,ratio_l_frontal_delta_mean,ratio_l_frontal_delta_median,ratio_l_frontal_delta_skew,ratio_l_frontal_delta_kurt,ratio_m_frontal_delta_std,ratio_m_frontal_delta_mean,ratio_m_frontal_delta_median,ratio_m_frontal_delta_skew,ratio_m_frontal_delta_kurt,...,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew,ratio_r_posterior_gamma_kurt
4290,0.891319,0.715177,0.700151,-18.103464,-0.679635,0.765715,0.548037,0.517057,2.785469,-37.660353,...,1.11049,1.264985,1.292846,0.445598,-0.804412,1.139452,1.30458,1.383466,0.711359,1.076782
6605,0.888984,0.800177,0.806742,0.877413,-6.776012,0.828664,0.728493,0.714599,0.806866,1.665248,...,0.741121,0.4903,0.471475,-28.730078,3.016718,0.701326,0.424257,0.453316,-5.424469,2.313792
4049,1.03492,0.898822,0.908151,-138.977504,0.486483,0.970569,0.945152,0.914208,1.228003,-1.211415,...,0.997826,0.912939,0.941532,2.324478,1.992712,0.974553,0.893993,0.965935,1.932253,3.807739


In [15]:
# merge ratio dataframe with original dataframe
df_features = pd.concat([df_features, df_ratio], axis=1)
df_features.sample(3)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,...,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew,ratio_r_posterior_gamma_kurt
6311,sub-88063445,43.79,0.0,MDD,12,0.405664,-0.169827,-0.170169,-0.365227,-0.093512,...,0.931536,0.919634,0.9852,0.644492,0.773366,1.062182,1.129652,1.106457,1.013762,1.141452
5479,sub-88054181,35.2,1.0,MDD,8,0.350697,-0.127783,-0.12683,-0.521858,0.310858,...,1.057101,1.1122,1.111444,1.004654,1.218119,0.957633,0.983033,0.949642,0.570005,1.065209
1646,sub-88007241,8.53,1.0,ADHD,3,0.361048,-0.144681,-0.137077,-0.200389,0.373995,...,1.049334,1.110973,1.159397,0.748574,0.075706,1.079553,1.170501,1.138646,0.786789,-0.461633


In [16]:
df_features.isna().sum() # check for missing values -> decent amount of missing EC data (97 entries = ~8 participants)

ID                                  0
age                               169
gender                             13
diagnosis                           0
epoch                               0
                                 ... 
ratio_r_posterior_gamma_std        97
ratio_r_posterior_gamma_mean       97
ratio_r_posterior_gamma_median     97
ratio_r_posterior_gamma_skew       97
ratio_r_posterior_gamma_kurt       97
Length: 680, dtype: int64

In [17]:
df_features.to_pickle('D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features/df_stat_features.pkl')