## AIM: reduce dimensionality by feature selection with BORUTA

A seperate BORUTA will be performed for each feature set (6 total):
1. statistical TFR features (for ratio, EC, and EO)
2. stat TFR features + connectivity features (for ratio, EC, and EO)

n_estimates & max_depth for RFs in BORUTA were determined for each feature set in 'boruta_hyperparameter_tuning.ipynb' located in the 'background_notebooks' subdir

In [46]:
from boruta import BorutaPy
import pandas as pd
import numpy as np
import mne
import pickle

from sklearn.ensemble import RandomForestClassifier

# 1. statistical TFR features

In [47]:
df_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_features.pkl')
df_features['age'] = df_features['age'].round().astype('Int32') # round age to nearest year and convert to int
df_features = df_features.dropna(subset=['diagnosis'])
df_features.sample(7)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_frontal_delta_std,EO_frontal_delta_mean,EO_frontal_delta_median,EO_frontal_delta_skew,EO_frontal_delta_kurt,...,ratio_parietal_gamma_std,ratio_parietal_gamma_mean,ratio_parietal_gamma_median,ratio_parietal_gamma_skew,ratio_parietal_gamma_kurt,ratio_occipital_gamma_std,ratio_occipital_gamma_mean,ratio_occipital_gamma_median,ratio_occipital_gamma_skew,ratio_occipital_gamma_kurt
2547,sub-88024205,9,1.0,ADHD,4,0.429345,-0.195915,-0.165459,-0.322072,-0.255433,...,1.238011,2.022976,1.993033,-0.51265,3.674406,0.960203,0.940715,0.872284,1.146036,1.976721
7494,sub-88073653,16,0.0,ADHD,7,0.376239,-0.135077,-0.131342,-0.744752,0.707682,...,0.983775,0.974458,0.954629,0.907215,0.489329,0.891399,0.801495,0.810806,1.096486,1.499386
6713,sub-88067181,52,0.0,MDD,6,0.369337,-0.134282,-0.130082,-0.660858,0.5935,...,1.049634,1.110498,1.053201,0.902322,0.696303,1.088592,1.212608,1.237229,0.694097,0.039914
5472,sub-88054181,35,1.0,MDD,1,0.359549,-0.135266,-0.125392,-0.403166,0.145084,...,1.000406,1.003751,1.113038,0.902451,0.919731,0.882223,0.762041,0.791073,1.338608,2.460332
2497,sub-88023125,44,0.0,MDD,2,0.394956,-0.168565,-0.151822,-0.337039,0.050083,...,1.043815,1.081687,1.0749,1.009908,0.427961,1.040644,1.066895,1.068639,1.08332,0.500884
4212,sub-88043381,39,0.0,MDD,1,0.453635,-0.211363,-0.214382,-0.277574,-0.260813,...,1.075836,1.16286,1.20361,0.8874,0.990292,0.958301,0.932216,0.9553,0.932306,1.160249
2376,sub-88021053,43,0.0,MDD,1,0.380096,-0.148174,-0.14001,-0.441446,0.274359,...,1.011047,1.072232,1.137306,0.618379,1.035648,1.080785,1.190207,1.297263,0.731029,0.934269


Subsampling dataset here to rebalance dataset. Possibly would have been more efficient to perform before feature extraction, however I already extracted features of the full sample, which allows me to not have to recompute features if we ever change this subsampling step.

In [48]:
# subsample dataset to rebalance dataset
df_ids = df_features[(df_features['epoch'] == 1)] # select only first epoch to make sure we subsample from participants, and not from epochs
df_ids_subsample = df_ids.groupby('diagnosis').sample(45, random_state=42) # sample 45 participants per diagnosis (which is the max for OCD)
df_ids_subsample_index = df_ids_subsample['ID'].tolist()
df_subsample = df_features[(df_features['ID'].isin(df_ids_subsample_index))]
df_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [49]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_features_ec = df_subsample[df_features.columns[~(df_subsample.columns.str.startswith('EO') | df_subsample.columns.str.startswith('ratio'))]]
df_features_eo = df_subsample[df_features.columns[~(df_subsample.columns.str.startswith('EC') | df_subsample.columns.str.startswith('ratio'))]]
df_features_ratio = df_subsample[df_features.columns[~(df_subsample.columns.str.startswith('EC') | df_subsample.columns.str.startswith('EO'))]]

In [50]:
# channel groupings
frontal = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4']
central = ['T7', 'C3', 'Cz', 'C4', 'T8']
parietal = ['CP3','CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8'] 
occipital = ['O1', 'Oz', 'O2']
channel_groups = {'frontal': frontal, 'central': central, 'parietal': parietal, 'occipital': occipital}

# define (5) frequencies of interest for TFR per frequency band
delta = np.array([1, 1.5, 2, 2.5, 3]) # starting at one because of high-pass filter
theta = np.array([4, 4.75, 5.5, 6.25, 7])
alpha = np.array([8, 9, 10, 11, 12])
beta = np.array([13, 17.25, 21.5, 25.75, 30])
gamma = np.array([42, 54, 66, 78, 90]) 
bands = {'delta': delta, 'theta': theta, 'alpha': alpha, 'beta': beta, 'gamma': gamma}

### Ratio features

In [6]:
# define numeric columns
numeric_cols = []
for cond in ['ratio']:
    for band in bands:
        for group in channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
 
X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [15]:
selected_columns = df_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ratio = df_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ratio.shape)
df_selected_feat_ratio.sample(5)

(2700, 47)


Unnamed: 0,ID,epoch,diagnosis,ratio_frontal_delta_median,ratio_frontal_delta_skew,ratio_central_delta_skew,ratio_parietal_delta_skew,ratio_occipital_delta_skew,ratio_frontal_theta_std,ratio_frontal_theta_mean,...,ratio_parietal_gamma_std,ratio_parietal_gamma_mean,ratio_parietal_gamma_median,ratio_parietal_gamma_skew,ratio_parietal_gamma_kurt,ratio_occipital_gamma_std,ratio_occipital_gamma_mean,ratio_occipital_gamma_median,ratio_occipital_gamma_skew,ratio_occipital_gamma_kurt
1256,sub-87980241,9,HEALTHY,1.267147,0.231345,0.064359,0.463078,2.840686,0.948749,0.999695,...,1.017946,1.054241,1.009118,0.852617,0.672806,1.02729,1.064611,1.080077,0.883717,0.721891
6629,sub-88066457,6,OCD,0.87658,1.056424,0.922057,0.476081,0.106165,0.785078,0.532941,...,0.915092,0.927233,0.828066,0.588595,1.785136,0.9312,0.927815,0.868644,1.127093,-82.325195
6904,sub-88068665,5,MDD,0.687538,0.22928,1.57524,1.745218,-27.636263,0.813915,0.822469,...,0.952916,0.933875,0.937722,0.883384,1.046396,1.0796,1.195692,1.386447,0.640032,0.134864
6084,sub-88061193,1,OCD,1.049854,0.63139,0.48289,0.878726,-0.851745,1.07344,1.152974,...,0.904385,0.86442,0.848427,0.769232,0.574813,0.896724,0.840701,0.86798,0.849355,0.802514
577,sub-87968677,2,SMC,0.623139,0.941062,1.185358,0.871997,-0.036575,0.837867,0.713413,...,1.069951,1.132486,1.140683,1.088763,0.770957,1.011923,1.006712,1.012405,1.181259,1.692074


### EC features

In [21]:
# define numeric columns
numeric_cols = []
for cond in ['EC']:
    for band in bands:
        for group in channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
 
X = df_features_ec[numeric_cols]
Y = df_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=155, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [22]:
selected_columns = df_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ec = df_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ec.shape)
df_selected_feat_ec.sample(5)

(2700, 47)


Unnamed: 0,ID,epoch,diagnosis,EC_central_delta_std,EC_central_delta_mean,EC_occipital_delta_std,EC_frontal_theta_std,EC_frontal_theta_mean,EC_frontal_theta_median,EC_central_theta_std,...,EC_central_gamma_skew,EC_central_gamma_kurt,EC_parietal_gamma_std,EC_parietal_gamma_mean,EC_parietal_gamma_median,EC_parietal_gamma_skew,EC_occipital_gamma_std,EC_occipital_gamma_mean,EC_occipital_gamma_median,EC_occipital_gamma_skew
5417,sub-88053677,6,OCD,0.346186,-0.126356,0.431059,0.371089,-0.153797,-0.151898,0.352847,...,-0.542078,0.142949,0.286944,-0.085715,-0.086391,-0.576318,0.297519,-0.095309,-0.089621,-0.473478
3466,sub-88035229,11,MDD,0.381996,-0.140536,0.373675,0.314289,-0.100397,-0.0912,0.30909,...,-0.638062,0.70589,0.271531,-0.07777,-0.075144,-0.497964,0.280378,-0.085072,-0.090011,-0.353303
981,sub-87971109,10,SMC,0.356517,-0.126531,0.378812,0.286886,-0.089017,-0.082905,0.294076,...,-0.676466,0.721964,0.259949,-0.069742,-0.068549,-0.593979,0.26643,-0.073254,-0.072948,-0.594332
779,sub-87969937,12,SMC,0.333607,-0.123515,0.333874,0.280643,-0.081344,-0.074788,0.275849,...,-0.516769,0.372727,0.27344,-0.077042,-0.071187,-0.571808,0.288559,-0.086514,-0.091021,-0.502072
562,sub-87968541,11,SMC,0.411626,-0.183933,0.359617,0.310404,-0.107739,-0.106258,0.323405,...,-0.55614,0.528888,0.267081,-0.073026,-0.071052,-0.601315,0.265887,-0.071856,-0.070453,-0.659663


### EO features

In [23]:
# define numeric columns
numeric_cols = []
for cond in ['EO']:
    for band in bands:
        for group in channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
 
X = df_features_eo[numeric_cols]
Y = df_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=180, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [24]:
selected_columns = df_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_eo = df_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_eo.shape)
df_selected_feat_eo.sample(5)

(2700, 55)


Unnamed: 0,ID,epoch,diagnosis,EO_frontal_delta_median,EO_frontal_delta_skew,EO_central_delta_std,EO_central_delta_mean,EO_central_delta_median,EO_parietal_delta_std,EO_occipital_delta_skew,...,EO_central_gamma_median,EO_central_gamma_skew,EO_parietal_gamma_std,EO_parietal_gamma_mean,EO_parietal_gamma_median,EO_parietal_gamma_skew,EO_occipital_gamma_std,EO_occipital_gamma_mean,EO_occipital_gamma_median,EO_occipital_gamma_skew
1280,sub-87980373,9,HEALTHY,-0.118508,-0.12716,0.375616,-0.158388,-0.154008,0.367489,-0.333889,...,-0.093893,-0.348711,0.299159,-0.09797,-0.089295,-0.354729,0.293818,-0.096528,-0.092687,-0.253533
4984,sub-88049813,5,MDD,-0.169518,-0.593636,0.422799,-0.180745,-0.17949,0.416829,-0.514643,...,-0.081722,-0.467212,0.268108,-0.074917,-0.074755,-0.508259,0.266573,-0.073085,-0.075445,-0.554656
991,sub-87971197,8,SMC,-0.129451,-0.431434,0.380726,-0.172384,-0.151781,0.365987,-0.503973,...,-0.080907,-0.519339,0.285118,-0.084749,-0.077966,-0.581304,0.269349,-0.075142,-0.075527,-0.498993
6361,sub-88064345,2,ADHD,-0.13301,-0.569376,0.346866,-0.12456,-0.122064,0.32989,-0.93783,...,-0.084261,-0.645176,0.293824,-0.088474,-0.083555,-0.590357,0.304322,-0.091507,-0.086935,-0.748766
648,sub-87968945,1,SMC,-0.130079,-0.448475,0.349376,-0.124305,-0.126467,0.359604,-0.344471,...,-0.083916,-0.600275,0.296474,-0.09108,-0.091095,-0.583274,0.297731,-0.093311,-0.092972,-0.401414


### Merge selected ratio, EC, EO features

In [25]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_feat_eo = df_selected_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_feat_ratio = df_selected_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_features = pd.concat([df_selected_feat_ec, df_selected_feat_eo, df_selected_feat_ratio], axis=1)
df_selected_features

Unnamed: 0,ID,epoch,diagnosis,EC_central_delta_std,EC_central_delta_mean,EC_occipital_delta_std,EC_frontal_theta_std,EC_frontal_theta_mean,EC_frontal_theta_median,EC_central_theta_std,...,ratio_parietal_gamma_std,ratio_parietal_gamma_mean,ratio_parietal_gamma_median,ratio_parietal_gamma_skew,ratio_parietal_gamma_kurt,ratio_occipital_gamma_std,ratio_occipital_gamma_mean,ratio_occipital_gamma_median,ratio_occipital_gamma_skew,ratio_occipital_gamma_kurt
24,sub-87964717,1,SMC,0.388111,-0.158658,0.332231,0.327564,-0.118993,-0.116429,0.339098,...,0.968281,1.022076,1.052374,0.469458,0.126383,1.051424,1.123622,1.156979,0.866754,1.785548
25,sub-87964717,2,SMC,0.404008,-0.166120,0.371482,0.330950,-0.109538,-0.109491,0.321337,...,0.994825,0.998219,1.003191,0.944901,1.384317,1.020286,1.098013,1.087812,0.544720,0.002118
26,sub-87964717,3,SMC,0.424449,-0.191903,0.333949,0.392801,-0.156261,-0.154654,0.363305,...,1.071320,1.153472,1.159519,0.889397,1.053610,1.156190,1.299332,1.286826,1.068119,0.834607
27,sub-87964717,4,SMC,0.455659,-0.224328,0.451149,0.389299,-0.172491,-0.175309,0.375920,...,0.924726,0.751021,0.873962,4.524715,1.097089,0.956484,0.916655,1.080320,0.780077,0.596561
28,sub-87964717,5,SMC,0.366557,-0.142051,0.364635,0.364516,-0.157834,-0.157847,0.346362,...,1.132596,1.413028,1.312625,0.472916,1.821612,1.133601,1.252463,1.128770,1.247167,1.738256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7903,sub-88077525,8,MDD,0.376217,-0.146065,0.363059,0.294275,-0.094149,-0.092135,0.293015,...,0.979383,0.979756,0.968850,0.882507,-2.986603,0.972764,0.956172,0.903145,0.984097,-1.452343
7904,sub-88077525,9,MDD,0.404842,-0.167801,0.341771,0.325576,-0.109965,-0.107863,0.342199,...,1.176680,1.304158,1.240184,1.272699,1.516883,1.187179,1.332789,1.340861,1.232894,1.124988
7905,sub-88077525,10,MDD,0.319906,-0.106238,0.358224,0.273953,-0.075649,-0.075369,0.254059,...,0.952579,0.866925,0.832524,1.484224,2.400569,0.952937,0.872762,0.780062,1.398447,1.328253
7906,sub-88077525,11,MDD,0.348354,-0.127219,0.338369,0.284891,-0.083445,-0.084718,0.275252,...,0.979437,0.962729,0.933522,1.123375,1.001902,0.996582,0.992473,0.909772,1.143214,0.975487


In [30]:
df_selected_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_features.pkl')

# 2. statistical TFR + connectivity features

In [51]:
df_conn_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_connectivity_features.pkl')
df_features = df_features.dropna(subset=['diagnosis'])

In [52]:
# subsample dataset to rebalance dataset
df_subsample = df_conn_features[(df_conn_features['ID'].isin(df_ids_subsample_index))] # use same sampled IDs from earlier for the stat features
df_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [53]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_conn_features_ec = df_subsample[df_conn_features.columns[~(df_subsample.columns.str.startswith('EO') | df_subsample.columns.str.startswith('ratio'))]]
df_conn_features_eo = df_subsample[df_conn_features.columns[~(df_subsample.columns.str.startswith('EC') | df_subsample.columns.str.startswith('ratio'))]]
df_conn_features_ratio = df_subsample[df_conn_features.columns[~(df_subsample.columns.str.startswith('EC') | df_subsample.columns.str.startswith('EO'))]]

In [54]:
# concat connectivity features with stat features
df_stat_conn_features_ec = pd.merge(df_features_ec, df_conn_features_ec.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
df_stat_conn_features_eo = pd.merge(df_features_eo, df_conn_features_eo.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
df_stat_conn_features_ratio = pd.merge(df_features_ratio, df_conn_features_ratio.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
print(df_stat_conn_features_ec.shape)



(2700, 285)


In [55]:
# channel groupings for stat features
frontal = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4']
central = ['T7', 'C3', 'Cz', 'C4', 'T8']
parietal = ['CP3','CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8'] 
occipital = ['O1', 'Oz', 'O2']
stat_channel_groups = {'frontal': frontal, 'central': central, 'parietal': parietal, 'occipital': occipital}

# channel grouping for connectivity features
l_frontal = ['F3', 'FC3']
m_frontal = ['Fz', 'FCz']
r_frontal = ['F4', 'FC4']
l_central = ['C3', 'CP3']
m_central = ['Cz', 'CPz']
r_central = ['C4', 'CP4']
l_posterior = ['P3', 'O1'] 
m_posterior = ['Pz', 'Oz'] 
r_posterior = ['P4', 'O2'] 
conn_channel_groups = {
    'l_frontal': l_frontal,
    'm_frontal': m_frontal,
    'r_frontal': r_frontal,
    'l_central': l_central,
    'm_central': m_central,
    'r_central': r_central,
    'l_posterior': l_posterior,
    'm_posterior': m_posterior,
    'r_posterior': r_posterior
}

# define (5) frequencies of interest for TFR per frequency band
delta = np.array([1, 1.5, 2, 2.5, 3]) # starting at one because of high-pass filter
theta = np.array([4, 4.75, 5.5, 6.25, 7])
alpha = np.array([8, 9, 10, 11, 12])
beta = np.array([13, 17.25, 21.5, 25.75, 30])
gamma = np.array([42, 54, 66, 78, 90]) 
bands = {'delta': delta, 'theta': theta, 'alpha': alpha, 'beta': beta, 'gamma': gamma}

### Ratio features

In [62]:
# define numeric columns
numeric_cols = []
for cond in ['ratio']:
    for band in bands:
        for group in stat_channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
        seen_channels = []
        for i, channel_group_i in enumerate(conn_channel_groups):
            seen_channels.append(channel_group_i)
            for j, channel_group_j in enumerate(conn_channel_groups):
                if channel_group_j not in seen_channels and i != j:
                    numeric_cols.append(f'{cond}_{band}_{channel_group_i}-{channel_group_j}')
 
X = df_stat_conn_features_ratio[numeric_cols]
Y = df_stat_conn_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=155, n_jobs=-1, max_depth=15)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [63]:
selected_columns = df_stat_conn_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_ratio = df_stat_conn_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_ratio.shape)
df_selected_stat_conn_feat_ratio.sample(5)

(2700, 178)


Unnamed: 0,ID,epoch,diagnosis,ratio_delta_l_frontal-r_frontal,ratio_delta_m_frontal-r_frontal,ratio_delta_l_central-l_posterior,ratio_delta_m_central-l_posterior,ratio_delta_m_central-m_posterior,ratio_delta_r_central-r_posterior,ratio_delta_l_posterior-r_posterior,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
2091,sub-88061193,4,OCD,0.936807,1.007517,0.913731,0.833977,0.869239,0.938881,0.924063,...,1.007665,1.004592,1.007389,1.005163,0.997361,0.999399,1.00122,0.995332,0.990821,0.994974
1739,sub-88053453,12,HEALTHY,0.990993,1.016234,0.935675,0.906134,0.919513,1.023417,0.960883,...,0.958221,0.973381,0.997437,0.935843,0.906773,0.936095,0.946066,0.997319,0.934174,0.963542
1695,sub-88052957,4,OCD,0.969492,1.035713,0.968904,0.971043,0.97991,0.961049,0.895968,...,0.946574,0.951197,0.963825,0.936043,0.936778,0.964237,0.953093,0.963303,0.939533,0.955997
1987,sub-88058317,8,OCD,1.025184,1.003443,0.977011,0.946709,0.926543,0.959709,0.950832,...,0.995079,0.948449,0.963225,0.953411,0.96507,0.976479,0.958123,0.957651,0.967578,0.97362
1116,sub-88025597,1,MDD,1.054802,1.103377,0.952221,0.951371,0.971943,0.977876,0.867082,...,1.003287,0.96272,0.983515,0.975868,0.94839,0.983928,1.000402,1.004313,0.965459,0.992726


### EC features

In [64]:
# define numeric columns
numeric_cols = []
for cond in ['EC']:
    for band in bands:
        for group in stat_channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
        seen_channels = []
        for i, channel_group_i in enumerate(conn_channel_groups):
            seen_channels.append(channel_group_i)
            for j, channel_group_j in enumerate(conn_channel_groups):
                if channel_group_j not in seen_channels and i != j:
                    numeric_cols.append(f'{cond}_{band}_{channel_group_i}-{channel_group_j}')
 
X = df_stat_conn_features_ec[numeric_cols]
Y = df_stat_conn_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=195, n_jobs=-1, max_depth=20)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [65]:
selected_columns = df_stat_conn_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_ec = df_stat_conn_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_ec.shape)
df_selected_stat_conn_feat_ec.sample(5)

(2700, 241)


Unnamed: 0,ID,epoch,diagnosis,EC_central_delta_std,EC_central_delta_mean,EC_central_delta_median,EC_delta_l_frontal-m_frontal,EC_delta_l_frontal-r_frontal,EC_delta_l_frontal-l_central,EC_delta_l_frontal-m_central,...,EC_gamma_m_central-r_central,EC_gamma_m_central-l_posterior,EC_gamma_m_central-m_posterior,EC_gamma_m_central-r_posterior,EC_gamma_r_central-l_posterior,EC_gamma_r_central-m_posterior,EC_gamma_r_central-r_posterior,EC_gamma_l_posterior-m_posterior,EC_gamma_l_posterior-r_posterior,EC_gamma_m_posterior-r_posterior
133,sub-87967729,2,SMC,0.404874,-0.168297,-0.161132,0.856252,0.817566,0.846501,0.819243,...,0.785433,0.66139,0.698444,0.702413,0.591731,0.662221,0.730246,0.80077,0.696267,0.770664
2142,sub-88062905,7,OCD,0.393735,-0.159315,-0.150952,0.875155,0.80637,0.818751,0.778251,...,0.866197,0.813109,0.81182,0.802727,0.793925,0.818145,0.846789,0.832029,0.815051,0.839484
145,sub-87967773,2,SMC,0.434482,-0.197208,-0.190187,0.83573,0.77577,0.783268,0.748151,...,0.722873,0.685626,0.715186,0.67072,0.639447,0.679144,0.72213,0.736851,0.67525,0.749208
2090,sub-88061193,3,OCD,0.367161,-0.144891,-0.144446,0.8515,0.791241,0.76424,0.763611,...,0.95228,0.938668,0.94293,0.938796,0.937531,0.945582,0.955251,0.95487,0.945588,0.957314
2025,sub-88059081,10,OCD,0.423176,-0.193411,-0.185903,0.871611,0.837045,0.779627,0.791248,...,0.898437,0.855664,0.839967,0.813736,0.837416,0.833909,0.824689,0.868065,0.83026,0.860323


### EO features

In [66]:
# define numeric columns
numeric_cols = []
for cond in ['EO']:
    for band in bands:
        for group in stat_channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
        seen_channels = []
        for i, channel_group_i in enumerate(conn_channel_groups):
            seen_channels.append(channel_group_i)
            for j, channel_group_j in enumerate(conn_channel_groups):
                if channel_group_j not in seen_channels and i != j:
                    numeric_cols.append(f'{cond}_{band}_{channel_group_i}-{channel_group_j}')
 
X = df_stat_conn_features_eo[numeric_cols]
Y = df_stat_conn_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=160, n_jobs=-1, max_depth=20)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [67]:
selected_columns = df_stat_conn_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_eo = df_stat_conn_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_eo.shape)
df_selected_stat_conn_feat_eo.sample(5)

(2700, 248)


Unnamed: 0,ID,epoch,diagnosis,EO_frontal_delta_std,EO_frontal_delta_mean,EO_frontal_delta_median,EO_central_delta_std,EO_central_delta_mean,EO_central_delta_median,EO_parietal_delta_std,...,EO_gamma_m_central-r_central,EO_gamma_m_central-l_posterior,EO_gamma_m_central-m_posterior,EO_gamma_m_central-r_posterior,EO_gamma_r_central-l_posterior,EO_gamma_r_central-m_posterior,EO_gamma_r_central-r_posterior,EO_gamma_l_posterior-m_posterior,EO_gamma_l_posterior-r_posterior,EO_gamma_m_posterior-r_posterior
501,sub-87971109,10,SMC,0.411367,-0.171971,-0.165973,0.385675,-0.157073,-0.15178,0.411244,...,0.907237,0.897922,0.915208,0.908017,0.870591,0.894356,0.907419,0.916421,0.904079,0.929728
962,sub-88016105,3,MDD,0.367432,-0.140736,-0.128223,0.323307,-0.106043,-0.106236,0.331775,...,0.7847,0.699798,0.72363,0.725169,0.666483,0.695599,0.757431,0.797121,0.720498,0.761533
2556,sub-88073521,1,ADHD,0.398653,-0.161957,-0.155682,0.381655,-0.147657,-0.146192,0.380085,...,0.900514,0.87904,0.884073,0.870608,0.870628,0.885657,0.897566,0.913673,0.89176,0.904628
1482,sub-88046797,7,MDD,0.35318,-0.132332,-0.122417,0.36373,-0.137671,-0.137451,0.37433,...,0.904008,0.851245,0.856243,0.848682,0.837217,0.849967,0.868722,0.886395,0.860962,0.879869
1069,sub-88023125,2,MDD,0.394956,-0.168565,-0.151822,0.356881,-0.127802,-0.123594,0.357625,...,0.798852,0.694374,0.732594,0.711053,0.703223,0.735353,0.764467,0.803138,0.770837,0.802879


### Merge selected ratio, EC, EO features

In [68]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_stat_conn_feat_eo = df_selected_stat_conn_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_stat_conn_feat_ratio = df_selected_stat_conn_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_stat_conn_features = pd.concat([df_selected_stat_conn_feat_ec, df_selected_stat_conn_feat_eo, df_selected_stat_conn_feat_ratio], axis=1)
df_selected_stat_conn_features

Unnamed: 0,ID,epoch,diagnosis,EC_central_delta_std,EC_central_delta_mean,EC_central_delta_median,EC_delta_l_frontal-m_frontal,EC_delta_l_frontal-r_frontal,EC_delta_l_frontal-l_central,EC_delta_l_frontal-m_central,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
0,sub-87964717,1,SMC,0.388111,-0.158658,-0.151116,0.861459,0.812878,0.844137,0.815633,...,0.983375,0.982205,0.973501,0.957588,0.965101,0.975819,0.977929,0.973242,0.955600,0.985688
1,sub-87964717,2,SMC,0.404008,-0.166120,-0.166343,0.914967,0.867276,0.867612,0.856093,...,0.951962,0.999252,0.989583,0.969516,0.930350,0.938956,0.968311,0.982378,0.955147,0.965178
2,sub-87964717,3,SMC,0.424449,-0.191903,-0.190771,0.848407,0.787413,0.820190,0.795381,...,1.007210,1.003786,0.997969,0.981911,1.021233,1.027465,1.020506,0.977939,0.966586,0.985563
3,sub-87964717,4,SMC,0.455659,-0.224328,-0.226784,0.869879,0.810332,0.822063,0.790237,...,0.987823,1.029448,1.023404,1.011507,0.995531,1.025771,1.032580,0.990293,0.982638,1.006213
4,sub-87964717,5,SMC,0.366557,-0.142051,-0.141354,0.872967,0.815916,0.830405,0.840788,...,1.042865,1.021432,1.011774,1.023296,1.010551,1.047893,1.041591,1.019603,1.005809,1.024247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,sub-88077525,8,MDD,0.376217,-0.146065,-0.147736,0.903393,0.871700,0.873741,0.868164,...,1.031435,1.044941,1.041568,1.042431,1.046058,1.039692,1.035620,1.032685,1.033959,1.031013
2696,sub-88077525,9,MDD,0.404842,-0.167801,-0.172205,0.875979,0.825309,0.853732,0.854332,...,1.028786,1.038328,1.036858,1.037825,1.042172,1.039622,1.034002,1.030621,1.033953,1.033908
2697,sub-88077525,10,MDD,0.319906,-0.106238,-0.102021,0.893305,0.865345,0.853355,0.861406,...,1.024468,1.031175,1.029130,1.028662,1.033037,1.034674,1.030642,1.026218,1.025883,1.026905
2698,sub-88077525,11,MDD,0.348354,-0.127219,-0.123276,0.891849,0.848841,0.859237,0.849557,...,1.028159,1.039565,1.030890,1.035503,1.040874,1.034199,1.029308,1.028963,1.031723,1.027599


In [69]:
df_selected_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_conn_features.pkl')