## AIM: reduce dimensionality by feature selection with BORUTA

A seperate BORUTA will be performed for each feature set (6 total):
1. statistical TFR features (for ratio, EC, and EO)
2. stat TFR features + connectivity features (for ratio, EC, and EO)

n_estimates & max_depth for RFs in BORUTA were determined for each feature set in 'boruta_hyperparameter_tuning.ipynb' located in the 'background_notebooks' subdir

In [9]:
from boruta import BorutaPy
import pandas as pd
import numpy as np
import mne
import pickle
import os

from sklearn.ensemble import RandomForestClassifier

# 1. statistical TFR features

In [10]:
df_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_features.pkl')
df_features['age'] = df_features['age'].round().astype('Int32') # round age to nearest year and convert to int
df_features = df_features.dropna(subset=['diagnosis'])
df_features.sample(7)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,...,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew,ratio_r_posterior_gamma_kurt
6593,sub-88066281,32,1.0,MDD,6,0.354021,-0.133072,-0.131381,-0.360056,0.340202,...,0.779333,0.545003,0.603597,4.429617,-4.061728,0.806558,0.584039,0.657993,5.42182,-5.600346
69,sub-87966337,58,1.0,SMC,10,0.344499,-0.116322,-0.116897,-0.642301,0.242954,...,1.146899,1.627117,1.575791,0.17079,0.47241,1.450624,3.012854,2.445614,-0.592662,1.365462
1796,sub-88011333,24,1.0,MDD,9,0.36698,-0.123264,-0.123401,-1.027417,2.147161,...,1.002552,1.06424,1.039252,0.599186,0.467584,1.086627,1.240398,1.253282,0.647121,1.122853
7348,sub-88072573,71,1.0,MDD,5,0.352858,-0.123107,-0.119466,-0.555479,-0.183167,...,0.904705,0.834675,0.825135,0.907947,0.398367,0.900629,0.821307,0.809937,1.01804,0.669341
6146,sub-88061829,48,1.0,ADHD,3,0.34165,-0.130367,-0.134802,-0.099623,-0.235092,...,0.934062,0.869845,0.880608,1.234495,9.05931,1.062249,1.07387,1.096663,1.534252,1.131127
3743,sub-88039193,7,1.0,ADHD,12,0.381017,-0.153673,-0.149083,-0.332437,-0.04592,...,0.99082,1.001293,1.046003,0.863973,0.666824,0.922756,0.878266,0.894611,1.058827,3.876068
323,sub-87967421,55,0.0,SMC,12,0.388022,-0.142892,-0.145524,-0.849505,1.310415,...,1.152866,1.35065,1.353014,0.756992,1.429903,1.201116,1.450808,1.323888,0.962447,1.141752


Subsampling dataset here to rebalance dataset. Possibly would have been more efficient to perform before feature extraction, however I already extracted features of the full sample, which allows me to not have to recompute features if we ever change this subsampling step. Additionally, for a still unknown reason, during feature extraction some participants data seem to be lossed, possibly due to the marked 'BAD' files from preprocessing.

In [11]:
# subsample dataset to rebalance dataset
df_ids = df_features[(df_features['epoch'] == 1)] # select only first epoch to make sure we subsample from participants, and not from epochs
df_ids_subsample = df_ids.groupby('diagnosis').sample(45, random_state=42) # sample 45 participants per diagnosis (which is the max for OCD)
df_ids_subsample_index = df_ids_subsample['ID'].tolist()
df_stat_subsample = df_features[(df_features['ID'].isin(df_ids_subsample_index))]
df_stat_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [12]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_features_ec = df_stat_subsample[df_features.columns[~(df_stat_subsample.columns.str.startswith('EO') | df_stat_subsample.columns.str.startswith('ratio'))]]
df_features_eo = df_stat_subsample[df_features.columns[~(df_stat_subsample.columns.str.startswith('EC') | df_stat_subsample.columns.str.startswith('ratio'))]]
df_features_ratio = df_stat_subsample[df_features.columns[~(df_stat_subsample.columns.str.startswith('EC') | df_stat_subsample.columns.str.startswith('EO'))]]

### Ratio features

In [13]:
# define numeric columns
numeric_cols = [num_col for num_col in df_features_ratio.columns if 'ratio' in num_col]
 
X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=190, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [14]:
selected_columns = df_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ratio = df_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ratio.shape)
df_selected_feat_ratio.sample(5)

(2700, 43)


Unnamed: 0,ID,epoch,diagnosis,ratio_l_frontal_delta_skew,ratio_r_central_delta_std,ratio_r_central_theta_std,ratio_r_central_theta_mean,ratio_r_central_theta_median,ratio_m_posterior_theta_std,ratio_m_posterior_theta_skew,...,ratio_l_posterior_gamma_skew,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew
1037,sub-87974617,6,HEALTHY,1.233971,1.118929,1.053579,1.168728,1.108943,1.12062,0.578223,...,2.121189,0.996538,0.986533,0.98768,1.130715,2.352754,1.00493,0.98006,1.00094,1.438444
5844,sub-88058633,1,HEALTHY,0.783002,0.907015,1.084074,1.100861,1.027637,0.882629,1.478581,...,1.231936,1.139191,1.244482,1.299983,1.066445,0.811547,0.997713,0.984748,0.804553,1.413358
356,sub-87967729,9,SMC,-0.43679,1.255392,1.526506,2.469541,2.362854,1.336713,-0.036846,...,0.794741,0.962432,0.901836,0.888977,1.206662,0.848008,0.993297,0.939794,0.909836,1.464227
483,sub-87968229,4,SMC,1.429541,1.320043,1.236285,1.415253,1.309029,0.992015,1.43582,...,0.575416,0.927624,0.823107,1.00355,1.399364,3.316476,0.977974,0.905741,0.905537,1.856935
2533,sub-88023529,2,MDD,2.995655,1.018076,1.264047,1.54866,1.605007,1.225934,5.235474,...,1.002283,1.131391,1.238534,1.149607,1.320463,8.917814,1.020217,1.085734,1.102653,0.719726


### EC features

In [15]:
# define numeric columns
numeric_cols = [num_col for num_col in df_features_ec.columns if 'EC' in num_col]
 
X = df_features_ec[numeric_cols]
Y = df_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=170, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [16]:
selected_columns = df_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ec = df_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ec.shape)
df_selected_feat_ec.sample(5)

(2700, 62)


Unnamed: 0,ID,epoch,diagnosis,EC_l_posterior_delta_std,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_m_frontal_theta_std,EC_r_frontal_theta_std,EC_r_frontal_theta_mean,...,EC_l_posterior_gamma_median,EC_l_posterior_gamma_skew,EC_m_posterior_gamma_std,EC_m_posterior_gamma_mean,EC_m_posterior_gamma_median,EC_m_posterior_gamma_skew,EC_r_posterior_gamma_std,EC_r_posterior_gamma_mean,EC_r_posterior_gamma_median,EC_r_posterior_gamma_skew
118,sub-87966473,11,SMC,0.361,0.278725,-0.0824,-0.082837,0.291125,0.292709,-0.091547,...,-0.074426,-0.434469,0.253251,-0.070034,-0.063653,-0.326626,0.253837,-0.069206,-0.069004,-0.439956
3829,sub-88040317,2,ADHD,0.38599,0.315141,-0.11418,-0.110867,0.307256,0.308327,-0.105264,...,-0.07663,-0.471768,0.290603,-0.08306,-0.079624,-0.820789,0.265776,-0.073279,-0.071196,-0.584701
7036,sub-88069649,5,MDD,0.315802,0.32426,-0.118766,-0.119059,0.338317,0.326811,-0.12022,...,-0.088418,-0.551703,0.293113,-0.089011,-0.086774,-0.589305,0.291287,-0.088403,-0.086351,-0.555309
1102,sub-87974973,11,HEALTHY,0.332874,0.281144,-0.091666,-0.084212,0.284847,0.318866,-0.117089,...,-0.150527,0.23538,0.319715,-0.125508,-0.102245,-0.050409,0.3239,-0.124802,-0.104108,-0.162607
5484,sub-88054225,1,OCD,0.399757,0.318368,-0.0992,-0.09761,0.296773,0.277723,-0.090811,...,-0.075922,-0.437814,0.277985,-0.077287,-0.076054,-0.714949,0.265993,-0.07469,-0.076316,-0.503855


### EO features

In [17]:
# define numeric columns
numeric_cols = [num_col for num_col in df_features_eo.columns if 'EO' in num_col]
 
X = df_features_eo[numeric_cols]
Y = df_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=170, n_jobs=-1, max_depth=5)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [18]:
selected_columns = df_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_eo = df_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_eo.shape)
df_selected_feat_eo.sample(5)

(2700, 102)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,EO_r_frontal_delta_mean,EO_m_central_delta_mean,EO_r_central_delta_std,EO_r_central_delta_mean,EO_r_central_delta_median,...,EO_m_posterior_gamma_std,EO_m_posterior_gamma_mean,EO_m_posterior_gamma_median,EO_m_posterior_gamma_skew,EO_m_posterior_gamma_kurt,EO_r_posterior_gamma_std,EO_r_posterior_gamma_mean,EO_r_posterior_gamma_median,EO_r_posterior_gamma_skew,EO_r_posterior_gamma_kurt
6933,sub-88068841,10,HEALTHY,-0.197953,-0.232924,-0.12544,-0.111258,0.390823,-0.144331,-0.146203,...,0.287532,-0.086424,-0.075811,-0.539149,0.352333,0.289671,-0.08778,-0.087915,-0.500158,0.252589
666,sub-87969125,7,SMC,0.032951,0.34045,-0.153825,-0.271177,0.402081,-0.190609,-0.183459,...,0.272821,-0.076283,-0.075875,-0.603762,0.522994,0.283451,-0.080963,-0.082509,-0.754849,1.7232
857,sub-87970345,6,SMC,-0.342508,0.367209,-0.128005,-0.139156,0.399094,-0.151797,-0.153386,...,0.265838,-0.074925,-0.077636,-0.448801,0.421761,0.265867,-0.072464,-0.072326,-0.618791,0.566448
1199,sub-87976773,12,HEALTHY,-0.687388,1.291325,-0.127022,-0.120564,0.375112,-0.13572,-0.137548,...,0.302442,-0.090793,-0.092133,-0.734105,0.664549,0.271916,-0.077775,-0.075948,-0.501906,0.473327
2977,sub-88028661,2,ADHD,-0.547852,0.290638,-0.134493,-0.123947,0.413656,-0.16271,-0.164255,...,0.309155,-0.108622,-0.10641,-0.120743,0.043035,0.336005,-0.131397,-0.120089,-0.149792,0.313014


### Merge selected ratio, EC, EO features

In [19]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_feat_eo = df_selected_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_feat_ratio = df_selected_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_features = pd.concat([df_selected_feat_ec, df_selected_feat_eo, df_selected_feat_ratio], axis=1)
df_selected_features

Unnamed: 0,ID,epoch,diagnosis,EC_l_posterior_delta_std,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_m_frontal_theta_std,EC_r_frontal_theta_std,EC_r_frontal_theta_mean,...,ratio_l_posterior_gamma_skew,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew
24,sub-87964717,1,SMC,0.360979,0.296711,-0.103983,-0.104952,0.316582,0.314024,-0.109632,...,0.522167,1.009346,1.078347,1.028052,0.603866,0.159593,1.020543,1.089969,1.206395,0.741274
25,sub-87964717,2,SMC,0.396142,0.320765,-0.101048,-0.091661,0.319229,0.345249,-0.117249,...,0.720480,0.983948,1.022707,1.084144,0.575195,0.079641,1.032616,1.096335,1.123558,0.756438
26,sub-87964717,3,SMC,0.360976,0.383631,-0.142807,-0.139421,0.369464,0.404183,-0.166141,...,0.956565,1.130790,1.256831,1.235856,0.971040,0.630136,1.125305,1.261261,1.274927,0.898000
27,sub-87964717,4,SMC,0.458912,0.400738,-0.190151,-0.198076,0.349989,0.383961,-0.168866,...,1.331098,0.972647,0.916448,1.059041,1.158384,0.639208,1.000264,0.949167,0.989528,1.394617
28,sub-87964717,5,SMC,0.336692,0.344149,-0.130853,-0.132630,0.345981,0.385940,-0.167488,...,0.835716,1.175073,1.492219,1.438212,0.648777,1.667125,1.144547,1.378300,1.134033,0.823934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7807,sub-88076717,8,OCD,0.401744,0.398381,-0.153613,-0.153747,0.385733,0.396440,-0.167367,...,1.016841,0.950693,0.876182,0.853135,1.312748,2.115164,0.976949,0.895213,1.004927,1.681671
7808,sub-88076717,9,OCD,0.372481,0.280391,-0.083200,-0.082427,0.269471,0.308572,-0.107010,...,1.180789,1.045766,1.099941,1.193314,0.852539,0.680275,1.058189,1.086473,1.128326,1.366124
7809,sub-88076717,10,OCD,0.464772,0.394338,-0.156866,-0.157289,0.364809,0.362061,-0.152433,...,0.856885,0.983628,0.994318,1.048994,0.816864,0.843761,0.975462,0.960748,0.960796,1.100767
7810,sub-88076717,11,OCD,0.416722,0.321075,-0.114733,-0.113885,0.355278,0.385819,-0.157592,...,2.178256,1.018119,1.014035,1.015445,1.066978,0.687766,1.006964,1.011169,0.984383,1.155833


In [20]:
df_selected_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_features.pkl')

# 2. statistical TFR + connectivity features

In [21]:
df_conn_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_connectivity_features.pkl')
df_features = df_features.dropna(subset=['diagnosis'])

In [22]:
# subsample dataset to rebalance dataset
df_conn_subsample = df_conn_features[(df_conn_features['ID'].isin(df_ids_subsample_index))] # use same sampled IDs from earlier for the stat features
df_conn_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [23]:
# merge stat and conn features and store for later training of GCNs
df_stat_conn_features = pd.merge(df_stat_subsample, df_conn_subsample.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
# drop age and gender columns
df_stat_conn_features = df_stat_conn_features.drop(columns=['age', 'gender'])
df_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_conn_features.pkl')
print(df_stat_conn_features.shape)
df_stat_conn_features.sample(3)

(2700, 1218)


Unnamed: 0,ID,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,EO_m_frontal_delta_std,EO_m_frontal_delta_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
85,sub-87967061,SMC,2,0.526946,-0.32149,-0.329293,0.138598,-0.040187,0.616136,-0.543819,...,1.051912,1.021816,1.007011,1.00089,1.077221,1.056034,1.042243,1.03475,1.038238,1.013438
569,sub-87974665,HEALTHY,6,0.337011,-0.108363,-0.111538,-0.824303,0.685945,0.333057,-0.108726,...,0.9912,1.007697,0.985133,0.97913,1.050544,1.046434,1.034819,1.031872,1.076038,1.059462
1434,sub-88042661,ADHD,7,0.38442,-0.153788,-0.142895,-0.302585,-0.351258,0.38206,-0.149558,...,1.022694,1.095642,1.085537,1.088894,1.040364,1.024484,1.028787,1.060162,1.074329,1.057202


In [24]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_conn_features_ec = df_conn_subsample[df_conn_features.columns[~(df_conn_subsample.columns.str.startswith('EO') | df_conn_subsample.columns.str.startswith('ratio'))]]
df_conn_features_eo = df_conn_subsample[df_conn_features.columns[~(df_conn_subsample.columns.str.startswith('EC') | df_conn_subsample.columns.str.startswith('ratio'))]]
df_conn_features_ratio = df_conn_subsample[df_conn_features.columns[~(df_conn_subsample.columns.str.startswith('EC') | df_conn_subsample.columns.str.startswith('EO'))]]

In [25]:
# concat connectivity features with stat features
df_stat_conn_features_ec = pd.merge(df_features_ec, df_conn_features_ec.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
df_stat_conn_features_eo = pd.merge(df_features_eo, df_conn_features_eo.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
df_stat_conn_features_ratio = pd.merge(df_features_ratio, df_conn_features_ratio.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
print(df_stat_conn_features_ec.shape)



(2700, 410)


### Ratio features

In [26]:
# define numeric columns
numeric_cols = [num_col for num_col in df_stat_conn_features_ratio.columns if 'ratio' in num_col]
 
X = df_stat_conn_features_ratio[numeric_cols]
Y = df_stat_conn_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=15)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [27]:
selected_columns = df_stat_conn_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_ratio = df_stat_conn_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_ratio.shape)
df_selected_stat_conn_feat_ratio.sample(5)

(2700, 188)


Unnamed: 0,ID,epoch,diagnosis,ratio_l_frontal_delta_skew,ratio_r_central_theta_std,ratio_r_central_theta_median,ratio_m_posterior_theta_std,ratio_m_posterior_theta_skew,ratio_r_posterior_theta_std,ratio_r_posterior_theta_median,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
2584,sub-88073797,5,MDD,1.734636,0.845964,0.701163,0.926052,1.407399,0.958076,0.881047,...,1.068738,1.005128,1.048565,1.049145,1.032499,1.061583,1.043382,1.001597,0.992954,1.014573
1618,sub-88048413,11,OCD,1.764382,1.141451,1.505451,1.146409,0.680985,1.174552,1.433504,...,1.101121,1.089355,1.066655,1.08946,1.178309,1.147828,1.11785,1.040817,1.090683,1.072821
1697,sub-88051073,6,HEALTHY,-0.923252,0.800433,0.684456,0.817995,-3.058665,0.787956,0.692607,...,1.112452,1.01436,1.030111,1.022473,1.057986,1.079725,1.030821,1.056365,1.019626,1.049995
2433,sub-88068797,10,ADHD,2.051265,0.874699,0.836828,0.998416,0.352311,1.060356,1.101439,...,1.039875,1.094727,1.090382,1.096736,1.083691,1.053736,1.053583,1.060015,1.069017,1.061576
470,sub-87970969,3,SMC,-0.216021,0.929585,0.672073,0.755853,-15.893336,0.824979,0.576558,...,0.997913,0.990705,1.003559,1.008891,0.945486,0.988112,0.996087,0.996193,0.969387,0.996419


### EC features

In [28]:
# define numeric columns
numeric_cols = [num_col for num_col in df_stat_conn_features_ec.columns if 'EC' in num_col]
 
X = df_stat_conn_features_ec[numeric_cols]
Y = df_stat_conn_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=185, n_jobs=-1, max_depth=20)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [29]:
selected_columns = df_stat_conn_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_ec = df_stat_conn_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_ec.shape)
df_selected_stat_conn_feat_ec.sample(5)

(2700, 275)


Unnamed: 0,ID,epoch,diagnosis,EC_r_central_delta_std,EC_r_central_delta_mean,EC_r_central_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_m_frontal_theta_std,...,EC_gamma_m_central-r_central,EC_gamma_m_central-l_posterior,EC_gamma_m_central-m_posterior,EC_gamma_m_central-r_posterior,EC_gamma_r_central-l_posterior,EC_gamma_r_central-m_posterior,EC_gamma_r_central-r_posterior,EC_gamma_l_posterior-m_posterior,EC_gamma_l_posterior-r_posterior,EC_gamma_m_posterior-r_posterior
1899,sub-88054937,4,OCD,0.332098,-0.122867,-0.113763,0.328039,-0.111721,-0.108884,0.322662,...,0.729098,0.719448,0.734877,0.7091,0.672243,0.699778,0.726153,0.778882,0.725873,0.771979
793,sub-87980417,2,HEALTHY,0.355726,-0.131721,-0.12841,0.367791,-0.157296,-0.160873,0.381967,...,0.796595,0.761514,0.750052,0.743671,0.719313,0.736035,0.772624,0.783369,0.749037,0.765651
512,sub-87971197,9,SMC,0.414198,-0.167084,-0.160148,0.387119,-0.194502,-0.183695,0.395064,...,0.72836,0.675939,0.703368,0.681176,0.581422,0.633229,0.716224,0.758735,0.663426,0.728528
2093,sub-88059397,6,HEALTHY,0.346646,-0.127832,-0.125848,0.358515,-0.131966,-0.130152,0.336782,...,0.784685,0.697352,0.745172,0.70732,0.668015,0.728186,0.752003,0.772532,0.708964,0.771122
590,sub-87974841,3,HEALTHY,0.399027,-0.156863,-0.14461,0.301474,-0.094987,-0.096058,0.25127,...,0.775076,0.764817,0.751841,0.756274,0.716758,0.741123,0.811943,0.777369,0.768977,0.776461


### EO features

In [30]:
# define numeric columns
numeric_cols = [num_col for num_col in df_stat_conn_features_eo.columns if 'EO' in num_col]
 
X = df_stat_conn_features_eo[numeric_cols]
Y = df_stat_conn_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=20)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [31]:
selected_columns = df_stat_conn_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_eo = df_stat_conn_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_eo.shape)
df_selected_stat_conn_feat_eo.sample(5)

(2700, 276)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_theta_std,EO_l_frontal_theta_mean,EO_r_frontal_theta_std,EO_r_frontal_theta_mean,EO_r_frontal_theta_median,EO_l_central_theta_std,EO_l_central_theta_mean,...,EO_gamma_m_central-r_central,EO_gamma_m_central-l_posterior,EO_gamma_m_central-m_posterior,EO_gamma_m_central-r_posterior,EO_gamma_r_central-l_posterior,EO_gamma_r_central-m_posterior,EO_gamma_r_central-r_posterior,EO_gamma_l_posterior-m_posterior,EO_gamma_l_posterior-r_posterior,EO_gamma_m_posterior-r_posterior
2564,sub-88073521,9,ADHD,0.36904,-0.130609,0.333611,-0.103903,-0.108409,0.300973,-0.089251,...,0.926426,0.893795,0.9046,0.896193,0.882448,0.898397,0.906914,0.918846,0.907781,0.919347
44,sub-87966473,9,SMC,0.330724,-0.100442,0.298581,-0.095506,-0.097355,0.345872,-0.130806,...,0.77675,0.728496,0.754226,0.719271,0.702038,0.747659,0.781849,0.80568,0.743799,0.807446
97,sub-87967325,2,SMC,0.290488,-0.088545,0.324689,-0.110936,-0.106055,0.295522,-0.090128,...,0.857844,0.830164,0.842875,0.83896,0.792319,0.824158,0.865513,0.867315,0.842301,0.871122
302,sub-87969125,3,SMC,0.361798,-0.127621,0.361973,-0.130708,-0.133362,0.334124,-0.13081,...,0.757515,0.729445,0.759878,0.723909,0.668298,0.710833,0.75227,0.816734,0.733191,0.789267
1352,sub-88039057,9,MDD,0.287262,-0.090314,0.280382,-0.083377,-0.081038,0.288889,-0.096891,...,0.811852,0.78849,0.810385,0.761601,0.775973,0.805451,0.82195,0.821838,0.779955,0.811165


### Merge selected ratio, EC, EO features

In [32]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_stat_conn_feat_eo = df_selected_stat_conn_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_stat_conn_feat_ratio = df_selected_stat_conn_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_stat_conn_features = pd.concat([df_selected_stat_conn_feat_ec, df_selected_stat_conn_feat_eo, df_selected_stat_conn_feat_ratio], axis=1)
df_selected_stat_conn_features

Unnamed: 0,ID,epoch,diagnosis,EC_r_central_delta_std,EC_r_central_delta_mean,EC_r_central_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_m_frontal_theta_std,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
0,sub-87964717,1,SMC,0.396625,-0.174628,-0.177734,0.296711,-0.103983,-0.104952,0.316582,...,0.983375,0.982205,0.973501,0.957588,0.965101,0.975819,0.977929,0.973242,0.955600,0.985688
1,sub-87964717,2,SMC,0.394062,-0.158610,-0.156196,0.320765,-0.101048,-0.091661,0.319229,...,0.951962,0.999252,0.989583,0.969516,0.930350,0.938956,0.968311,0.982378,0.955147,0.965178
2,sub-87964717,3,SMC,0.410010,-0.196367,-0.187606,0.383631,-0.142807,-0.139421,0.369464,...,1.007210,1.003786,0.997969,0.981911,1.021233,1.027465,1.020506,0.977939,0.966586,0.985563
3,sub-87964717,4,SMC,0.469550,-0.230582,-0.225645,0.400738,-0.190151,-0.198076,0.349989,...,0.987823,1.029448,1.023404,1.011507,0.995531,1.025771,1.032580,0.990293,0.982638,1.006213
4,sub-87964717,5,SMC,0.355055,-0.131668,-0.134253,0.344149,-0.130853,-0.132630,0.345981,...,1.042865,1.021432,1.011774,1.023296,1.010551,1.047893,1.041591,1.019603,1.005809,1.024247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,sub-88076717,8,OCD,0.474489,-0.223597,-0.222273,0.398381,-0.153613,-0.153747,0.385733,...,1.050150,1.093657,1.076960,1.060231,1.153361,1.122004,1.055451,1.082843,1.109466,1.075791
2696,sub-88076717,9,OCD,0.351207,-0.151888,-0.148171,0.280391,-0.083200,-0.082427,0.269471,...,1.095957,1.144254,1.111357,1.115784,1.178204,1.139658,1.100500,1.109036,1.142876,1.095822
2697,sub-88076717,10,OCD,0.367805,-0.150474,-0.144268,0.394338,-0.156866,-0.157289,0.364809,...,1.059378,1.114762,1.086334,1.146659,1.183257,1.136850,1.136010,1.085056,1.136426,1.107047
2698,sub-88076717,11,OCD,0.420107,-0.182264,-0.170567,0.321075,-0.114733,-0.113885,0.355278,...,1.048840,1.054833,1.083075,1.104262,1.037404,1.049401,1.063298,1.047424,1.064664,1.070277


In [33]:
df_selected_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_conn_features.pkl')