## AIM: reduce dimensionality by feature selection with BORUTA

A separate BORUTA run will be performed for each feature set (6 total):
1. statistical TFR features (for ratio, EC, and EO)
2. stat TFR features + connectivity features (for ratio, EC, and EO)

n_estimators & max_depth for the RFs in BORUTA were determined for each feature set in 'boruta_hyperparameter_tuning.ipynb', located in the 'background_notebooks' subdir.

In [1]:
from boruta import BorutaPy
import pandas as pd
import numpy as np
import mne
import pickle
import os

from sklearn.ensemble import RandomForestClassifier

# 1. statistical TFR features

In [2]:
# Load the extracted statistical TFR features.
df_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_features.pkl')
# Round age to the nearest year (stored with the 'Int32' dtype) and discard
# every row that has no diagnosis label.
df_features = (
    df_features
    .assign(age=df_features['age'].round().astype('Int32'))
    .dropna(subset=['diagnosis'])
)
df_features.sample(7)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,...,ratio_r_posterior_beta_std,ratio_r_posterior_beta_mean,ratio_r_posterior_beta_median,ratio_r_posterior_beta_skew,ratio_r_posterior_beta_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew,ratio_r_posterior_gamma_kurt
230,sub-87967057,74,0.0,SMC,3,0.131317,0.091946,0.091946,3.326113,12.44258,...,0.579151,0.901946,0.760345,1.104035,1.355222,0.308126,0.515739,0.439457,2.147945,7.484372
5469,sub-88054173,40,1.0,ADHD,10,0.057125,0.079952,0.079952,1.112481,0.682541,...,1.012567,1.110645,0.778183,2.241343,7.032785,0.101951,0.255756,0.239199,1.375131,3.180725
6659,sub-88066729,55,1.0,MDD,12,0.044415,0.068792,0.068792,1.948363,5.399892,...,0.858547,1.095156,0.773212,1.741732,2.656008,0.278072,0.637446,0.595991,1.145482,2.196289
1724,sub-88010033,31,0.0,MDD,9,0.097276,0.11537,0.11537,2.038411,4.445757,...,2.580043,2.160914,1.096419,2.925703,10.336452,1.176784,1.072574,0.662941,2.62079,6.757755
1618,sub-88006477,44,0.0,MDD,11,0.053302,0.055145,0.055145,2.1772,5.317627,...,0.70459,1.09455,0.907928,1.748422,3.573143,0.520738,1.005755,0.873124,2.047487,6.556207
3028,sub-88029557,27,0.0,ADHD,5,0.083229,0.083219,0.083219,2.246496,5.5912,...,0.993092,1.448815,1.134454,1.679474,2.715663,0.297821,0.427952,0.349677,2.642864,9.802142
511,sub-87968405,59,0.0,SMC,8,0.054586,0.067259,0.067259,1.980829,4.419257,...,0.581078,0.580018,0.367718,2.184806,5.318295,0.020762,0.042778,0.037304,1.84608,4.875539


The dataset is subsampled here to rebalance the diagnosis groups. It would possibly have been more efficient to do this before feature extraction; however, I had already extracted the features of the full sample, which means the features do not have to be recomputed if we ever change this subsampling step. Additionally, for a still unknown reason, some participants' data seem to be lost during feature extraction, possibly due to the files marked 'BAD' during preprocessing.

In [3]:
# Rebalance the dataset by drawing the same number of participants for every diagnosis.
# Only first-epoch rows are used as the sampling pool, to make sure we subsample
# from participants and not from epochs.
df_ids = df_features.loc[df_features['epoch'] == 1]
# 45 participants per diagnosis (which is the max for OCD); the seed is fixed so the draw is repeatable
df_ids_subsample = df_ids.groupby('diagnosis').sample(n=45, random_state=42)
df_ids_subsample_index = df_ids_subsample['ID'].tolist()
# keep all rows (every epoch) that belong to a sampled participant
df_stat_subsample = df_features.loc[df_features['ID'].isin(df_ids_subsample_index)]
df_stat_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [4]:
# create 3 feature sets [EC, EO, ratio]; each keeps the non-feature columns
# ['ID', 'age', 'gender', 'diagnosis', 'epoch'] and drops the feature columns of the other two conditions.
# The prefix masks are computed from, and applied to, the same frame (df_stat_subsample), so the
# selection cannot go out of sync with another frame's column order.
stat_cols = df_stat_subsample.columns
is_ec = stat_cols.str.startswith('EC')
is_eo = stat_cols.str.startswith('EO')
is_ratio = stat_cols.str.startswith('ratio')

df_features_ec = df_stat_subsample.loc[:, ~(is_eo | is_ratio)]
df_features_eo = df_stat_subsample.loc[:, ~(is_ec | is_ratio)]
df_features_ratio = df_stat_subsample.loc[:, ~(is_ec | is_eo)]

### Ratio features

In [5]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# define numeric columns: the ratio feature columns, recognised by 'ratio' in their name
numeric_cols = [num_col for num_col in df_features_ratio.columns if 'ratio' in num_col]

X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)
# SimpleImputer discards columns that consist only of missing values. If that happened, trans.support_
# would no longer line up with numeric_cols when the selected columns are looked up afterwards.
assert X.shape[1] == len(numeric_cols), 'imputer dropped columns; support_ would not match numeric_cols'

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

# The installed BorutaPy apparently still refers to np.int / np.float / np.bool, the aliases of the
# Python builtins that were removed from NumPy in 1.24. Restore them as those builtins, and only
# when they are missing, so an attribute that NumPy itself provides is never overwritten.
for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias):
        setattr(np, alias, builtin_type)

# NOTE(review): BORUTA is fitted on every epoch of every sampled participant. If the selected features
# are later evaluated on held-out participants, confirm that selection is repeated inside the training folds.
clf = RandomForestClassifier(n_jobs=-1, max_depth=20, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=3, n_estimators=195)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	54
Tentative: 	50
Rejected: 	121
Iteration: 	9 / 100
Confirmed: 	54
Tentative: 	50
Rejected: 	121
Iteration: 	10 / 100
Confirmed: 	54
Tentative: 	50
Rejected: 	121
Iteration: 	11 / 100
Confirmed: 	54
Tentative: 	50
Rejected: 	121
Iteration: 	12 / 100
Confirmed: 	60
Tentative: 	44
Rejected: 	121
Iteration: 	13 / 100
Confirmed: 	60
Tentative: 	39
Rejected: 	126
Iteration: 	14 / 100
Confirmed: 	60
Tentative: 	39
Rejected: 	126
Iteration: 	15 / 100
Confirmed: 	60
Tentative: 	39
Rejected: 	126
Iteration: 	16 / 100
Confirmed: 	

In [6]:
# Translate the BORUTA result back to column names: trans.support_ indexes the
# feature columns that were passed to the selector (numeric_cols).
feature_columns = df_features_ratio[numeric_cols].columns
selected_columns = list(feature_columns[trans.support_])
# identifier / label columns first, followed by the selected features
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_feat_ratio = df_features_ratio.loc[:, standard_columns]
print(df_selected_feat_ratio.shape)
df_selected_feat_ratio.sample(5)

(2700, 69)


Unnamed: 0,ID,epoch,diagnosis,ratio_l_frontal_theta_mean,ratio_l_frontal_theta_median,ratio_l_frontal_gamma_std,ratio_l_frontal_gamma_mean,ratio_l_frontal_gamma_median,ratio_m_frontal_theta_mean,ratio_m_frontal_beta_median,...,ratio_r_posterior_delta_median,ratio_r_posterior_theta_mean,ratio_r_posterior_theta_median,ratio_r_posterior_alpha_median,ratio_r_posterior_beta_std,ratio_r_posterior_beta_mean,ratio_r_posterior_beta_median,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median
1138,sub-87976413,11,HEALTHY,1.164398,0.873991,0.278515,0.513499,0.448008,1.202425,1.420343,...,0.813469,1.183523,0.642713,1.440177,1.357555,1.595337,1.146901,0.913251,1.023929,0.767025
1322,sub-87980913,3,HEALTHY,2.865208,1.735721,0.672841,0.789467,0.657561,2.367281,1.058802,...,0.483612,1.514127,1.037735,3.816115,1.406119,1.962848,1.598665,0.261337,0.391174,0.37746
5468,sub-88054173,9,ADHD,2.300222,1.325076,0.174439,0.306549,0.266772,2.122678,1.491304,...,0.478715,2.105531,1.350527,2.303889,0.44878,0.67673,0.538431,0.05786,0.112486,0.097701
4076,sub-88042661,9,ADHD,3.249032,2.050577,2.422123,1.427004,0.581138,3.946551,0.914172,...,1.269519,3.750454,1.918219,3.391514,0.721442,0.73548,0.445581,0.719139,0.468172,0.22947
1042,sub-87974617,11,HEALTHY,1.630862,1.187309,0.342146,0.864981,0.80286,1.48344,1.50712,...,0.731819,1.781351,1.014684,1.900307,0.956504,1.614549,1.405128,3.02493,3.174462,2.530971


### EC features

In [7]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# define numeric columns: the EC feature columns, recognised by 'EC' in their name
numeric_cols = [num_col for num_col in df_features_ec.columns if 'EC' in num_col]

X = df_features_ec[numeric_cols]
Y = df_features_ec['diagnosis']

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)
# SimpleImputer discards columns that consist only of missing values. If that happened, trans.support_
# would no longer line up with numeric_cols when the selected columns are looked up afterwards.
assert X.shape[1] == len(numeric_cols), 'imputer dropped columns; support_ would not match numeric_cols'

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

# The installed BorutaPy apparently still refers to np.int / np.float / np.bool, the aliases of the
# Python builtins that were removed from NumPy in 1.24. Restore them as those builtins, and only
# when they are missing, so an attribute that NumPy itself provides is never overwritten.
for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias):
        setattr(np, alias, builtin_type)

# NOTE(review): BORUTA is fitted on every epoch of every sampled participant. If the selected features
# are later evaluated on held-out participants, confirm that selection is repeated inside the training folds.
clf = RandomForestClassifier(n_jobs=-1, max_depth=20, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=10, n_estimators=200)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	135
Tentative: 	32
Rejected: 	58
Iteration: 	9 / 100
Confirmed: 	135
Tentative: 	32
Rejected: 	58
Iteration: 	10 / 100
Confirmed: 	135
Tentative: 	32
Rejected: 	58
Iteration: 	11 / 100
Confirmed: 	135
Tentative: 	32
Rejected: 	58
Iteration: 	12 / 100
Confirmed: 	141
Tentative: 	26
Rejected: 	58
Iteration: 	13 / 100
Confirmed: 	141
Tentative: 	18
Rejected: 	66
Iteration: 	14 / 100
Confirmed: 	141
Tentative: 	18
Rejected: 	66
Iteration: 	15 / 100
Confirmed: 	141
Tentative: 	18
Rejected: 	66
Iteration: 	16 / 100
Confirmed: 	

In [8]:
# Translate the BORUTA result back to column names: trans.support_ indexes the
# feature columns that were passed to the selector (numeric_cols).
feature_columns = df_features_ec[numeric_cols].columns
selected_columns = list(feature_columns[trans.support_])
# identifier / label columns first, followed by the selected features
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_feat_ec = df_features_ec.loc[:, standard_columns]
print(df_selected_feat_ec.shape)
df_selected_feat_ec.sample(5)

(2700, 150)


Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_delta_mean,EC_l_frontal_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_l_frontal_alpha_std,EC_l_frontal_alpha_mean,...,EC_r_posterior_alpha_median,EC_r_posterior_alpha_skew,EC_r_posterior_beta_std,EC_r_posterior_beta_mean,EC_r_posterior_beta_median,EC_r_posterior_gamma_std,EC_r_posterior_gamma_mean,EC_r_posterior_gamma_median,EC_r_posterior_gamma_skew,EC_r_posterior_gamma_kurt
2190,sub-88019033,7,ADHD,0.049442,0.049442,0.036766,0.048052,0.048052,0.054522,0.06929,...,0.081476,2.268587,0.011888,0.031144,0.031144,0.002748,0.009717,0.009717,0.219625,0.058755
2536,sub-88023529,5,MDD,0.075909,0.075909,0.033465,0.048818,0.048818,0.066115,0.05985,...,0.042445,1.636998,0.017411,0.022479,0.022479,0.001506,0.004912,0.004912,0.575311,-0.286306
326,sub-87967509,3,SMC,0.05058,0.05058,0.038611,0.067996,0.067996,0.055883,0.07875,...,0.075164,2.161599,0.003428,0.007617,0.007617,0.000428,0.001477,0.001477,0.304032,-0.01798
6758,sub-88067493,3,ADHD,0.086632,0.086632,0.032141,0.067233,0.067233,0.022857,0.040665,...,0.057605,0.83386,0.007148,0.014202,0.014202,0.00108,0.004127,0.004127,0.33533,-0.117589
3456,sub-88035229,1,MDD,0.097505,0.097505,0.041207,0.07134,0.07134,0.017822,0.032206,...,0.047941,0.747524,0.004921,0.015025,0.015025,0.00098,0.003169,0.003169,1.76078,4.571128


### EO features

In [9]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# define numeric columns: the EO feature columns, recognised by 'EO' in their name
numeric_cols = [num_col for num_col in df_features_eo.columns if 'EO' in num_col]

X = df_features_eo[numeric_cols]
Y = df_features_eo['diagnosis']

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)
# SimpleImputer discards columns that consist only of missing values. If that happened, trans.support_
# would no longer line up with numeric_cols when the selected columns are looked up afterwards.
assert X.shape[1] == len(numeric_cols), 'imputer dropped columns; support_ would not match numeric_cols'

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

# The installed BorutaPy apparently still refers to np.int / np.float / np.bool, the aliases of the
# Python builtins that were removed from NumPy in 1.24. Restore them as those builtins, and only
# when they are missing, so an attribute that NumPy itself provides is never overwritten.
for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias):
        setattr(np, alias, builtin_type)

# NOTE(review): BORUTA is fitted on every epoch of every sampled participant. If the selected features
# are later evaluated on held-out participants, confirm that selection is repeated inside the training folds.
clf = RandomForestClassifier(n_jobs=-1, max_depth=20, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=10, n_estimators=195)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	118
Tentative: 	37
Rejected: 	70
Iteration: 	9 / 100
Confirmed: 	118
Tentative: 	37
Rejected: 	70
Iteration: 	10 / 100
Confirmed: 	118
Tentative: 	37
Rejected: 	70
Iteration: 	11 / 100
Confirmed: 	118
Tentative: 	37
Rejected: 	70
Iteration: 	12 / 100
Confirmed: 	121
Tentative: 	34
Rejected: 	70
Iteration: 	13 / 100
Confirmed: 	121
Tentative: 	27
Rejected: 	77
Iteration: 	14 / 100
Confirmed: 	121
Tentative: 	27
Rejected: 	77
Iteration: 	15 / 100
Confirmed: 	121
Tentative: 	27
Rejected: 	77
Iteration: 	16 / 100
Confirmed: 	

In [10]:
# Translate the BORUTA result back to column names: trans.support_ indexes the
# feature columns that were passed to the selector (numeric_cols).
feature_columns = df_features_eo[numeric_cols].columns
selected_columns = list(feature_columns[trans.support_])
# identifier / label columns first, followed by the selected features
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_feat_eo = df_features_eo.loc[:, standard_columns]
print(df_selected_feat_eo.shape)
df_selected_feat_eo.sample(5)

(2700, 138)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_theta_std,EO_l_frontal_theta_mean,EO_l_frontal_theta_median,EO_l_frontal_alpha_std,...,EO_r_posterior_alpha_std,EO_r_posterior_alpha_mean,EO_r_posterior_alpha_median,EO_r_posterior_beta_std,EO_r_posterior_beta_mean,EO_r_posterior_beta_median,EO_r_posterior_gamma_std,EO_r_posterior_gamma_mean,EO_r_posterior_gamma_median,EO_r_posterior_gamma_skew
7259,sub-88071677,12,MDD,0.080671,0.075655,0.075655,0.04288,0.056677,0.056677,0.02886,...,0.021461,0.046086,0.046086,0.011956,0.029646,0.029646,0.003663,0.011674,0.011674,1.194249
5638,sub-88056021,11,ADHD,0.063258,0.108238,0.108238,0.042544,0.072637,0.072637,0.015852,...,0.065366,0.054795,0.054795,0.00232,0.004575,0.004575,9.8e-05,0.000294,0.000294,1.553898
2895,sub-88027577,4,ADHD,0.064711,0.104482,0.104482,0.037772,0.065109,0.065109,0.020732,...,0.012652,0.031962,0.031962,0.003389,0.008637,0.008637,0.000729,0.002004,0.002004,1.004588
4990,sub-88049813,11,MDD,0.073421,0.085408,0.085408,0.03991,0.064025,0.064025,0.021117,...,0.032505,0.04556,0.04556,0.00671,0.013511,0.013511,0.001145,0.004198,0.004198,0.221794
5364,sub-88053453,1,HEALTHY,0.057831,0.088464,0.088464,0.031597,0.065411,0.065411,0.020111,...,0.020702,0.033904,0.033904,0.006913,0.012628,0.012628,0.000327,0.00124,0.00124,0.395833


### Merge selected ratio, EC, EO features

In [11]:
# Merge the selected EC, EO and ratio features into one frame.
# The 'ID', 'epoch', 'diagnosis' columns are kept from the EC frame only. The EO and ratio frames are
# NOT overwritten here (their ID columns are dropped on the fly), so this cell can be re-run safely.
id_columns = ['ID', 'epoch', 'diagnosis']

# pd.concat(axis=1) aligns rows on the index; verify that every frame describes the same
# participant/epoch on the same index label before the identifying columns are discarded.
for frame in (df_selected_feat_eo, df_selected_feat_ratio):
    assert frame[id_columns].equals(df_selected_feat_ec[id_columns]), 'selected feature frames are not row-aligned'

# Concatenate the dataframes
df_selected_features = pd.concat(
    [
        df_selected_feat_ec,
        df_selected_feat_eo.drop(columns=id_columns),
        df_selected_feat_ratio.drop(columns=id_columns),
    ],
    axis=1,
)
df_selected_features

Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_delta_mean,EC_l_frontal_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_l_frontal_alpha_std,EC_l_frontal_alpha_mean,...,ratio_r_posterior_delta_median,ratio_r_posterior_theta_mean,ratio_r_posterior_theta_median,ratio_r_posterior_alpha_median,ratio_r_posterior_beta_std,ratio_r_posterior_beta_mean,ratio_r_posterior_beta_median,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median
24,sub-87964717,1,SMC,0.038548,0.038548,0.038556,0.052609,0.052609,0.063854,0.091404,...,0.510265,1.349741,0.968623,2.153139,1.198007,1.644368,1.278645,0.429809,0.727186,0.632788
25,sub-87964717,2,SMC,0.064845,0.064845,0.030842,0.056860,0.056860,0.041251,0.068083,...,1.032241,1.986691,1.382212,1.536313,0.634587,0.918662,0.722950,0.165883,0.292965,0.252897
26,sub-87964717,3,SMC,0.060132,0.060132,0.032789,0.049954,0.049954,0.040715,0.072963,...,0.865775,1.875845,1.188569,1.681806,0.710918,1.168534,0.985158,0.220094,0.396351,0.343460
27,sub-87964717,4,SMC,0.068201,0.068201,0.061580,0.059407,0.059407,0.040781,0.060521,...,1.053052,1.827830,1.173045,1.639288,0.644873,0.873910,0.704022,0.175540,0.296261,0.264274
28,sub-87964717,5,SMC,0.060723,0.060723,0.035185,0.048441,0.048441,0.054241,0.074711,...,0.940136,1.289571,0.896923,1.350853,0.370200,0.759917,0.699207,0.221074,0.424809,0.350868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7807,sub-88076717,8,OCD,0.052290,0.052290,0.053702,0.070063,0.070063,0.041325,0.066680,...,1.219489,2.867796,1.820891,2.639839,3.794338,4.965127,3.652019,1.843660,3.089657,2.700059
7808,sub-88076717,9,OCD,0.068988,0.068988,0.041480,0.073598,0.073598,0.029228,0.054087,...,0.629160,1.753572,1.330836,2.605999,2.561313,2.929883,2.185694,0.545471,1.459109,1.403218
7809,sub-88076717,10,OCD,0.076952,0.076952,0.060294,0.072454,0.072454,0.026849,0.047361,...,0.673403,1.728405,1.069284,1.497162,1.769562,2.767741,2.322909,1.098164,2.350843,2.186206
7810,sub-88076717,11,OCD,0.087063,0.087063,0.045478,0.061230,0.061230,0.032789,0.051911,...,1.129796,2.133611,1.025203,1.266488,1.822750,2.245996,1.705901,1.059606,1.930280,1.660883


In [12]:
# Store the merged selection of statistical features on disk.
selected_stat_features_path = r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_features.pkl'
df_selected_features.to_pickle(selected_stat_features_path)

# 2. statistical TFR + connectivity features

In [13]:
# Load the extracted connectivity features.
df_conn_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_connectivity_features.pkl')
# Drop rows without a diagnosis label. This has to act on the connectivity frame that was just
# loaded; the statistical frame (df_features) was already cleaned when it was loaded.
df_conn_features = df_conn_features.dropna(subset=['diagnosis'])

In [14]:
# Rebalance the connectivity features with the participant IDs that were already
# sampled for the stat features, so both feature sets cover the same participants.
in_subsample = df_conn_features['ID'].isin(df_ids_subsample_index)
df_conn_subsample = df_conn_features.loc[in_subsample]
df_conn_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [15]:
# Combine stat and conn features per (ID, epoch) and store the result for later training of GCNs.
# 'diagnosis' is removed from the connectivity side so the merged frame carries it only once;
# 'age' and 'gender' are dropped from the merged result.
conn_without_label = df_conn_subsample.drop(columns=['diagnosis'])
df_stat_conn_features = (
    df_stat_subsample
    .merge(conn_without_label, how='outer', on=['ID', 'epoch'])
    .drop(columns=['age', 'gender'])
)
df_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_conn_features.pkl')
print(df_stat_conn_features.shape)
df_stat_conn_features.sample(3)

(2700, 1218)


Unnamed: 0,ID,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,EO_l_frontal_theta_std,EO_l_frontal_theta_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
2142,sub-88061325,ADHD,7,0.047699,0.07002,0.07002,1.450376,2.170555,0.073392,0.081435,...,1.005611,1.010197,1.015933,1.031216,0.989211,1.001109,1.016366,1.00521,0.999851,0.997668
1227,sub-88028661,ADHD,4,0.072364,0.118348,0.118348,0.779897,0.262668,0.031054,0.068841,...,1.058662,1.091577,1.092001,1.088674,1.133577,1.103998,1.088202,1.089568,1.083906,1.042405
2373,sub-88067449,ADHD,10,0.076451,0.098355,0.098355,1.552793,2.56209,0.043211,0.066036,...,0.995175,1.021824,1.005824,1.007725,1.035576,1.023221,1.027151,1.008641,1.022826,1.006733


In [16]:
# create 3 connectivity feature sets [EC, EO, ratio]; each keeps the non-feature columns
# and drops the feature columns of the other two conditions.
# The prefix masks are computed from, and applied to, the same frame (df_conn_subsample), so the
# selection cannot go out of sync with another frame's column order.
conn_cols = df_conn_subsample.columns
is_ec = conn_cols.str.startswith('EC')
is_eo = conn_cols.str.startswith('EO')
is_ratio = conn_cols.str.startswith('ratio')

df_conn_features_ec = df_conn_subsample.loc[:, ~(is_eo | is_ratio)]
df_conn_features_eo = df_conn_subsample.loc[:, ~(is_ec | is_ratio)]
df_conn_features_ratio = df_conn_subsample.loc[:, ~(is_ec | is_eo)]

In [17]:
# Join the connectivity features onto the stat features, separately for each condition.
# Rows are matched on (ID, epoch); 'diagnosis' is taken from the stat frame only.
merge_keys = ['ID', 'epoch']
df_stat_conn_features_ec = df_features_ec.merge(
    df_conn_features_ec.drop(columns=['diagnosis']), how='outer', on=merge_keys
)
df_stat_conn_features_eo = df_features_eo.merge(
    df_conn_features_eo.drop(columns=['diagnosis']), how='outer', on=merge_keys
)
df_stat_conn_features_ratio = df_features_ratio.merge(
    df_conn_features_ratio.drop(columns=['diagnosis']), how='outer', on=merge_keys
)
print(df_stat_conn_features_ec.shape)



(2700, 410)


### Ratio features

In [18]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# define numeric columns: the ratio feature columns (stat + connectivity), recognised by 'ratio' in their name
numeric_cols = [num_col for num_col in df_stat_conn_features_ratio.columns if 'ratio' in num_col]

X = df_stat_conn_features_ratio[numeric_cols]
Y = df_stat_conn_features_ratio['diagnosis']

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)
# SimpleImputer discards columns that consist only of missing values. If that happened, trans.support_
# would no longer line up with numeric_cols when the selected columns are looked up afterwards.
assert X.shape[1] == len(numeric_cols), 'imputer dropped columns; support_ would not match numeric_cols'

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

# The installed BorutaPy apparently still refers to np.int / np.float / np.bool, the aliases of the
# Python builtins that were removed from NumPy in 1.24. Restore them as those builtins, and only
# when they are missing, so an attribute that NumPy itself provides is never overwritten.
for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias):
        setattr(np, alias, builtin_type)

# NOTE(review): BORUTA is fitted on every epoch of every sampled participant. If the selected features
# are later evaluated on held-out participants, confirm that selection is repeated inside the training folds.
clf = RandomForestClassifier(n_jobs=-1, max_depth=15, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=10, n_estimators=195)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	115
Tentative: 	148
Rejected: 	142
Iteration: 	9 / 100
Confirmed: 	115
Tentative: 	148
Rejected: 	142
Iteration: 	10 / 100
Confirmed: 	115
Tentative: 	148
Rejected: 	142
Iteration: 	11 / 100
Confirmed: 	115
Tentative: 	148
Rejected: 	142
Iteration: 	12 / 100
Confirmed: 	142
Tentative: 	121
Rejected: 	142
Iteration: 	13 / 100
Confirmed: 	142
Tentative: 	110
Rejected: 	153
Iteration: 	14 / 100
Confirmed: 	142
Tentative: 	110
Rejected: 	153
Iteration: 	15 / 100
Confirmed: 	142
Tentative: 	110
Rejected: 	153
Iteration: 	16 / 

In [19]:
# Translate the BORUTA result back to column names: trans.support_ indexes the
# feature columns that were passed to the selector (numeric_cols).
feature_columns = df_stat_conn_features_ratio[numeric_cols].columns
selected_columns = list(feature_columns[trans.support_])
# identifier / label columns first, followed by the selected features
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_stat_conn_feat_ratio = df_stat_conn_features_ratio.loc[:, standard_columns]
print(df_selected_stat_conn_feat_ratio.shape)
df_selected_stat_conn_feat_ratio.sample(5)

(2700, 185)


Unnamed: 0,ID,epoch,diagnosis,ratio_l_frontal_theta_mean,ratio_l_frontal_theta_median,ratio_l_frontal_beta_median,ratio_l_frontal_gamma_std,ratio_l_frontal_gamma_mean,ratio_l_frontal_gamma_median,ratio_m_frontal_theta_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
277,sub-87968901,2,SMC,1.809353,1.297778,2.18552,3.988424,6.329305,5.82688,2.070605,...,0.917071,1.011223,1.025837,1.013874,0.895589,0.904536,0.92522,1.018816,0.998344,0.998188
1537,sub-88046349,2,MDD,2.08458,0.658338,0.185791,0.080309,0.189506,0.167204,1.929247,...,0.964027,0.948596,0.939847,0.930337,0.948964,0.961293,0.973996,0.965902,0.960743,0.968686
2400,sub-88067945,1,OCD,1.78801,0.974302,1.290766,1.21101,2.384949,2.092951,1.613582,...,1.059051,1.059347,1.053278,1.071033,1.100112,1.063313,1.064923,1.054521,1.080874,1.043291
2016,sub-88057869,1,HEALTHY,1.533297,0.987835,1.9569,1.097599,1.976693,1.751531,1.394283,...,1.039702,1.101394,1.068633,1.088429,1.103761,1.051398,1.043351,1.066353,1.072289,1.059547
74,sub-87967057,3,SMC,2.388149,1.430944,1.401702,1.40306,2.272396,1.933186,2.400292,...,0.943752,0.898081,0.936956,0.962804,0.870618,0.921746,0.976167,0.941017,0.940644,0.978442


In [20]:
# Count how many of the selected columns are statistical features, i.e. columns whose
# name contains one of ['std', 'mean', 'median', 'skew', 'kurt'].
stat_tags = ('std', 'mean', 'median', 'skew', 'kurt')
stat_selected = list(
    filter(
        lambda column_name: any(tag in column_name for tag in stat_tags),
        df_selected_stat_conn_feat_ratio.columns,
    )
)
print(len(stat_selected))

79


### EC features

In [21]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# define numeric columns: the EC feature columns (stat + connectivity), recognised by 'EC' in their name
numeric_cols = [num_col for num_col in df_stat_conn_features_ec.columns if 'EC' in num_col]

X = df_stat_conn_features_ec[numeric_cols]
Y = df_stat_conn_features_ec['diagnosis']

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)
# SimpleImputer discards columns that consist only of missing values. If that happened, trans.support_
# would no longer line up with numeric_cols when the selected columns are looked up afterwards.
assert X.shape[1] == len(numeric_cols), 'imputer dropped columns; support_ would not match numeric_cols'

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

# The installed BorutaPy apparently still refers to np.int / np.float / np.bool, the aliases of the
# Python builtins that were removed from NumPy in 1.24. Restore them as those builtins, and only
# when they are missing, so an attribute that NumPy itself provides is never overwritten.
for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias):
        setattr(np, alias, builtin_type)

# NOTE(review): BORUTA is fitted on every epoch of every sampled participant. If the selected features
# are later evaluated on held-out participants, confirm that selection is repeated inside the training folds.
clf = RandomForestClassifier(n_jobs=-1, max_depth=20, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=10, n_estimators=195)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	308
Tentative: 	30
Rejected: 	67
Iteration: 	9 / 100
Confirmed: 	308
Tentative: 	30
Rejected: 	67
Iteration: 	10 / 100
Confirmed: 	308
Tentative: 	30
Rejected: 	67
Iteration: 	11 / 100
Confirmed: 	308
Tentative: 	30
Rejected: 	67
Iteration: 	12 / 100
Confirmed: 	310
Tentative: 	28
Rejected: 	67
Iteration: 	13 / 100
Confirmed: 	310
Tentative: 	28
Rejected: 	67
Iteration: 	14 / 100
Confirmed: 	310
Tentative: 	28
Rejected: 	67
Iteration: 	15 / 100
Confirmed: 	310
Tentative: 	24
Rejected: 	71
Iteration: 	16 / 100
Confirmed: 	

In [22]:
# Translate the BORUTA result back to column names: trans.support_ indexes the
# feature columns that were passed to the selector (numeric_cols).
feature_columns = df_stat_conn_features_ec[numeric_cols].columns
selected_columns = list(feature_columns[trans.support_])
# identifier / label columns first, followed by the selected features
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_stat_conn_feat_ec = df_stat_conn_features_ec.loc[:, standard_columns]
print(df_selected_stat_conn_feat_ec.shape)
df_selected_stat_conn_feat_ec.sample(5)

(2700, 322)


Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_delta_std,EC_l_frontal_delta_mean,EC_l_frontal_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_l_frontal_alpha_std,...,EC_gamma_m_central-r_central,EC_gamma_m_central-l_posterior,EC_gamma_m_central-m_posterior,EC_gamma_m_central-r_posterior,EC_gamma_r_central-l_posterior,EC_gamma_r_central-m_posterior,EC_gamma_r_central-r_posterior,EC_gamma_l_posterior-m_posterior,EC_gamma_l_posterior-r_posterior,EC_gamma_m_posterior-r_posterior
2018,sub-88057869,3,HEALTHY,0.065805,0.087256,0.087256,0.032833,0.059346,0.059346,0.031374,...,0.784119,0.757785,0.759586,0.75719,0.784141,0.791562,0.826607,0.824762,0.830226,0.831501
359,sub-87969665,12,SMC,0.029795,0.031681,0.031681,0.040106,0.056517,0.056517,0.069036,...,0.943236,0.892916,0.911694,0.892759,0.885239,0.909828,0.91053,0.90364,0.866558,0.896179
70,sub-87966789,11,SMC,0.029605,0.046889,0.046889,0.024943,0.043534,0.043534,0.063595,...,0.783991,0.714249,0.731389,0.705721,0.66614,0.707568,0.753655,0.758176,0.692077,0.756937
2216,sub-88064253,9,OCD,0.039437,0.051549,0.051549,0.022063,0.045536,0.045536,0.04783,...,0.907615,0.854666,0.882998,0.837875,0.833697,0.878451,0.860206,0.871731,0.806258,0.86997
1827,sub-88053677,4,OCD,0.039084,0.042453,0.042453,0.02998,0.048746,0.048746,0.043958,...,0.82033,0.780389,0.810172,0.773212,0.741344,0.786376,0.795513,0.839005,0.782971,0.844915


### EO features

In [23]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# define numeric columns: the EO feature columns (stat + connectivity), recognised by 'EO' in their name
numeric_cols = [num_col for num_col in df_stat_conn_features_eo.columns if 'EO' in num_col]

X = df_stat_conn_features_eo[numeric_cols]
Y = df_stat_conn_features_eo['diagnosis']

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)
# SimpleImputer discards columns that consist only of missing values. If that happened, trans.support_
# would no longer line up with numeric_cols when the selected columns are looked up afterwards.
assert X.shape[1] == len(numeric_cols), 'imputer dropped columns; support_ would not match numeric_cols'

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

# The installed BorutaPy apparently still refers to np.int / np.float / np.bool, the aliases of the
# Python builtins that were removed from NumPy in 1.24. Restore them as those builtins, and only
# when they are missing, so an attribute that NumPy itself provides is never overwritten.
for alias, builtin_type in (('int', int), ('float', float), ('bool', bool)):
    if not hasattr(np, alias):
        setattr(np, alias, builtin_type)

# NOTE(review): BORUTA is fitted on every epoch of every sampled participant. If the selected features
# are later evaluated on held-out participants, confirm that selection is repeated inside the training folds.
clf = RandomForestClassifier(n_jobs=-1, max_depth=20, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=10, n_estimators=190)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	283
Tentative: 	49
Rejected: 	73
Iteration: 	9 / 100
Confirmed: 	283
Tentative: 	49
Rejected: 	73
Iteration: 	10 / 100
Confirmed: 	283
Tentative: 	49
Rejected: 	73
Iteration: 	11 / 100
Confirmed: 	283
Tentative: 	49
Rejected: 	73
Iteration: 	12 / 100
Confirmed: 	295
Tentative: 	37
Rejected: 	73
Iteration: 	13 / 100
Confirmed: 	295
Tentative: 	37
Rejected: 	73
Iteration: 	14 / 100
Confirmed: 	295
Tentative: 	37
Rejected: 	73
Iteration: 	15 / 100
Confirmed: 	295
Tentative: 	33
Rejected: 	77
Iteration: 	16 / 100
Confirmed: 	

In [24]:
# Translate the BORUTA result back to column names: trans.support_ indexes the
# feature columns that were passed to the selector (numeric_cols).
feature_columns = df_stat_conn_features_eo[numeric_cols].columns
selected_columns = list(feature_columns[trans.support_])
# identifier / label columns first, followed by the selected features
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_stat_conn_feat_eo = df_stat_conn_features_eo.loc[:, standard_columns]
print(df_selected_stat_conn_feat_eo.shape)
df_selected_stat_conn_feat_eo.sample(5)

(2700, 304)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_theta_std,EO_l_frontal_theta_mean,EO_l_frontal_theta_median,EO_l_frontal_alpha_std,EO_l_frontal_alpha_mean,...,EO_gamma_m_central-r_central,EO_gamma_m_central-l_posterior,EO_gamma_m_central-m_posterior,EO_gamma_m_central-r_posterior,EO_gamma_r_central-l_posterior,EO_gamma_r_central-m_posterior,EO_gamma_r_central-r_posterior,EO_gamma_l_posterior-m_posterior,EO_gamma_l_posterior-r_posterior,EO_gamma_m_posterior-r_posterior
1053,sub-88020153,10,MDD,0.047093,0.047093,0.051526,0.071701,0.071701,0.04502,0.064309,...,0.738773,0.761486,0.768719,0.732534,0.680734,0.712608,0.786745,0.829309,0.763347,0.800853
1641,sub-88048729,10,HEALTHY,0.075992,0.075992,0.027999,0.058786,0.058786,0.031386,0.048063,...,0.817315,0.714306,0.749558,0.709733,0.707782,0.757056,0.774414,0.815801,0.771993,0.80957
533,sub-87971373,6,SMC,0.068209,0.068209,0.023388,0.049326,0.049326,0.073008,0.056803,...,0.715499,0.675497,0.728627,0.684925,0.593672,0.672614,0.729554,0.763374,0.667829,0.766798
841,sub-87982225,2,HEALTHY,0.105756,0.105756,0.03929,0.060664,0.060664,0.017109,0.031653,...,0.816292,0.816227,0.824331,0.811274,0.787168,0.801175,0.812497,0.857148,0.828976,0.844081
2021,sub-88057869,6,HEALTHY,0.095775,0.095775,0.033913,0.07026,0.07026,0.022717,0.034513,...,0.717423,0.679339,0.697567,0.690916,0.710698,0.740873,0.778434,0.759898,0.767092,0.772105


### Merge selected ratio, EC, EO features

In [25]:
# Merge the selected EC, EO and ratio features (stat + connectivity) into one frame.
# The 'ID', 'epoch', 'diagnosis' columns are kept from the EC frame only. The EO and ratio frames are
# NOT overwritten here (their ID columns are dropped on the fly), so this cell can be re-run safely.
id_columns = ['ID', 'epoch', 'diagnosis']

# pd.concat(axis=1) aligns rows on the index. The three frames come from separate merges, so verify
# that every index label refers to the same participant/epoch before the identifying columns are discarded.
for frame in (df_selected_stat_conn_feat_eo, df_selected_stat_conn_feat_ratio):
    assert frame[id_columns].equals(df_selected_stat_conn_feat_ec[id_columns]), 'selected feature frames are not row-aligned'

# Concatenate the dataframes
df_selected_stat_conn_features = pd.concat(
    [
        df_selected_stat_conn_feat_ec,
        df_selected_stat_conn_feat_eo.drop(columns=id_columns),
        df_selected_stat_conn_feat_ratio.drop(columns=id_columns),
    ],
    axis=1,
)
df_selected_stat_conn_features

Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_delta_std,EC_l_frontal_delta_mean,EC_l_frontal_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_l_frontal_alpha_std,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
0,sub-87964717,1,SMC,0.026398,0.038548,0.038548,0.038556,0.052609,0.052609,0.063854,...,0.983375,0.982205,0.973501,0.957588,0.965101,0.975819,0.977929,0.973242,0.955600,0.985688
1,sub-87964717,2,SMC,0.058279,0.064845,0.064845,0.030842,0.056860,0.056860,0.041251,...,0.951962,0.999252,0.989583,0.969516,0.930350,0.938956,0.968311,0.982378,0.955147,0.965178
2,sub-87964717,3,SMC,0.059938,0.060132,0.060132,0.032789,0.049954,0.049954,0.040715,...,1.007210,1.003786,0.997969,0.981911,1.021233,1.027465,1.020506,0.977939,0.966586,0.985563
3,sub-87964717,4,SMC,0.075938,0.068201,0.068201,0.061580,0.059407,0.059407,0.040781,...,0.987823,1.029448,1.023404,1.011507,0.995531,1.025771,1.032580,0.990293,0.982638,1.006213
4,sub-87964717,5,SMC,0.052394,0.060723,0.060723,0.035185,0.048441,0.048441,0.054241,...,1.042865,1.021432,1.011774,1.023296,1.010551,1.047893,1.041591,1.019603,1.005809,1.024247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,sub-88076717,8,OCD,0.039935,0.052290,0.052290,0.053702,0.070063,0.070063,0.041325,...,1.050150,1.093657,1.076960,1.060231,1.153361,1.122004,1.055451,1.082843,1.109466,1.075791
2696,sub-88076717,9,OCD,0.048182,0.068988,0.068988,0.041480,0.073598,0.073598,0.029228,...,1.095957,1.144254,1.111357,1.115784,1.178204,1.139658,1.100500,1.109036,1.142876,1.095822
2697,sub-88076717,10,OCD,0.086283,0.076952,0.076952,0.060294,0.072454,0.072454,0.026849,...,1.059378,1.114762,1.086334,1.146659,1.183257,1.136850,1.136010,1.085056,1.136426,1.107047
2698,sub-88076717,11,OCD,0.093845,0.087063,0.087063,0.045478,0.061230,0.061230,0.032789,...,1.048840,1.054833,1.083075,1.104262,1.037404,1.049401,1.063298,1.047424,1.064664,1.070277


In [26]:
# Store the merged selection of stat + connectivity features on disk.
selected_stat_conn_features_path = r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_conn_features.pkl'
df_selected_stat_conn_features.to_pickle(selected_stat_conn_features_path)