## AIM: reduce dimensionality by feature selection with BORUTA

A separate BORUTA run will be performed for each feature set (6 total):
1. statistical TFR features (for ratio, EC, and EO)
2. stat TFR features + connectivity features (for ratio, EC, and EO)

n_estimators & max_depth for the RFs in BORUTA were determined for each feature set in 'boruta_hyperparameter_tuning.ipynb', located in the 'background_notebooks' subdir

In [1]:
from boruta import BorutaPy
import pandas as pd
import numpy as np
import mne
import pickle
import os

from sklearn.ensemble import RandomForestClassifier

# 1. statistical TFR features

In [2]:
# Load the statistical TFR features, store age as a nullable integer rounded to the
# nearest year, and discard rows that have no diagnosis label.
df_features = (
    pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_features.pkl')
    .assign(age=lambda frame: frame['age'].round().astype('Int32'))
    .dropna(subset=['diagnosis'])
)
df_features.sample(7)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,...,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew,ratio_r_posterior_gamma_kurt
7273,sub-88071949,60,1.0,MDD,2,0.16868,0.2,0.2,3.116735,11.511835,...,0.311999,1.04731,1.000477,1.088472,1.791012,0.414166,1.068323,0.953492,1.582792,2.383129
3752,sub-88039325,48,1.0,MDD,9,0.186247,0.2,0.2,2.052963,4.232184,...,0.425491,1.081598,1.007331,1.667232,4.06952,0.317394,1.055687,1.035326,1.099747,2.31843
5765,sub-88057869,28,0.0,HEALTHY,6,0.121956,0.2,0.2,1.208803,1.986963,...,0.302155,1.040452,1.01324,0.385636,-0.392551,0.311762,1.043178,1.035781,0.498187,-0.031483
3099,sub-88029833,46,1.0,MDD,4,0.18203,0.2,0.2,2.785748,10.101949,...,0.457252,1.064678,0.96073,1.12741,1.623219,0.411469,1.059645,1.007971,0.769105,0.21251
2871,sub-88027129,63,0.0,MDD,4,0.127204,0.2,0.2,0.701024,-0.313077,...,0.307858,1.050595,0.990633,1.001896,1.480925,0.335147,1.056289,1.006957,0.924052,1.017339
7143,sub-88070285,61,0.0,MDD,4,0.163312,0.2,0.2,1.471972,1.770233,...,0.653432,1.160359,1.057652,1.637996,3.830987,0.567746,1.128301,1.050633,1.348286,2.713012
3240,sub-88032753,13,1.0,ADHD,1,0.131222,0.2,0.2,1.093803,1.323133,...,0.490617,1.118036,0.985654,1.292537,1.350926,0.496779,1.117683,1.004234,1.280876,2.074884


The dataset is subsampled here to rebalance the diagnosis groups. It might have been more efficient to do this before feature extraction; however, the features of the full sample were already extracted, so they do not have to be recomputed if this subsampling step ever changes. Additionally, for a still unknown reason, some participants' data seem to be lost during feature extraction, possibly due to the files marked 'BAD' during preprocessing.

In [3]:
# Rebalance the dataset by drawing the same number of participants for every diagnosis.
# Sampling is done on the epoch-1 rows only, so participants are drawn rather than epochs.
df_ids = df_features.loc[df_features['epoch'].eq(1)]
# 45 participants per diagnosis (the maximum available for OCD); fixed seed for reproducibility
df_ids_subsample = df_ids.groupby('diagnosis').sample(n=45, random_state=42)
df_ids_subsample_index = df_ids_subsample['ID'].to_list()
# keep every epoch of the sampled participants
df_stat_subsample = df_features.loc[df_features['ID'].isin(df_ids_subsample_index)]
df_stat_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [4]:
# Create 3 feature sets [EC, EO, ratio] from the subsampled statistical features.
# Each set is obtained by removing the columns prefixed with the two *other* conditions,
# so the shared columns ['ID', 'age', 'gender', 'diagnosis', 'epoch'] stay in all three.
# The mask and the column index it filters are both taken from df_stat_subsample
# (previously the mask was built on df_stat_subsample but applied to df_features.columns,
# which only worked because both frames happen to share the same columns).
stat_columns = df_stat_subsample.columns
df_features_ec = df_stat_subsample[stat_columns[~stat_columns.str.startswith(('EO', 'ratio'))]]
df_features_eo = df_stat_subsample[stat_columns[~stat_columns.str.startswith(('EC', 'ratio'))]]
df_features_ratio = df_stat_subsample[stat_columns[~stat_columns.str.startswith(('EC', 'EO'))]]

### Ratio features

In [5]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Boruta feature selection on the statistical ratio features.
# NOTE(review): the selector is fit on all sampled epochs; if a held-out test set is
# evaluated later, confirm that it was not part of this fit.

# feature columns = every column whose name contains 'ratio'
numeric_cols = [column for column in df_features_ratio.columns if 'ratio' in column]

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(df_features_ratio[numeric_cols])

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_features_ratio['diagnosis'])

# Re-create the np.int / np.float / np.bool names; per the author's note this avoids an
# error message caused by installing the Boruta package via conda instead of pip.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# random forest wrapped by Boruta; n_estimators is set through BorutaPy
clf = RandomForestClassifier(max_depth=5, class_weight='balanced', n_jobs=-1)
trans = BorutaPy(estimator=clf, n_estimators=135, max_iter=100, verbose=3, random_state=42)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	44
Tentative: 	75
Rejected: 	106
Iteration: 	9 / 100
Confirmed: 	44
Tentative: 	75
Rejected: 	106
Iteration: 	10 / 100
Confirmed: 	44
Tentative: 	75
Rejected: 	106
Iteration: 	11 / 100
Confirmed: 	44
Tentative: 	75
Rejected: 	106
Iteration: 	12 / 100
Confirmed: 	50
Tentative: 	56
Rejected: 	119
Iteration: 	13 / 100
Confirmed: 	50
Tentative: 	56
Rejected: 	119
Iteration: 	14 / 100
Confirmed: 	50
Tentative: 	56
Rejected: 	119
Iteration: 	15 / 100
Confirmed: 	50
Tentative: 	56
Rejected: 	119
Iteration: 	16 / 100
Confirmed: 	

In [6]:
# Names of the ratio features confirmed by Boruta (trans.support_ is the boolean mask
# over numeric_cols), placed after the identifying columns.
selected_columns = pd.Index(numeric_cols)[trans.support_].tolist()
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_feat_ratio = df_features_ratio.loc[:, standard_columns]
print(df_selected_feat_ratio.shape)
df_selected_feat_ratio.sample(5)

(2700, 66)


Unnamed: 0,ID,epoch,diagnosis,ratio_r_frontal_delta_mean,ratio_m_central_delta_mean,ratio_r_posterior_delta_mean,ratio_r_frontal_theta_median,ratio_r_central_theta_mean,ratio_r_posterior_theta_mean,ratio_l_frontal_alpha_std,...,ratio_r_central_gamma_mean,ratio_l_posterior_gamma_std,ratio_l_posterior_gamma_mean,ratio_l_posterior_gamma_median,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median
3903,sub-88041261,4,MDD,2.003224,2.119469,1.532933,0.88078,1.386825,1.271055,1.981035,...,1.079731,0.313765,1.047906,0.970155,0.348967,1.055056,0.958547,0.465183,1.085805,0.953161
1324,sub-87980913,5,HEALTHY,1.450606,1.312201,1.455224,1.075115,1.265228,1.199601,1.302651,...,1.1065,0.363248,1.062128,0.967569,0.469983,1.074639,0.977212,0.46524,1.054431,0.924736
61,sub-87966337,2,SMC,1.786957,1.56075,1.426331,0.990533,1.210854,1.251926,0.899747,...,2.22792,0.384504,1.09577,1.075511,0.411858,1.120291,1.096891,0.713548,1.28876,1.132063
997,sub-87971241,2,SMC,1.954192,2.360694,1.455691,1.013099,1.672661,1.247597,0.685097,...,1.081375,0.616229,1.138961,0.974003,0.511144,1.069508,0.933093,0.441337,1.053814,0.977888
5563,sub-88054937,8,OCD,1.718662,1.578264,1.729553,1.454259,1.839202,1.211841,1.440488,...,1.088304,0.309989,1.054799,1.061732,0.326782,1.058263,1.005193,0.323157,1.070157,1.007133


### EC features

In [7]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Boruta feature selection on the statistical EC features.

# feature columns = every column whose name contains 'EC'
numeric_cols = [column for column in df_features_ec.columns if 'EC' in column]

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(df_features_ec[numeric_cols])

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_features_ec['diagnosis'])

# Re-create the np.int / np.float / np.bool names; per the author's note this avoids an
# error message caused by installing the Boruta package via conda instead of pip.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# random forest wrapped by Boruta; n_estimators is set through BorutaPy
clf = RandomForestClassifier(max_depth=10, class_weight='balanced', n_jobs=-1)
trans = BorutaPy(estimator=clf, n_estimators=195, max_iter=100, verbose=10, random_state=42)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	9 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	10 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	11 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	12 / 100
Confirmed: 	25
Tentative: 	27
Rejected: 	173
Iteration: 	13 / 100
Confirmed: 	25
Tentative: 	27
Rejected: 	173
Iteration: 	14 / 100
Confirmed: 	25
Tentative: 	27
Rejected: 	173
Iteration: 	15 / 100
Confirmed: 	25
Tentative: 	27
Rejected: 	173
Iteration: 	16 / 100
Confirmed: 	

In [8]:
# Names of the EC features confirmed by Boruta (trans.support_ is the boolean mask
# over numeric_cols), placed after the identifying columns.
selected_columns = pd.Index(numeric_cols)[trans.support_].tolist()
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_feat_ec = df_features_ec.loc[:, standard_columns]
print(df_selected_feat_ec.shape)
df_selected_feat_ec.sample(5)

(2700, 35)


Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_theta_std,EC_r_frontal_theta_std,EC_r_central_theta_std,EC_m_posterior_theta_std,EC_l_frontal_alpha_std,EC_m_frontal_alpha_std,EC_r_frontal_alpha_std,...,EC_l_central_gamma_skew,EC_m_central_gamma_std,EC_m_central_gamma_skew,EC_r_central_gamma_std,EC_r_central_gamma_skew,EC_l_posterior_gamma_std,EC_l_posterior_gamma_skew,EC_m_posterior_gamma_std,EC_r_posterior_gamma_std,EC_r_posterior_gamma_skew
6948,sub-88068981,1,OCD,0.108951,0.089886,0.095733,0.124795,0.137705,0.132616,0.131015,...,0.655195,0.052877,0.427583,0.055133,0.156472,0.063522,0.753148,0.157907,0.060459,0.209123
2566,sub-88024697,11,MDD,0.123111,0.159501,0.104905,0.101477,0.111448,0.126182,0.118773,...,-0.04415,0.051325,-0.022581,0.048027,-0.033781,0.05724,0.01962,0.054745,0.052163,-0.065585
7042,sub-88069649,11,MDD,0.139336,0.154673,0.187674,0.127402,0.175334,0.154354,0.154633,...,0.756983,0.07607,0.733246,0.081648,0.805561,0.07742,0.810562,0.076596,0.079092,0.846677
1872,sub-88013761,1,OCD,0.091943,0.108748,0.110516,0.091062,0.107061,0.121067,0.121116,...,0.619862,0.061948,0.390598,0.062493,0.383662,0.06298,0.274757,0.064289,0.066039,0.331973
388,sub-87967869,5,SMC,0.127497,0.137819,0.149251,0.144283,0.125154,0.127311,0.135406,...,0.329537,0.049787,0.500112,0.04958,0.408481,0.049941,0.323892,0.051879,0.055741,0.012808


### EO features

In [9]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Boruta feature selection on the statistical EO features.

# feature columns = every column whose name contains 'EO'
numeric_cols = [column for column in df_features_eo.columns if 'EO' in column]

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(df_features_eo[numeric_cols])

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_features_eo['diagnosis'])

# Re-create the np.int / np.float / np.bool names; per the author's note this avoids an
# error message caused by installing the Boruta package via conda instead of pip.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# random forest wrapped by Boruta; n_estimators is set through BorutaPy
clf = RandomForestClassifier(max_depth=10, class_weight='balanced', n_jobs=-1)
trans = BorutaPy(estimator=clf, n_estimators=195, max_iter=100, verbose=10, random_state=42)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	9 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	10 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	11 / 100
Confirmed: 	23
Tentative: 	35
Rejected: 	167
Iteration: 	12 / 100
Confirmed: 	23
Tentative: 	22
Rejected: 	180
Iteration: 	13 / 100
Confirmed: 	23
Tentative: 	22
Rejected: 	180
Iteration: 	14 / 100
Confirmed: 	23
Tentative: 	22
Rejected: 	180
Iteration: 	15 / 100
Confirmed: 	23
Tentative: 	22
Rejected: 	180
Iteration: 	16 / 100
Confirmed: 	

In [10]:
# Names of the EO features confirmed by Boruta (trans.support_ is the boolean mask
# over numeric_cols), placed after the identifying columns.
selected_columns = pd.Index(numeric_cols)[trans.support_].tolist()
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_feat_eo = df_features_eo.loc[:, standard_columns]
print(df_selected_feat_eo.shape)
df_selected_feat_eo.sample(5)

(2700, 29)


Unnamed: 0,ID,epoch,diagnosis,EO_l_central_theta_std,EO_r_central_theta_std,EO_l_frontal_alpha_std,EO_m_frontal_alpha_std,EO_r_frontal_alpha_std,EO_l_central_alpha_std,EO_m_central_alpha_std,...,EO_m_frontal_gamma_std,EO_r_frontal_gamma_std,EO_l_central_gamma_std,EO_m_central_gamma_std,EO_r_central_gamma_std,EO_l_posterior_gamma_std,EO_m_posterior_gamma_std,EO_m_posterior_gamma_skew,EO_r_posterior_gamma_std,EO_r_posterior_gamma_skew
952,sub-87971021,5,SMC,0.145748,0.137303,0.190433,0.225691,0.204266,0.132179,0.152887,...,0.123443,0.088551,0.098761,0.101528,0.077844,0.057044,0.053514,0.397197,0.051488,0.167537
178,sub-87966789,11,SMC,0.105888,0.123861,0.117108,0.119604,0.122632,0.111361,0.127422,...,0.051472,0.065147,0.044687,0.044296,0.051516,0.098009,0.071494,1.589827,0.054754,0.620328
4855,sub-88048729,8,HEALTHY,0.08539,0.088237,0.15601,0.13783,0.110616,0.139208,0.152762,...,0.060073,0.054395,0.070285,0.06549,0.060263,0.059224,0.057233,0.607032,0.05698,0.3837
5416,sub-88053677,5,OCD,0.143328,0.154357,0.285872,0.25905,0.24844,0.254546,0.268179,...,0.347029,0.518007,0.257568,0.344079,0.480013,0.367662,0.358712,3.145637,0.375996,3.171278
6464,sub-88064837,9,MDD,0.129625,0.093266,0.256212,0.230666,0.206458,0.25536,0.223412,...,0.071372,0.083213,0.069349,0.064185,0.068807,0.072058,0.070959,1.463925,0.081098,1.761482


### Merge selected ratio, EC, EO features

In [11]:
# Combine the selected EC, EO and ratio features into one frame.
# The EC frame keeps 'ID', 'epoch', 'diagnosis'; they are removed from the EO and ratio
# frames so the identifying columns appear only once after the column-wise concat.
# errors='ignore' keeps this cell re-runnable: the frames are reassigned here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
df_selected_feat_eo = df_selected_feat_eo.drop(columns=['ID', 'epoch', 'diagnosis'], errors='ignore')
df_selected_feat_ratio = df_selected_feat_ratio.drop(columns=['ID', 'epoch', 'diagnosis'], errors='ignore')

# Concatenate the dataframes side by side (rows are aligned on the shared index)
df_selected_features = pd.concat([df_selected_feat_ec, df_selected_feat_eo, df_selected_feat_ratio], axis=1)
df_selected_features

Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_theta_std,EC_r_frontal_theta_std,EC_r_central_theta_std,EC_m_posterior_theta_std,EC_l_frontal_alpha_std,EC_m_frontal_alpha_std,EC_r_frontal_alpha_std,...,ratio_r_central_gamma_mean,ratio_l_posterior_gamma_std,ratio_l_posterior_gamma_mean,ratio_l_posterior_gamma_median,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median
24,sub-87964717,1,SMC,0.146601,0.134083,0.126677,0.088821,0.133251,0.136564,0.126442,...,1.087600,0.483367,1.070833,0.945090,0.404591,1.052324,0.958792,0.493553,1.086141,0.973209
25,sub-87964717,2,SMC,0.103510,0.113882,0.113146,0.093144,0.120214,0.108708,0.122549,...,1.057341,0.496462,1.094712,1.012856,0.549874,1.108695,0.950054,0.581599,1.117381,0.948814
26,sub-87964717,3,SMC,0.126494,0.153072,0.176219,0.122526,0.105422,0.108432,0.145150,...,1.118931,0.422118,1.080215,1.006534,0.504356,1.089061,1.006077,0.541452,1.098838,0.973932
27,sub-87964717,4,SMC,0.199366,0.190631,0.161905,0.155581,0.128919,0.106647,0.107669,...,1.240129,0.609446,1.253339,1.197622,0.475679,1.146048,1.109712,0.460338,1.128810,1.110328
28,sub-87964717,5,SMC,0.145020,0.181982,0.153659,0.130231,0.142192,0.167562,0.161236,...,1.125797,0.702789,1.121589,0.921769,0.640190,1.100259,0.930693,0.628968,1.105704,0.952507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7807,sub-88076717,8,OCD,0.148191,0.173885,0.203578,0.131782,0.124016,0.134765,0.140447,...,1.106774,0.430855,1.075240,0.965426,0.440777,1.082105,0.990665,0.407258,1.083663,0.996010
7808,sub-88076717,9,OCD,0.111555,0.139117,0.116225,0.116095,0.104099,0.111799,0.118723,...,1.054210,0.320705,1.039691,0.993901,0.292085,1.034767,1.005944,0.313329,1.050033,1.023995
7809,sub-88076717,10,OCD,0.159637,0.184458,0.188101,0.169570,0.107770,0.121271,0.135420,...,1.086880,0.394367,1.069753,1.013555,0.430236,1.088069,1.028141,0.421369,1.095421,1.021315
7810,sub-88076717,11,OCD,0.139784,0.164089,0.166881,0.150072,0.125906,0.128966,0.142744,...,1.123323,0.667891,1.143056,1.000576,0.419430,1.076165,1.001551,0.448825,1.085113,1.031082


In [12]:
# Persist the Boruta-selected statistical features for later use.
df_selected_features.to_pickle(
    path=r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_features.pkl'
)

# 2. statistical TFR + connectivity features

In [13]:
# Load the connectivity features and discard rows that have no diagnosis label.
df_conn_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_connectivity_features.pkl')
# Fix: the filter used to be applied to df_features (already filtered when it was loaded),
# which left the freshly loaded connectivity frame unfiltered.
df_conn_features = df_conn_features.dropna(subset=['diagnosis'])

In [14]:
# Rebalance the connectivity features with the very same participant IDs that were
# sampled for the statistical features, then show the number of rows per diagnosis.
df_conn_subsample = df_conn_features.loc[df_conn_features['ID'].isin(df_ids_subsample_index)]
df_conn_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [15]:
# Join statistical and connectivity features per participant and epoch and store the
# result for later training of GCNs. 'diagnosis' is taken from the statistical frame
# only; 'age' and 'gender' are not kept in the stored frame.
df_stat_conn_features = (
    df_stat_subsample
    .merge(df_conn_subsample.drop(columns=['diagnosis']), how='outer', on=['ID', 'epoch'])
    .drop(columns=['age', 'gender'])
)
df_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_conn_features.pkl')
print(df_stat_conn_features.shape)
df_stat_conn_features.sample(3)

(2700, 1218)


Unnamed: 0,ID,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,EO_m_frontal_delta_std,EO_m_frontal_delta_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
158,sub-87967781,SMC,3,0.184909,0.2,0.2,2.17146,5.753834,0.136602,0.2,...,0.978163,1.049595,1.044583,0.995937,1.050426,1.047645,1.015708,1.007729,1.011348,1.029767
1590,sub-88047789,MDD,7,0.150474,0.2,0.2,1.226924,2.115696,0.126341,0.2,...,1.031051,1.052111,1.04347,1.070464,1.05753,1.046055,1.066234,1.048837,1.082087,1.069819
1525,sub-88045809,MDD,2,0.116537,0.2,0.2,0.784385,0.22385,0.115017,0.2,...,1.046713,1.06035,1.061318,1.064623,1.066857,1.041233,0.999901,1.069585,1.10077,1.076392


In [16]:
# Create 3 connectivity feature sets [EC, EO, ratio] from the subsampled connectivity frame.
# Each set is obtained by removing the columns prefixed with the two *other* conditions,
# so every column without an 'EC'/'EO'/'ratio' prefix (e.g. 'ID', 'epoch', 'diagnosis')
# stays in all three sets.
# The mask and the column index it filters are both taken from df_conn_subsample
# (previously the mask was built on df_conn_subsample but applied to
# df_conn_features.columns, which only worked because both frames share the same columns).
conn_columns = df_conn_subsample.columns
df_conn_features_ec = df_conn_subsample[conn_columns[~conn_columns.str.startswith(('EO', 'ratio'))]]
df_conn_features_eo = df_conn_subsample[conn_columns[~conn_columns.str.startswith(('EC', 'ratio'))]]
df_conn_features_ratio = df_conn_subsample[conn_columns[~conn_columns.str.startswith(('EC', 'EO'))]]

In [17]:
# Join the connectivity features onto the statistical features for each condition,
# matching rows on participant ID and epoch; 'diagnosis' comes from the statistical frame.
df_stat_conn_features_ec = df_features_ec.merge(
    df_conn_features_ec.drop(columns=['diagnosis']), how='outer', on=['ID', 'epoch']
)
df_stat_conn_features_eo = df_features_eo.merge(
    df_conn_features_eo.drop(columns=['diagnosis']), how='outer', on=['ID', 'epoch']
)
df_stat_conn_features_ratio = df_features_ratio.merge(
    df_conn_features_ratio.drop(columns=['diagnosis']), how='outer', on=['ID', 'epoch']
)
print(df_stat_conn_features_ec.shape)



(2700, 410)


### Ratio features

In [18]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Boruta feature selection on the statistical + connectivity ratio features.

# feature columns = every column whose name contains 'ratio'
numeric_cols = [column for column in df_stat_conn_features_ratio.columns if 'ratio' in column]

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(df_stat_conn_features_ratio[numeric_cols])

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_stat_conn_features_ratio['diagnosis'])

# Re-create the np.int / np.float / np.bool names; per the author's note this avoids an
# error message caused by installing the Boruta package via conda instead of pip.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# random forest wrapped by Boruta; n_estimators is set through BorutaPy
clf = RandomForestClassifier(max_depth=10, class_weight='balanced', n_jobs=-1)
trans = BorutaPy(estimator=clf, n_estimators=195, max_iter=100, verbose=10, random_state=42)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	127
Tentative: 	138
Rejected: 	140
Iteration: 	9 / 100
Confirmed: 	127
Tentative: 	138
Rejected: 	140
Iteration: 	10 / 100
Confirmed: 	127
Tentative: 	138
Rejected: 	140
Iteration: 	11 / 100
Confirmed: 	127
Tentative: 	138
Rejected: 	140
Iteration: 	12 / 100
Confirmed: 	141
Tentative: 	99
Rejected: 	165
Iteration: 	13 / 100
Confirmed: 	141
Tentative: 	99
Rejected: 	165
Iteration: 	14 / 100
Confirmed: 	141
Tentative: 	99
Rejected: 	165
Iteration: 	15 / 100
Confirmed: 	141
Tentative: 	99
Rejected: 	165
Iteration: 	16 / 100


In [19]:
# Names of the stat + connectivity ratio features confirmed by Boruta (trans.support_ is
# the boolean mask over numeric_cols), placed after the identifying columns.
selected_columns = pd.Index(numeric_cols)[trans.support_].tolist()
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_stat_conn_feat_ratio = df_stat_conn_features_ratio.loc[:, standard_columns]
print(df_selected_stat_conn_feat_ratio.shape)
df_selected_stat_conn_feat_ratio.sample(5)

(2700, 163)


Unnamed: 0,ID,epoch,diagnosis,ratio_l_frontal_alpha_mean,ratio_m_frontal_alpha_std,ratio_m_frontal_alpha_mean,ratio_r_frontal_alpha_std,ratio_r_frontal_alpha_mean,ratio_l_central_alpha_std,ratio_l_central_alpha_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
1352,sub-88039057,9,MDD,1.213141,1.327848,1.256289,1.193689,1.273507,0.888743,1.195313,...,0.991579,0.91628,0.94246,0.969335,0.893161,0.933583,0.933522,0.942397,0.920569,0.944964
653,sub-87976457,6,HEALTHY,1.312502,0.832912,1.101114,1.020761,1.230748,1.546617,1.489874,...,1.076991,1.119261,1.113363,1.134421,1.161495,1.134454,1.128749,1.045884,1.116939,1.087178
2420,sub-88068305,9,OCD,1.735111,2.19184,1.823405,1.621107,1.564413,2.967017,1.762768,...,1.0607,1.016922,1.017605,1.031636,1.055644,1.052823,1.024429,1.003283,1.013044,1.014297
462,sub-87970881,7,SMC,1.681079,3.013073,1.783206,2.367164,1.519516,2.335599,1.63241,...,1.029815,0.969744,0.953663,0.940683,1.01332,1.018825,1.012468,0.977051,0.958037,0.96487
566,sub-87974665,3,HEALTHY,1.263135,1.265745,1.300396,1.027637,1.298083,1.672523,1.45108,...,0.991367,0.987678,0.973874,0.971294,1.057333,1.049983,1.043067,0.999315,1.031435,1.06139


In [20]:
# Count how many of the selected columns are statistical features, i.e. have a name that
# contains one of 'std', 'mean', 'median', 'skew' or 'kurt'.
stat_selected = list(filter(
    lambda col: any(marker in col for marker in ['std', 'mean', 'median', 'skew', 'kurt']),
    df_selected_stat_conn_feat_ratio.columns,
))
print(len(stat_selected))

53


### EC features

In [21]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Boruta feature selection on the statistical + connectivity EC features.

# feature columns = every column whose name contains 'EC'
numeric_cols = [column for column in df_stat_conn_features_ec.columns if 'EC' in column]

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(df_stat_conn_features_ec[numeric_cols])

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_stat_conn_features_ec['diagnosis'])

# Re-create the np.int / np.float / np.bool names; per the author's note this avoids an
# error message caused by installing the Boruta package via conda instead of pip.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# random forest wrapped by Boruta; n_estimators is set through BorutaPy
clf = RandomForestClassifier(max_depth=15, class_weight='balanced', n_jobs=-1)
trans = BorutaPy(estimator=clf, n_estimators=175, max_iter=100, verbose=10, random_state=42)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	194
Tentative: 	53
Rejected: 	158
Iteration: 	9 / 100
Confirmed: 	194
Tentative: 	53
Rejected: 	158
Iteration: 	10 / 100
Confirmed: 	194
Tentative: 	53
Rejected: 	158
Iteration: 	11 / 100
Confirmed: 	194
Tentative: 	53
Rejected: 	158
Iteration: 	12 / 100
Confirmed: 	196
Tentative: 	51
Rejected: 	158
Iteration: 	13 / 100
Confirmed: 	196
Tentative: 	51
Rejected: 	158
Iteration: 	14 / 100
Confirmed: 	196
Tentative: 	43
Rejected: 	166
Iteration: 	15 / 100
Confirmed: 	196
Tentative: 	43
Rejected: 	166
Iteration: 	16 / 100
Conf

In [22]:
# Names of the stat + connectivity EC features confirmed by Boruta (trans.support_ is
# the boolean mask over numeric_cols), placed after the identifying columns.
selected_columns = pd.Index(numeric_cols)[trans.support_].tolist()
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_stat_conn_feat_ec = df_stat_conn_features_ec.loc[:, standard_columns]
print(df_selected_stat_conn_feat_ec.shape)
df_selected_stat_conn_feat_ec.sample(5)

(2700, 220)


Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_theta_std,EC_m_frontal_theta_std,EC_r_frontal_theta_std,EC_l_central_theta_std,EC_r_central_theta_std,EC_l_posterior_theta_std,EC_m_posterior_theta_std,...,EC_gamma_m_central-r_central,EC_gamma_m_central-l_posterior,EC_gamma_m_central-m_posterior,EC_gamma_m_central-r_posterior,EC_gamma_r_central-l_posterior,EC_gamma_r_central-m_posterior,EC_gamma_r_central-r_posterior,EC_gamma_l_posterior-m_posterior,EC_gamma_l_posterior-r_posterior,EC_gamma_m_posterior-r_posterior
536,sub-87971373,9,SMC,0.114785,0.114865,0.103978,0.10714,0.124846,0.105897,0.092273,...,0.749505,0.663992,0.716134,0.674772,0.580052,0.664514,0.745458,0.759964,0.633281,0.737054
587,sub-87974709,12,HEALTHY,0.108914,0.133384,0.136674,0.106039,0.144736,0.111079,0.119342,...,0.895765,0.861933,0.869849,0.863044,0.854949,0.867756,0.879852,0.900204,0.874141,0.896521
57,sub-87966517,10,SMC,0.106868,0.09747,0.09138,0.098085,0.085251,0.119849,0.098861,...,0.830535,0.781098,0.799996,0.767622,0.747995,0.790709,0.814956,0.829541,0.782852,0.843145
1622,sub-88048549,3,OCD,0.094043,0.097753,0.098636,0.096663,0.092537,0.093024,0.093737,...,0.775829,0.706324,0.747617,0.700154,0.692422,0.757365,0.786716,0.811664,0.732162,0.799504
2593,sub-88073885,2,OCD,0.117982,0.107039,0.096011,0.104052,0.116916,0.109236,0.118895,...,0.777775,0.688952,0.722259,0.694607,0.664303,0.717218,0.750099,0.776573,0.735147,0.778172


### EO features

In [23]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Boruta feature selection on the statistical + connectivity EO features.

# feature columns = every column whose name contains 'EO'
numeric_cols = [column for column in df_stat_conn_features_eo.columns if 'EO' in column]

# replace missing feature values by the column mean
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(df_stat_conn_features_eo[numeric_cols])

# encode the diagnosis labels as integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_stat_conn_features_eo['diagnosis'])

# Re-create the np.int / np.float / np.bool names; per the author's note this avoids an
# error message caused by installing the Boruta package via conda instead of pip.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

# random forest wrapped by Boruta; n_estimators is set through BorutaPy
clf = RandomForestClassifier(max_depth=20, class_weight='balanced', n_jobs=-1)
trans = BorutaPy(estimator=clf, n_estimators=185, max_iter=100, verbose=10, random_state=42)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	186
Tentative: 	48
Rejected: 	171
Iteration: 	9 / 100
Confirmed: 	186
Tentative: 	48
Rejected: 	171
Iteration: 	10 / 100
Confirmed: 	186
Tentative: 	48
Rejected: 	171
Iteration: 	11 / 100
Confirmed: 	186
Tentative: 	48
Rejected: 	171
Iteration: 	12 / 100
Confirmed: 	191
Tentative: 	43
Rejected: 	171
Iteration: 	13 / 100
Confirmed: 	191
Tentative: 	43
Rejected: 	171
Iteration: 	14 / 100
Confirmed: 	191
Tentative: 	43
Rejected: 	171
Iteration: 	15 / 100
Confirmed: 	191
Tentative: 	40
Rejected: 	174
Iteration: 	16 / 100
Conf

In [24]:
# Names of the stat + connectivity EO features confirmed by Boruta (trans.support_ is
# the boolean mask over numeric_cols), placed after the identifying columns.
selected_columns = pd.Index(numeric_cols)[trans.support_].tolist()
standard_columns = ['ID', 'epoch', 'diagnosis'] + selected_columns
df_selected_stat_conn_feat_eo = df_stat_conn_features_eo.loc[:, standard_columns]
print(df_selected_stat_conn_feat_eo.shape)
df_selected_stat_conn_feat_eo.sample(5)

(2700, 208)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_alpha_std,EO_m_frontal_alpha_std,EO_r_frontal_alpha_std,EO_l_central_alpha_std,EO_m_central_alpha_std,EO_r_central_alpha_std,EO_l_posterior_alpha_std,...,EO_gamma_m_central-r_central,EO_gamma_m_central-l_posterior,EO_gamma_m_central-m_posterior,EO_gamma_m_central-r_posterior,EO_gamma_r_central-l_posterior,EO_gamma_r_central-m_posterior,EO_gamma_r_central-r_posterior,EO_gamma_l_posterior-m_posterior,EO_gamma_l_posterior-r_posterior,EO_gamma_m_posterior-r_posterior
1062,sub-88022001,7,MDD,0.170621,0.194335,0.164815,0.123891,0.13972,0.150749,0.16928,...,0.90235,0.882493,0.882733,0.878334,0.876694,0.884839,0.899373,0.900934,0.88794,0.897253
873,sub-88000313,10,MDD,0.152858,0.141632,0.140625,0.172173,0.136408,0.155713,0.207446,...,0.75583,0.685375,0.716817,0.685904,0.650058,0.704888,0.732322,0.764316,0.703978,0.76596
2031,sub-88058001,4,HEALTHY,0.105697,0.147422,0.171927,0.114545,0.126812,0.136839,0.132618,...,0.79567,0.724958,0.750816,0.722474,0.671388,0.716272,0.732355,0.80949,0.752011,0.798749
1119,sub-88024697,4,MDD,0.091067,0.089897,0.085179,0.076782,0.088303,0.099702,0.157118,...,0.854742,0.809633,0.798962,0.805232,0.783208,0.777696,0.809034,0.857499,0.825579,0.837621
965,sub-88016105,6,MDD,0.139119,0.137748,0.144738,0.144742,0.129675,0.128514,0.133519,...,0.776565,0.70795,0.739443,0.735176,0.661362,0.715445,0.757211,0.787885,0.740052,0.784436


### Merge selected ratio, EC, EO features

In [25]:
# Combine the selected stat + connectivity EC, EO and ratio features into one frame.
# The EC frame keeps 'ID', 'epoch', 'diagnosis'; they are removed from the EO and ratio
# frames so the identifying columns appear only once after the column-wise concat.
# errors='ignore' keeps this cell re-runnable: the frames are reassigned here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
df_selected_stat_conn_feat_eo = df_selected_stat_conn_feat_eo.drop(columns=['ID', 'epoch', 'diagnosis'], errors='ignore')
df_selected_stat_conn_feat_ratio = df_selected_stat_conn_feat_ratio.drop(columns=['ID', 'epoch', 'diagnosis'], errors='ignore')

# Concatenate the dataframes side by side (rows are aligned on the shared index)
df_selected_stat_conn_features = pd.concat([df_selected_stat_conn_feat_ec, df_selected_stat_conn_feat_eo, df_selected_stat_conn_feat_ratio], axis=1)
df_selected_stat_conn_features

Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_theta_std,EC_m_frontal_theta_std,EC_r_frontal_theta_std,EC_l_central_theta_std,EC_r_central_theta_std,EC_l_posterior_theta_std,EC_m_posterior_theta_std,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
0,sub-87964717,1,SMC,0.146601,0.140457,0.134083,0.111772,0.126677,0.097456,0.088821,...,0.983375,0.982205,0.973501,0.957588,0.965101,0.975819,0.977929,0.973242,0.955600,0.985688
1,sub-87964717,2,SMC,0.103510,0.106254,0.113882,0.104289,0.113146,0.089767,0.093144,...,0.951962,0.999252,0.989583,0.969516,0.930350,0.938956,0.968311,0.982378,0.955147,0.965178
2,sub-87964717,3,SMC,0.126494,0.129298,0.153072,0.116544,0.176219,0.101275,0.122526,...,1.007210,1.003786,0.997969,0.981911,1.021233,1.027465,1.020506,0.977939,0.966586,0.985563
3,sub-87964717,4,SMC,0.199366,0.181563,0.190631,0.172736,0.161905,0.160612,0.155581,...,0.987823,1.029448,1.023404,1.011507,0.995531,1.025771,1.032580,0.990293,0.982638,1.006213
4,sub-87964717,5,SMC,0.145020,0.183063,0.181982,0.120705,0.153659,0.112695,0.130231,...,1.042865,1.021432,1.011774,1.023296,1.010551,1.047893,1.041591,1.019603,1.005809,1.024247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,sub-88076717,8,OCD,0.148191,0.145226,0.173885,0.156004,0.203578,0.119112,0.131782,...,1.050150,1.093657,1.076960,1.060231,1.153361,1.122004,1.055451,1.082843,1.109466,1.075791
2696,sub-88076717,9,OCD,0.111555,0.120504,0.139117,0.126790,0.116225,0.123273,0.116095,...,1.095957,1.144254,1.111357,1.115784,1.178204,1.139658,1.100500,1.109036,1.142876,1.095822
2697,sub-88076717,10,OCD,0.159637,0.170118,0.184458,0.234596,0.188101,0.208276,0.169570,...,1.059378,1.114762,1.086334,1.146659,1.183257,1.136850,1.136010,1.085056,1.136426,1.107047
2698,sub-88076717,11,OCD,0.139784,0.157755,0.164089,0.131705,0.166881,0.116801,0.150072,...,1.048840,1.054833,1.083075,1.104262,1.037404,1.049401,1.063298,1.047424,1.064664,1.070277


In [26]:
# Persist the Boruta-selected statistical + connectivity features for later use.
df_selected_stat_conn_features.to_pickle(
    path=r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_conn_features.pkl'
)