## AIM: reduce dimensionality by feature selection with BORUTA

A seperate BORUTA will be performed for each feature set (6 total):
1. statistical TFR features (for ratio, EC, and EO)
2. stat TFR features + connectivity features (for ratio, EC, and EO)

n_estimates & max_depth for RFs in BORUTA were determined for each feature set in 'boruta_hyperparameter_tuning.ipynb' located in the 'background_notebooks' subdir

In [1]:
from boruta import BorutaPy
import pandas as pd
import numpy as np
import mne
import pickle

from sklearn.ensemble import RandomForestClassifier

# 1. statistical TFR features

In [2]:
df_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_features.pkl')
df_features['age'] = df_features['age'].round().astype('Int32') # round age to nearest year and convert to int
df_features = df_features.dropna(subset=['diagnosis'])
df_features.sample(7)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_frontal_delta_std,EO_frontal_delta_mean,EO_frontal_delta_median,EO_frontal_delta_skew,EO_frontal_delta_kurt,...,ratio_parietal_gamma_std,ratio_parietal_gamma_mean,ratio_parietal_gamma_median,ratio_parietal_gamma_skew,ratio_parietal_gamma_kurt,ratio_occipital_gamma_std,ratio_occipital_gamma_mean,ratio_occipital_gamma_median,ratio_occipital_gamma_skew,ratio_occipital_gamma_kurt
6495,sub-88065197,59,1.0,MDD,4,0.402594,-0.173878,-0.160064,-0.326887,-0.004349,...,1.074327,1.096971,1.090395,1.416404,1.636298,1.158259,1.212559,1.197677,2.857314,5.91426
2925,sub-88027713,39,0.0,MDD,10,0.353482,-0.135726,-0.136254,-0.28959,0.44279,...,,,,,,,,,,
4509,sub-88045809,43,1.0,MDD,10,0.345698,-0.126597,-0.120706,-0.366557,0.32396,...,0.993529,0.970952,0.879674,1.293692,3.034661,1.00618,1.015828,1.01459,0.970026,1.367261
4579,sub-88046305,32,1.0,ADHD,8,0.361966,-0.145183,-0.133885,-0.29676,0.267601,...,0.996554,0.953888,0.997309,1.056836,0.621444,1.019225,1.060079,0.946704,0.863734,1.591785
3733,sub-88039193,7,1.0,ADHD,2,0.409534,-0.179495,-0.163032,-0.34786,0.17642,...,1.149721,1.719383,1.62704,-3.709325,2.463859,1.210845,1.795088,1.910657,-0.740165,1.831596
910,sub-87970705,77,1.0,SMC,11,0.429331,-0.198743,-0.187977,-0.245703,0.199065,...,1.079447,1.149445,1.247354,1.222287,3.644735,1.051292,1.056332,1.114315,1.524241,2.649423
5030,sub-88050081,11,0.0,ADHD,3,0.369446,-0.136168,-0.130305,-0.565447,0.296085,...,0.651615,0.326599,0.340897,-0.817426,0.580296,0.693785,0.353069,0.415476,-1.160827,0.968208


Subsampling dataset here to rebalance dataset. Possibly would have been more efficient to perform before feature extraction, however I already extracted features of the full sample, which allows me to not have to recompute features if we ever change this subsampling step.

In [3]:
# subsample dataset to rebalance dataset
df_ids = df_features[(df_features['epoch'] == 1)] # select only first epoch to make sure we subsample from participants, and not from epochs
df_ids_subsample = df_ids.groupby('diagnosis').sample(45, random_state=42) # sample 45 participants per diagnosis (which is the max for OCD)
df_ids_subsample_index = df_ids_subsample['ID'].tolist()
df_subsample = df_features[(df_features['ID'].isin(df_ids_subsample_index))]
df_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [4]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_features_ec = df_subsample[df_features.columns[~(df_subsample.columns.str.startswith('EO') | df_subsample.columns.str.startswith('ratio'))]]
df_features_eo = df_subsample[df_features.columns[~(df_subsample.columns.str.startswith('EC') | df_subsample.columns.str.startswith('ratio'))]]
df_features_ratio = df_subsample[df_features.columns[~(df_subsample.columns.str.startswith('EC') | df_subsample.columns.str.startswith('EO'))]]

In [5]:
# channel groupings
frontal = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'FC3', 'FCz', 'FC4']
central = ['T7', 'C3', 'Cz', 'C4', 'T8']
parietal = ['CP3','CPz', 'CP4', 'P7', 'P3', 'Pz', 'P4', 'P8'] 
occipital = ['O1', 'Oz', 'O2']
channel_groups = {'frontal': frontal, 'central': central, 'parietal': parietal, 'occipital': occipital}

# define (5) frequencies of interest for TFR per frequency band
delta = np.array([1, 1.5, 2, 2.5, 3]) # starting at one because of high-pass filter
theta = np.array([4, 4.75, 5.5, 6.25, 7])
alpha = np.array([8, 9, 10, 11, 12])
beta = np.array([13, 17.25, 21.5, 25.75, 30])
gamma = np.array([42, 54, 66, 78, 90]) 
bands = {'delta': delta, 'theta': theta, 'alpha': alpha, 'beta': beta, 'gamma': gamma}

### Ratio features

In [6]:
# define numeric columns
numeric_cols = []
for cond in ['ratio']:
    for band in bands:
        for group in channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
 
X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [15]:
selected_columns = df_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ratio = df_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ratio.shape)
df_selected_feat_ratio.sample(5)

(2700, 47)


Unnamed: 0,ID,epoch,diagnosis,ratio_frontal_delta_median,ratio_frontal_delta_skew,ratio_central_delta_skew,ratio_parietal_delta_skew,ratio_occipital_delta_skew,ratio_frontal_theta_std,ratio_frontal_theta_mean,...,ratio_parietal_gamma_std,ratio_parietal_gamma_mean,ratio_parietal_gamma_median,ratio_parietal_gamma_skew,ratio_parietal_gamma_kurt,ratio_occipital_gamma_std,ratio_occipital_gamma_mean,ratio_occipital_gamma_median,ratio_occipital_gamma_skew,ratio_occipital_gamma_kurt
1256,sub-87980241,9,HEALTHY,1.267147,0.231345,0.064359,0.463078,2.840686,0.948749,0.999695,...,1.017946,1.054241,1.009118,0.852617,0.672806,1.02729,1.064611,1.080077,0.883717,0.721891
6629,sub-88066457,6,OCD,0.87658,1.056424,0.922057,0.476081,0.106165,0.785078,0.532941,...,0.915092,0.927233,0.828066,0.588595,1.785136,0.9312,0.927815,0.868644,1.127093,-82.325195
6904,sub-88068665,5,MDD,0.687538,0.22928,1.57524,1.745218,-27.636263,0.813915,0.822469,...,0.952916,0.933875,0.937722,0.883384,1.046396,1.0796,1.195692,1.386447,0.640032,0.134864
6084,sub-88061193,1,OCD,1.049854,0.63139,0.48289,0.878726,-0.851745,1.07344,1.152974,...,0.904385,0.86442,0.848427,0.769232,0.574813,0.896724,0.840701,0.86798,0.849355,0.802514
577,sub-87968677,2,SMC,0.623139,0.941062,1.185358,0.871997,-0.036575,0.837867,0.713413,...,1.069951,1.132486,1.140683,1.088763,0.770957,1.011923,1.006712,1.012405,1.181259,1.692074


### EC features

In [21]:
# define numeric columns
numeric_cols = []
for cond in ['EC']:
    for band in bands:
        for group in channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
 
X = df_features_ec[numeric_cols]
Y = df_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=155, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [22]:
selected_columns = df_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ec = df_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ec.shape)
df_selected_feat_ec.sample(5)

(2700, 47)


Unnamed: 0,ID,epoch,diagnosis,EC_central_delta_std,EC_central_delta_mean,EC_occipital_delta_std,EC_frontal_theta_std,EC_frontal_theta_mean,EC_frontal_theta_median,EC_central_theta_std,...,EC_central_gamma_skew,EC_central_gamma_kurt,EC_parietal_gamma_std,EC_parietal_gamma_mean,EC_parietal_gamma_median,EC_parietal_gamma_skew,EC_occipital_gamma_std,EC_occipital_gamma_mean,EC_occipital_gamma_median,EC_occipital_gamma_skew
5417,sub-88053677,6,OCD,0.346186,-0.126356,0.431059,0.371089,-0.153797,-0.151898,0.352847,...,-0.542078,0.142949,0.286944,-0.085715,-0.086391,-0.576318,0.297519,-0.095309,-0.089621,-0.473478
3466,sub-88035229,11,MDD,0.381996,-0.140536,0.373675,0.314289,-0.100397,-0.0912,0.30909,...,-0.638062,0.70589,0.271531,-0.07777,-0.075144,-0.497964,0.280378,-0.085072,-0.090011,-0.353303
981,sub-87971109,10,SMC,0.356517,-0.126531,0.378812,0.286886,-0.089017,-0.082905,0.294076,...,-0.676466,0.721964,0.259949,-0.069742,-0.068549,-0.593979,0.26643,-0.073254,-0.072948,-0.594332
779,sub-87969937,12,SMC,0.333607,-0.123515,0.333874,0.280643,-0.081344,-0.074788,0.275849,...,-0.516769,0.372727,0.27344,-0.077042,-0.071187,-0.571808,0.288559,-0.086514,-0.091021,-0.502072
562,sub-87968541,11,SMC,0.411626,-0.183933,0.359617,0.310404,-0.107739,-0.106258,0.323405,...,-0.55614,0.528888,0.267081,-0.073026,-0.071052,-0.601315,0.265887,-0.071856,-0.070453,-0.659663


### EO features

In [23]:
# define numeric columns
numeric_cols = []
for cond in ['EO']:
    for band in bands:
        for group in channel_groups:
            for stat in ['std', 'mean', 'median', 'skew', 'kurt']:
                numeric_cols.append(f'{cond}_{group}_{band}_{stat}')
 
X = df_features_eo[numeric_cols]
Y = df_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=180, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [24]:
selected_columns = df_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_eo = df_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_eo.shape)
df_selected_feat_eo.sample(5)

(2700, 55)


Unnamed: 0,ID,epoch,diagnosis,EO_frontal_delta_median,EO_frontal_delta_skew,EO_central_delta_std,EO_central_delta_mean,EO_central_delta_median,EO_parietal_delta_std,EO_occipital_delta_skew,...,EO_central_gamma_median,EO_central_gamma_skew,EO_parietal_gamma_std,EO_parietal_gamma_mean,EO_parietal_gamma_median,EO_parietal_gamma_skew,EO_occipital_gamma_std,EO_occipital_gamma_mean,EO_occipital_gamma_median,EO_occipital_gamma_skew
1280,sub-87980373,9,HEALTHY,-0.118508,-0.12716,0.375616,-0.158388,-0.154008,0.367489,-0.333889,...,-0.093893,-0.348711,0.299159,-0.09797,-0.089295,-0.354729,0.293818,-0.096528,-0.092687,-0.253533
4984,sub-88049813,5,MDD,-0.169518,-0.593636,0.422799,-0.180745,-0.17949,0.416829,-0.514643,...,-0.081722,-0.467212,0.268108,-0.074917,-0.074755,-0.508259,0.266573,-0.073085,-0.075445,-0.554656
991,sub-87971197,8,SMC,-0.129451,-0.431434,0.380726,-0.172384,-0.151781,0.365987,-0.503973,...,-0.080907,-0.519339,0.285118,-0.084749,-0.077966,-0.581304,0.269349,-0.075142,-0.075527,-0.498993
6361,sub-88064345,2,ADHD,-0.13301,-0.569376,0.346866,-0.12456,-0.122064,0.32989,-0.93783,...,-0.084261,-0.645176,0.293824,-0.088474,-0.083555,-0.590357,0.304322,-0.091507,-0.086935,-0.748766
648,sub-87968945,1,SMC,-0.130079,-0.448475,0.349376,-0.124305,-0.126467,0.359604,-0.344471,...,-0.083916,-0.600275,0.296474,-0.09108,-0.091095,-0.583274,0.297731,-0.093311,-0.092972,-0.401414


### Merge selected ratio, EC, EO features

In [25]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_feat_eo = df_selected_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_feat_ratio = df_selected_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_features = pd.concat([df_selected_feat_ec, df_selected_feat_eo, df_selected_feat_ratio], axis=1)
df_selected_features

Unnamed: 0,ID,epoch,diagnosis,EC_central_delta_std,EC_central_delta_mean,EC_occipital_delta_std,EC_frontal_theta_std,EC_frontal_theta_mean,EC_frontal_theta_median,EC_central_theta_std,...,ratio_parietal_gamma_std,ratio_parietal_gamma_mean,ratio_parietal_gamma_median,ratio_parietal_gamma_skew,ratio_parietal_gamma_kurt,ratio_occipital_gamma_std,ratio_occipital_gamma_mean,ratio_occipital_gamma_median,ratio_occipital_gamma_skew,ratio_occipital_gamma_kurt
24,sub-87964717,1,SMC,0.388111,-0.158658,0.332231,0.327564,-0.118993,-0.116429,0.339098,...,0.968281,1.022076,1.052374,0.469458,0.126383,1.051424,1.123622,1.156979,0.866754,1.785548
25,sub-87964717,2,SMC,0.404008,-0.166120,0.371482,0.330950,-0.109538,-0.109491,0.321337,...,0.994825,0.998219,1.003191,0.944901,1.384317,1.020286,1.098013,1.087812,0.544720,0.002118
26,sub-87964717,3,SMC,0.424449,-0.191903,0.333949,0.392801,-0.156261,-0.154654,0.363305,...,1.071320,1.153472,1.159519,0.889397,1.053610,1.156190,1.299332,1.286826,1.068119,0.834607
27,sub-87964717,4,SMC,0.455659,-0.224328,0.451149,0.389299,-0.172491,-0.175309,0.375920,...,0.924726,0.751021,0.873962,4.524715,1.097089,0.956484,0.916655,1.080320,0.780077,0.596561
28,sub-87964717,5,SMC,0.366557,-0.142051,0.364635,0.364516,-0.157834,-0.157847,0.346362,...,1.132596,1.413028,1.312625,0.472916,1.821612,1.133601,1.252463,1.128770,1.247167,1.738256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7903,sub-88077525,8,MDD,0.376217,-0.146065,0.363059,0.294275,-0.094149,-0.092135,0.293015,...,0.979383,0.979756,0.968850,0.882507,-2.986603,0.972764,0.956172,0.903145,0.984097,-1.452343
7904,sub-88077525,9,MDD,0.404842,-0.167801,0.341771,0.325576,-0.109965,-0.107863,0.342199,...,1.176680,1.304158,1.240184,1.272699,1.516883,1.187179,1.332789,1.340861,1.232894,1.124988
7905,sub-88077525,10,MDD,0.319906,-0.106238,0.358224,0.273953,-0.075649,-0.075369,0.254059,...,0.952579,0.866925,0.832524,1.484224,2.400569,0.952937,0.872762,0.780062,1.398447,1.328253
7906,sub-88077525,11,MDD,0.348354,-0.127219,0.338369,0.284891,-0.083445,-0.084718,0.275252,...,0.979437,0.962729,0.933522,1.123375,1.001902,0.996582,0.992473,0.909772,1.143214,0.975487


In [30]:
df_selected_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_features.pkl')

# 2. stat TFR features + connectivity features