## AIM: reduce dimensionality by feature selection with BORUTA

A seperate BORUTA will be performed for each feature set (6 total):
1. statistical TFR features (for ratio, EC, and EO)
2. stat TFR features + connectivity features (for ratio, EC, and EO)

n_estimates & max_depth for RFs in BORUTA were determined for each feature set in 'boruta_hyperparameter_tuning.ipynb' located in the 'background_notebooks' subdir

In [1]:
from boruta import BorutaPy
import pandas as pd
import numpy as np
import mne
import pickle
import os

from sklearn.ensemble import RandomForestClassifier

# 1. statistical TFR features

In [2]:
df_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_features.pkl')
df_features['age'] = df_features['age'].round().astype('Int32') # round age to nearest year and convert to int
df_features = df_features.dropna(subset=['diagnosis'])
df_features.sample(7)

Unnamed: 0,ID,age,gender,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,...,ratio_m_posterior_gamma_std,ratio_m_posterior_gamma_mean,ratio_m_posterior_gamma_median,ratio_m_posterior_gamma_skew,ratio_m_posterior_gamma_kurt,ratio_r_posterior_gamma_std,ratio_r_posterior_gamma_mean,ratio_r_posterior_gamma_median,ratio_r_posterior_gamma_skew,ratio_r_posterior_gamma_kurt
3517,sub-88035677,38,1.0,MDD,2,0.098979,-0.14101,-0.14101,-0.284329,-0.62288,...,4.215776,1.121205,0.988509,20.563573,479.708482,3.480968,0.718548,0.989,-8.350096,128.916167
7551,sub-88074201,46,1.0,MDD,4,0.126424,-0.168686,-0.168686,0.096367,-0.427218,...,4.631886,1.664928,0.912184,10.581595,147.088128,4.108013,1.393911,0.784294,2.028103,40.992261
1370,sub-88000181,46,0.0,MDD,3,0.125347,-0.147083,-0.147083,0.057914,0.713123,...,15.678257,1.111586,0.984096,-11.288592,286.068449,4.128942,1.245182,1.0849,-4.911016,64.361537
2723,sub-88025597,23,0.0,MDD,12,0.105342,-0.140473,-0.140473,-0.490988,0.168104,...,2.473169,1.322508,1.122526,3.365191,91.775175,8.199124,1.158972,1.291637,-10.19145,255.040098
5024,sub-88050037,39,1.0,MDD,9,0.134984,-0.155155,-0.155155,0.484081,-0.133096,...,20.219338,1.258459,1.411296,-6.436791,154.029968,62.105123,0.317657,1.660085,-24.147007,593.400294
4776,sub-88048193,53,0.0,OCD,1,0.108061,-0.156556,-0.156556,0.240129,0.55254,...,18.226657,1.95466,0.937574,10.56305,210.440097,11.115452,1.869567,0.900923,16.432729,338.010041
3631,sub-88037977,21,1.0,ADHD,8,0.115198,-0.14173,-0.14173,-0.245264,-0.137219,...,2.848346,0.981649,0.848772,-10.219959,167.505634,10.456138,0.824152,0.859685,-22.730656,551.951123


Subsampling dataset here to rebalance dataset. Possibly would have been more efficient to perform before feature extraction, however I already extracted features of the full sample, which allows me to not have to recompute features if we ever change this subsampling step. Additionally, for a still unknown reason, during feature extraction some participants data seem to be lossed, possibly due to the marked 'BAD' files from preprocessing.

In [6]:
# subsample dataset to rebalance dataset
df_ids = df_features[(df_features['epoch'] == 1)] # select only first epoch to make sure we subsample from participants, and not from epochs
df_ids_subsample = df_ids.groupby('diagnosis').sample(45, random_state=42) # sample 45 participants per diagnosis (which is the max for OCD)
df_ids_subsample_index = df_ids_subsample['ID'].tolist()
df_stat_subsample = df_features[(df_features['ID'].isin(df_ids_subsample_index))]
df_stat_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [7]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_features_ec = df_stat_subsample[df_features.columns[~(df_stat_subsample.columns.str.startswith('EO') | df_stat_subsample.columns.str.startswith('ratio'))]]
df_features_eo = df_stat_subsample[df_features.columns[~(df_stat_subsample.columns.str.startswith('EC') | df_stat_subsample.columns.str.startswith('ratio'))]]
df_features_ratio = df_stat_subsample[df_features.columns[~(df_stat_subsample.columns.str.startswith('EC') | df_stat_subsample.columns.str.startswith('EO'))]]

### Ratio features

In [26]:
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the digits dataset
numeric_cols = [num_col for num_col in df_features_ratio.columns if 'ratio' in num_col]
X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
pipe = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ("scaler", StandardScaler()),
        ("rfe", RFE(estimator=rf, n_features_to_select=0.5, step=1, verbose=1)),
    ]
)

pipe.fit(X, Y)
print(pipe.named_steps["rfe"].ranking_)
print(pipe.named_steps["rfe"].support_)
print(pipe.named_steps["rfe"].n_features_)

Fitting estimator with 225 features.
Fitting estimator with 224 features.
Fitting estimator with 223 features.
Fitting estimator with 222 features.
Fitting estimator with 221 features.
Fitting estimator with 220 features.
Fitting estimator with 219 features.
Fitting estimator with 218 features.
Fitting estimator with 217 features.
Fitting estimator with 216 features.
Fitting estimator with 215 features.
Fitting estimator with 214 features.
Fitting estimator with 213 features.
Fitting estimator with 212 features.
Fitting estimator with 211 features.
Fitting estimator with 210 features.
Fitting estimator with 209 features.
Fitting estimator with 208 features.
Fitting estimator with 207 features.
Fitting estimator with 206 features.
Fitting estimator with 205 features.
Fitting estimator with 204 features.
Fitting estimator with 203 features.
Fitting estimator with 202 features.
Fitting estimator with 201 features.
Fitting estimator with 200 features.
Fitting estimator with 199 features.
F

In [27]:
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the digits dataset
numeric_cols = [num_col for num_col in df_stat_conn_features_ratio.columns if 'ratio' in num_col]
X = df_stat_conn_features_ratio[numeric_cols]
Y = df_stat_conn_features_ratio['diagnosis']
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
pipe = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ("scaler", StandardScaler()),
        ("rfe", RFE(estimator=rf, n_features_to_select=0.5, step=1, verbose=1)),
    ]
)

pipe.fit(X, Y)
print(pipe.named_steps["rfe"].ranking_)
print(pipe.named_steps["rfe"].support_)
print(pipe.named_steps["rfe"].n_features_)

Fitting estimator with 405 features.
Fitting estimator with 404 features.
Fitting estimator with 403 features.
Fitting estimator with 402 features.
Fitting estimator with 401 features.
Fitting estimator with 400 features.
Fitting estimator with 399 features.
Fitting estimator with 398 features.
Fitting estimator with 397 features.
Fitting estimator with 396 features.
Fitting estimator with 395 features.
Fitting estimator with 394 features.
Fitting estimator with 393 features.
Fitting estimator with 392 features.
Fitting estimator with 391 features.
Fitting estimator with 390 features.
Fitting estimator with 389 features.
Fitting estimator with 388 features.
Fitting estimator with 387 features.
Fitting estimator with 386 features.
Fitting estimator with 385 features.
Fitting estimator with 384 features.
Fitting estimator with 383 features.
Fitting estimator with 382 features.
Fitting estimator with 381 features.
Fitting estimator with 380 features.
Fitting estimator with 379 features.
F

In [12]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# define numeric columns
numeric_cols = [num_col for num_col in df_features_ratio.columns if 'ratio' in num_col]
X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

from sklearn.model_selection import StratifiedGroupKFold
cv_outer = StratifiedGroupKFold(n_splits = 3, shuffle=False)
groups = df_features_ratio['ID'].tolist()

estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, cv=3, scoring='accuracy', verbose=10, n_jobs=-1)
selector = selector.fit(X, Y)
print(f"Optimal number of features: {selector.n_features_}")
selector.support_
selector.ranking_

KeyboardInterrupt: 

In [41]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# load X and y
# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
X = pd.read_csv('test_X.csv', index_col=0).values
y = pd.read_csv('test_y.csv', header=None, index_col=0).values
y = y.ravel()

# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	10
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	5
Tentative: 	0
Rejected: 	5


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	5
Tentative: 	0
Rejected: 	5


In [44]:
# define numeric columns
numeric_cols = [num_col for num_col in df_features_ratio.columns if 'ratio' in num_col]
 
X = df_features_ratio[numeric_cols]
Y = df_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_jobs=-1, max_depth=5, class_weight='balanced')
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=10, n_estimators='auto', alpha=0.05)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	225
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	225
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	225
Tentative: 	0
Rejected: 	0


In [8]:
selected_columns = df_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ratio = df_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ratio.shape)
df_selected_feat_ratio.sample(5)

KeyError: "None of [Index(['EC_l_frontal_delta_std', 'EC_l_frontal_delta_mean',\n       'EC_l_frontal_delta_median', 'EC_l_frontal_delta_skew',\n       'EC_l_frontal_delta_kurt', 'EC_m_frontal_delta_std',\n       'EC_m_frontal_delta_mean', 'EC_m_frontal_delta_median',\n       'EC_m_frontal_delta_skew', 'EC_m_frontal_delta_kurt',\n       ...\n       'EC_m_posterior_gamma_std', 'EC_m_posterior_gamma_mean',\n       'EC_m_posterior_gamma_median', 'EC_m_posterior_gamma_skew',\n       'EC_m_posterior_gamma_kurt', 'EC_r_posterior_gamma_std',\n       'EC_r_posterior_gamma_mean', 'EC_r_posterior_gamma_median',\n       'EC_r_posterior_gamma_skew', 'EC_r_posterior_gamma_kurt'],\n      dtype='object', length=225)] are in the [columns]"

### EC features

In [21]:
# define numeric columns
numeric_cols = [num_col for num_col in df_features_ec.columns if 'EC' in num_col]
 
X = df_features_ec[numeric_cols]
Y = df_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	225
Tentative: 	0
Rejected: 	0


In [9]:
selected_columns = df_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_ec = df_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_ec.shape)
df_selected_feat_ec.sample(5)

(2700, 228)


Unnamed: 0,ID,epoch,diagnosis,EC_l_frontal_delta_std,EC_l_frontal_delta_mean,EC_l_frontal_delta_median,EC_l_frontal_delta_skew,EC_l_frontal_delta_kurt,EC_m_frontal_delta_std,EC_m_frontal_delta_mean,...,EC_m_posterior_gamma_std,EC_m_posterior_gamma_mean,EC_m_posterior_gamma_median,EC_m_posterior_gamma_skew,EC_m_posterior_gamma_kurt,EC_r_posterior_gamma_std,EC_r_posterior_gamma_mean,EC_r_posterior_gamma_median,EC_r_posterior_gamma_skew,EC_r_posterior_gamma_kurt
1087,sub-87974841,8,HEALTHY,0.102534,-0.146375,-0.146375,-0.021861,-0.305888,0.107041,-0.142822,...,0.047405,-0.073373,-0.073373,-1.524313,7.087042,0.049798,-0.074618,-0.074618,-0.864933,3.763034
1341,sub-87982225,10,HEALTHY,0.093281,-0.164481,-0.164481,-0.141673,1.32793,0.095284,-0.164969,...,0.049185,-0.078997,-0.078997,-1.421078,6.111397,0.047553,-0.07891,-0.07891,-1.594625,7.973869
1047,sub-87974621,4,HEALTHY,0.104186,-0.130048,-0.130048,-0.358229,0.247049,0.097501,-0.128235,...,0.043145,-0.077324,-0.077324,-1.735082,8.033426,0.043319,-0.075077,-0.075077,-1.888008,9.216856
5653,sub-88056649,2,OCD,0.118698,-0.155297,-0.155297,-0.647004,2.091686,0.124438,-0.16409,...,0.051262,-0.077876,-0.077876,-2.285922,10.468049,0.053622,-0.081559,-0.081559,-2.098021,10.360141
4759,sub-88047789,8,MDD,0.110855,-0.17278,-0.17278,-0.419452,-0.197955,0.096647,-0.154386,...,0.04518,-0.074299,-0.074299,-2.005487,10.054662,0.046309,-0.074482,-0.074482,-1.598395,7.260499


### EO features

In [17]:
# define numeric columns
numeric_cols = [num_col for num_col in df_features_eo.columns if 'EO' in num_col]
 
X = df_features_eo[numeric_cols]
Y = df_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=170, n_jobs=-1, max_depth=5)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [18]:
selected_columns = df_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_feat_eo = df_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_feat_eo.shape)
df_selected_feat_eo.sample(5)

(2700, 102)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,EO_r_frontal_delta_mean,EO_m_central_delta_mean,EO_r_central_delta_std,EO_r_central_delta_mean,EO_r_central_delta_median,...,EO_m_posterior_gamma_std,EO_m_posterior_gamma_mean,EO_m_posterior_gamma_median,EO_m_posterior_gamma_skew,EO_m_posterior_gamma_kurt,EO_r_posterior_gamma_std,EO_r_posterior_gamma_mean,EO_r_posterior_gamma_median,EO_r_posterior_gamma_skew,EO_r_posterior_gamma_kurt
6933,sub-88068841,10,HEALTHY,-0.197953,-0.232924,-0.12544,-0.111258,0.390823,-0.144331,-0.146203,...,0.287532,-0.086424,-0.075811,-0.539149,0.352333,0.289671,-0.08778,-0.087915,-0.500158,0.252589
666,sub-87969125,7,SMC,0.032951,0.34045,-0.153825,-0.271177,0.402081,-0.190609,-0.183459,...,0.272821,-0.076283,-0.075875,-0.603762,0.522994,0.283451,-0.080963,-0.082509,-0.754849,1.7232
857,sub-87970345,6,SMC,-0.342508,0.367209,-0.128005,-0.139156,0.399094,-0.151797,-0.153386,...,0.265838,-0.074925,-0.077636,-0.448801,0.421761,0.265867,-0.072464,-0.072326,-0.618791,0.566448
1199,sub-87976773,12,HEALTHY,-0.687388,1.291325,-0.127022,-0.120564,0.375112,-0.13572,-0.137548,...,0.302442,-0.090793,-0.092133,-0.734105,0.664549,0.271916,-0.077775,-0.075948,-0.501906,0.473327
2977,sub-88028661,2,ADHD,-0.547852,0.290638,-0.134493,-0.123947,0.413656,-0.16271,-0.164255,...,0.309155,-0.108622,-0.10641,-0.120743,0.043035,0.336005,-0.131397,-0.120089,-0.149792,0.313014


### Merge selected ratio, EC, EO features

In [45]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_feat_eo = df_selected_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_feat_ratio = df_selected_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_features = pd.concat([df_selected_feat_ec, df_selected_feat_eo, df_selected_feat_ratio], axis=1)
df_selected_features

NameError: name 'df_selected_feat_eo' is not defined

In [20]:
df_selected_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_features.pkl')

# 2. statistical TFR + connectivity features

In [19]:
df_conn_features = pd.read_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_connectivity_features.pkl')
df_features = df_features.dropna(subset=['diagnosis'])

In [20]:
# subsample dataset to rebalance dataset
df_conn_subsample = df_conn_features[(df_conn_features['ID'].isin(df_ids_subsample_index))] # use same sampled IDs from earlier for the stat features
df_conn_subsample['diagnosis'].value_counts()

diagnosis
SMC        540
HEALTHY    540
MDD        540
ADHD       540
OCD        540
Name: count, dtype: int64

In [28]:
# merge stat and conn features and store for later training of GCNs
df_stat_conn_features = pd.merge(df_stat_subsample, df_conn_subsample.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
# drop age and gender columns
df_stat_conn_features = df_stat_conn_features.drop(columns=['age', 'gender'])
df_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_stat_conn_features.pkl')
print(df_stat_conn_features.shape)
df_stat_conn_features.sample(3)

(2700, 1218)


Unnamed: 0,ID,diagnosis,epoch,EO_l_frontal_delta_std,EO_l_frontal_delta_mean,EO_l_frontal_delta_median,EO_l_frontal_delta_skew,EO_l_frontal_delta_kurt,EO_m_frontal_delta_std,EO_m_frontal_delta_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
1722,sub-88052241,ADHD,7,0.100076,-0.158545,-0.158545,-1.08266,2.865383,0.122315,-0.154653,...,0.96196,0.944384,0.953038,0.934521,0.90969,0.938715,0.952844,0.954012,0.938386,0.957157
2177,sub-88062949,OCD,6,0.097307,-0.140226,-0.140226,-0.367643,0.662852,0.109159,-0.147384,...,1.016275,1.004632,0.997673,1.005407,1.01831,1.007062,1.012593,1.001378,0.992259,0.983508
848,sub-87982225,HEALTHY,9,0.10244,-0.149278,-0.149278,-0.876528,1.445813,0.10446,-0.131979,...,0.99745,1.009344,0.998239,1.002707,1.008139,0.997294,0.998881,1.002626,1.007078,1.003518


In [29]:
# create 3 feature sets [EC, EO, ratio] with column ['ID', 'age', 'gender', 'diagnosis', 'epoch']
df_conn_features_ec = df_conn_subsample[df_conn_features.columns[~(df_conn_subsample.columns.str.startswith('EO') | df_conn_subsample.columns.str.startswith('ratio'))]]
df_conn_features_eo = df_conn_subsample[df_conn_features.columns[~(df_conn_subsample.columns.str.startswith('EC') | df_conn_subsample.columns.str.startswith('ratio'))]]
df_conn_features_ratio = df_conn_subsample[df_conn_features.columns[~(df_conn_subsample.columns.str.startswith('EC') | df_conn_subsample.columns.str.startswith('EO'))]]

In [30]:
# concat connectivity features with stat features
df_stat_conn_features_ec = pd.merge(df_features_ec, df_conn_features_ec.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
df_stat_conn_features_eo = pd.merge(df_features_eo, df_conn_features_eo.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
df_stat_conn_features_ratio = pd.merge(df_features_ratio, df_conn_features_ratio.drop(columns=['diagnosis']),  how='outer', on=['ID', 'epoch'])
print(df_stat_conn_features_ec.shape)



(2700, 410)


### Ratio features

In [31]:
# define numeric columns
numeric_cols = [num_col for num_col in df_stat_conn_features_ratio.columns if 'ratio' in num_col]
 
X = df_stat_conn_features_ratio[numeric_cols]
Y = df_stat_conn_features_ratio['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=15)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=3)
feature_selection = trans.fit_transform(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	405
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	357
Tentative: 	48
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	357
Tentative: 	48
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	357
Tentative: 	34
Rejected: 	14
Iteration: 	11 / 100
Confirmed: 	357
Tentative: 	34
Rejected: 	14
Iteration: 	12 / 100
Confirmed: 	364
Tentative: 	27
Rejected: 	14
Iteration: 	13 / 100
Confirmed: 	364
Tentative: 	27
Rejected: 	14
Iteration: 	14 / 100
Confirmed: 	364
Tentative: 	27
Rejected: 	14
Iteration: 	15 / 100
Confirmed: 	364
Tentative: 	27
Rejected: 	14
Iteration: 	16 / 100
Confirmed: 	36

In [52]:
selected_columns = df_stat_conn_features_ratio[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_ratio = df_stat_conn_features_ratio[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_ratio.shape)
df_selected_stat_conn_feat_ratio.sample(5)

(2700, 372)


Unnamed: 0,ID,epoch,diagnosis,ratio_l_frontal_delta_std,ratio_l_frontal_delta_mean,ratio_l_frontal_delta_median,ratio_l_frontal_delta_skew,ratio_l_frontal_delta_kurt,ratio_m_frontal_delta_std,ratio_m_frontal_delta_mean,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
1449,sub-88042837,10,MDD,36.036958,-0.356895,0.683848,-21.973963,533.513749,48.948798,0.955026,...,0.997038,0.960942,0.976912,0.987959,0.965173,0.993162,1.011417,0.984619,0.958168,0.978686
1044,sub-88020153,1,MDD,28.979502,1.87419,0.921036,8.385197,197.372772,26.8627,1.249883,...,1.027568,1.013674,1.024323,1.040437,1.004093,1.028365,1.037633,0.993042,1.00106,1.01616
1414,sub-88041893,11,HEALTHY,11.392377,1.504075,0.786672,24.405059,601.201978,2.799873,0.83195,...,1.00806,1.001965,1.007812,0.993754,1.013329,1.014439,1.010723,1.001079,0.988707,1.003822
2556,sub-88073521,1,ADHD,,,,,,,,...,,,,,,,,,,
2120,sub-88061149,9,OCD,4.363247,1.069917,0.683508,5.695638,169.980624,2.231637,1.050932,...,1.002484,0.98817,0.996666,1.000588,0.995859,0.999779,0.99769,0.998829,1.003773,1.000257


In [53]:
# count the number of selected statistical features (columns containing ['std', 'mean', 'median', 'skew', 'kurt'])
stat_selected = [col for col in df_selected_stat_conn_feat_ratio.columns if any(x in col for x in ['std', 'mean', 'median', 'skew', 'kurt'])]
print(len(stat_selected))

225


### EC features

In [28]:
# define numeric columns
numeric_cols = [num_col for num_col in df_stat_conn_features_ec.columns if 'EC' in num_col]
 
X = df_stat_conn_features_ec[numeric_cols]
Y = df_stat_conn_features_ec['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=185, n_jobs=-1, max_depth=20)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [29]:
selected_columns = df_stat_conn_features_ec[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_ec = df_stat_conn_features_ec[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_ec.shape)
df_selected_stat_conn_feat_ec.sample(5)

(2700, 275)


Unnamed: 0,ID,epoch,diagnosis,EC_r_central_delta_std,EC_r_central_delta_mean,EC_r_central_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_m_frontal_theta_std,...,EC_gamma_m_central-r_central,EC_gamma_m_central-l_posterior,EC_gamma_m_central-m_posterior,EC_gamma_m_central-r_posterior,EC_gamma_r_central-l_posterior,EC_gamma_r_central-m_posterior,EC_gamma_r_central-r_posterior,EC_gamma_l_posterior-m_posterior,EC_gamma_l_posterior-r_posterior,EC_gamma_m_posterior-r_posterior
1899,sub-88054937,4,OCD,0.332098,-0.122867,-0.113763,0.328039,-0.111721,-0.108884,0.322662,...,0.729098,0.719448,0.734877,0.7091,0.672243,0.699778,0.726153,0.778882,0.725873,0.771979
793,sub-87980417,2,HEALTHY,0.355726,-0.131721,-0.12841,0.367791,-0.157296,-0.160873,0.381967,...,0.796595,0.761514,0.750052,0.743671,0.719313,0.736035,0.772624,0.783369,0.749037,0.765651
512,sub-87971197,9,SMC,0.414198,-0.167084,-0.160148,0.387119,-0.194502,-0.183695,0.395064,...,0.72836,0.675939,0.703368,0.681176,0.581422,0.633229,0.716224,0.758735,0.663426,0.728528
2093,sub-88059397,6,HEALTHY,0.346646,-0.127832,-0.125848,0.358515,-0.131966,-0.130152,0.336782,...,0.784685,0.697352,0.745172,0.70732,0.668015,0.728186,0.752003,0.772532,0.708964,0.771122
590,sub-87974841,3,HEALTHY,0.399027,-0.156863,-0.14461,0.301474,-0.094987,-0.096058,0.25127,...,0.775076,0.764817,0.751841,0.756274,0.716758,0.741123,0.811943,0.777369,0.768977,0.776461


### EO features

In [30]:
# define numeric columns
numeric_cols = [num_col for num_col in df_stat_conn_features_eo.columns if 'EO' in num_col]
 
X = df_stat_conn_features_eo[numeric_cols]
Y = df_stat_conn_features_eo['diagnosis']

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=pd.NA, strategy='mean')
X = imp_mean.fit_transform(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

np.int = np.int32 # to avoid error message, because installed Boruta package via conda instead of pip
np.float = np.float64
np.bool = np.bool_

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=20)
trans = BorutaPy(clf, max_iter=100, random_state=42, verbose=1)
feature_selection = trans.fit_transform(X, Y)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [31]:
selected_columns = df_stat_conn_features_eo[numeric_cols].columns[trans.support_].tolist()  # get selected features
standard_columns = ['ID', 'epoch', 'diagnosis']  # columns to add
standard_columns.extend(selected_columns)  # add the columns
df_selected_stat_conn_feat_eo = df_stat_conn_features_eo[standard_columns]  # select the columns from the DataFrame
print(df_selected_stat_conn_feat_eo.shape)
df_selected_stat_conn_feat_eo.sample(5)

(2700, 276)


Unnamed: 0,ID,epoch,diagnosis,EO_l_frontal_theta_std,EO_l_frontal_theta_mean,EO_r_frontal_theta_std,EO_r_frontal_theta_mean,EO_r_frontal_theta_median,EO_l_central_theta_std,EO_l_central_theta_mean,...,EO_gamma_m_central-r_central,EO_gamma_m_central-l_posterior,EO_gamma_m_central-m_posterior,EO_gamma_m_central-r_posterior,EO_gamma_r_central-l_posterior,EO_gamma_r_central-m_posterior,EO_gamma_r_central-r_posterior,EO_gamma_l_posterior-m_posterior,EO_gamma_l_posterior-r_posterior,EO_gamma_m_posterior-r_posterior
2564,sub-88073521,9,ADHD,0.36904,-0.130609,0.333611,-0.103903,-0.108409,0.300973,-0.089251,...,0.926426,0.893795,0.9046,0.896193,0.882448,0.898397,0.906914,0.918846,0.907781,0.919347
44,sub-87966473,9,SMC,0.330724,-0.100442,0.298581,-0.095506,-0.097355,0.345872,-0.130806,...,0.77675,0.728496,0.754226,0.719271,0.702038,0.747659,0.781849,0.80568,0.743799,0.807446
97,sub-87967325,2,SMC,0.290488,-0.088545,0.324689,-0.110936,-0.106055,0.295522,-0.090128,...,0.857844,0.830164,0.842875,0.83896,0.792319,0.824158,0.865513,0.867315,0.842301,0.871122
302,sub-87969125,3,SMC,0.361798,-0.127621,0.361973,-0.130708,-0.133362,0.334124,-0.13081,...,0.757515,0.729445,0.759878,0.723909,0.668298,0.710833,0.75227,0.816734,0.733191,0.789267
1352,sub-88039057,9,MDD,0.287262,-0.090314,0.280382,-0.083377,-0.081038,0.288889,-0.096891,...,0.811852,0.78849,0.810385,0.761601,0.775973,0.805451,0.82195,0.821838,0.779955,0.811165


### Merge selected ratio, EC, EO features

In [32]:
# Drop the 'ID', 'epoch', 'diagnosis' columns from the second and third dataframes for concatenation
df_selected_stat_conn_feat_eo = df_selected_stat_conn_feat_eo.drop(['ID', 'epoch', 'diagnosis'], axis=1)
df_selected_stat_conn_feat_ratio = df_selected_stat_conn_feat_ratio.drop(['ID', 'epoch', 'diagnosis'], axis=1)

# Concatenate the dataframes
df_selected_stat_conn_features = pd.concat([df_selected_stat_conn_feat_ec, df_selected_stat_conn_feat_eo, df_selected_stat_conn_feat_ratio], axis=1)
df_selected_stat_conn_features

Unnamed: 0,ID,epoch,diagnosis,EC_r_central_delta_std,EC_r_central_delta_mean,EC_r_central_delta_median,EC_l_frontal_theta_std,EC_l_frontal_theta_mean,EC_l_frontal_theta_median,EC_m_frontal_theta_std,...,ratio_gamma_m_central-r_central,ratio_gamma_m_central-l_posterior,ratio_gamma_m_central-m_posterior,ratio_gamma_m_central-r_posterior,ratio_gamma_r_central-l_posterior,ratio_gamma_r_central-m_posterior,ratio_gamma_r_central-r_posterior,ratio_gamma_l_posterior-m_posterior,ratio_gamma_l_posterior-r_posterior,ratio_gamma_m_posterior-r_posterior
0,sub-87964717,1,SMC,0.396625,-0.174628,-0.177734,0.296711,-0.103983,-0.104952,0.316582,...,0.983375,0.982205,0.973501,0.957588,0.965101,0.975819,0.977929,0.973242,0.955600,0.985688
1,sub-87964717,2,SMC,0.394062,-0.158610,-0.156196,0.320765,-0.101048,-0.091661,0.319229,...,0.951962,0.999252,0.989583,0.969516,0.930350,0.938956,0.968311,0.982378,0.955147,0.965178
2,sub-87964717,3,SMC,0.410010,-0.196367,-0.187606,0.383631,-0.142807,-0.139421,0.369464,...,1.007210,1.003786,0.997969,0.981911,1.021233,1.027465,1.020506,0.977939,0.966586,0.985563
3,sub-87964717,4,SMC,0.469550,-0.230582,-0.225645,0.400738,-0.190151,-0.198076,0.349989,...,0.987823,1.029448,1.023404,1.011507,0.995531,1.025771,1.032580,0.990293,0.982638,1.006213
4,sub-87964717,5,SMC,0.355055,-0.131668,-0.134253,0.344149,-0.130853,-0.132630,0.345981,...,1.042865,1.021432,1.011774,1.023296,1.010551,1.047893,1.041591,1.019603,1.005809,1.024247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,sub-88076717,8,OCD,0.474489,-0.223597,-0.222273,0.398381,-0.153613,-0.153747,0.385733,...,1.050150,1.093657,1.076960,1.060231,1.153361,1.122004,1.055451,1.082843,1.109466,1.075791
2696,sub-88076717,9,OCD,0.351207,-0.151888,-0.148171,0.280391,-0.083200,-0.082427,0.269471,...,1.095957,1.144254,1.111357,1.115784,1.178204,1.139658,1.100500,1.109036,1.142876,1.095822
2697,sub-88076717,10,OCD,0.367805,-0.150474,-0.144268,0.394338,-0.156866,-0.157289,0.364809,...,1.059378,1.114762,1.086334,1.146659,1.183257,1.136850,1.136010,1.085056,1.136426,1.107047
2698,sub-88076717,11,OCD,0.420107,-0.182264,-0.170567,0.321075,-0.114733,-0.113885,0.355278,...,1.048840,1.054833,1.083075,1.104262,1.037404,1.049401,1.063298,1.047424,1.064664,1.070277


In [33]:
df_selected_stat_conn_features.to_pickle(r'D:\Documents\RU\Master_Neurobiology\Internship_jaar_2\Project\TD-BRAIN\TD-BRAIN_extracted_features\df_selected_stat_conn_features.pkl')