## IMPORTS

In [59]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from scipy.io import loadmat

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer, accuracy_score

In [60]:
def multi_output_accuracy(y_true, y_pred):
    # Ensure y_true and y_pred are NumPy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # Compute accuracy for each target variable and return the mean
    return np.mean([accuracy_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])

In [61]:
multi_output_scorer = make_scorer(multi_output_accuracy)

## LOAD DATASETS

In [62]:
train_cat_df = pd.read_csv('data/TRAIN/PREPROCESSING_TRAIN_CATEGORICAL.csv')
test_cat_df = pd.read_csv('data/TEST/PREPROCESSING_TEST_CATEGORICAL.csv')

train_quant_df = pd.read_csv('data/TRAIN/PREPROCESSED_QUANT_TRAIN_DATA.csv')
test_quant_df = pd.read_csv('data/TEST/PREPROCESSED_QUANT_TEST_DATA.csv')

train_fcm_df = pd.read_csv('data/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv')
test_fcm_df = pd.read_csv('data/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')

train_Solutions = pd.read_excel('data/TRAIN/TRAINING_SOLUTIONS.xlsx')

## MERGE DATASETS

In [63]:
train_cat_FCM = pd.merge(train_cat_df, train_fcm_df, on = 'participant_id')
train_df = pd.merge(train_cat_FCM, train_quant_df, on = 'participant_id')
train_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,0,0,0,0,0,0,0,0,0,...,0,6,1,5,0,5,1,0,10,10.739219
1,CPaeQkhcjg7d,0,1,0,0,0,0,0,0,0,...,0,18,6,8,7,8,10,4,5,10.739219
2,Nb4EetVPm3gs,0,0,0,0,0,1,0,0,0,...,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,0,0,0,0,0,1,0,0,0,...,6,24,4,16,9,10,8,4,6,10.739219
4,M09PXs7arQ5E,1,0,0,0,0,0,0,0,0,...,1,18,4,11,4,10,7,3,9,8.940679


In [64]:
test_cat_FCM = pd.merge(test_cat_df, test_fcm_df, on = 'participant_id')
test_df = pd.merge(test_cat_FCM, test_quant_df, on = 'participant_id')
test_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,0,0,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [65]:
X_train = train_df.drop(columns = ['participant_id'])
Y_train = train_Solutions.drop(columns = ['participant_id'])
participant_id = test_df['participant_id']
X_test = test_df.drop(columns = 'participant_id')

## FEATURE SELECTION WITH LOGISTIC REGRESSION

In [66]:
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(train_df.drop(columns='participant_id'), train_Solutions['Sex_F'])
selected_features_Sex = train_df.drop(columns='participant_id').columns[model.coef_[0] != 0]
print(selected_features_Sex)

Index(['PreInt_Demos_Fam_Child_Race_1', 'PreInt_Demos_Fam_Child_Race_9',
       'Barratt_Barratt_P1_Edu_9', 'Barratt_Barratt_P1_Edu_18',
       'Barratt_Barratt_P1_Edu_21', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25',
       'Barratt_Barratt_P1_Occ_35', 'Barratt_Barratt_P1_Occ_40',
       ...
       'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan'],
      dtype='object', length=390)


In [67]:
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(train_df.drop(columns='participant_id'), train_Solutions['ADHD_Outcome'])
selected_features_ADHD = train_df.drop(columns='participant_id').columns[model.coef_[0] != 0]
print(selected_features_ADHD)

Index(['PreInt_Demos_Fam_Child_Race_1', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_8', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_18',
       'Barratt_Barratt_P1_Edu_21', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25',
       ...
       'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan'],
      dtype='object', length=278)


In [68]:
common_features = list(set(selected_features_ADHD) & set(selected_features_Sex))

In [69]:
X_train_2 = X_train[common_features]
X_test_2 = X_test[common_features]

## MODELING

In [70]:
xgb_classifier = XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=5)
multioutput_classifier = MultiOutputClassifier(xgb_classifier)
multioutput_classifier.fit(X_train_2, Y_train)
y_pred_2 = multioutput_classifier.predict(X_test_2)

In [71]:
predictions_df_2 = pd.DataFrame(
    y_pred_2,
    columns=['ADHD_Outcome', 'Sex_F']
)

# Combine participant IDs with predictions
result_df_2 = pd.concat([participant_id.reset_index(drop=True), predictions_df_2], axis=1)
result_df_2.head()

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,Cfwaf5FX7jWK,1,0
1,vhGrzmvA3Hjq,1,0
2,ULliyEXjy4OV,1,0
3,LZfeAb1xMtql,1,0
4,EnFOUv0YK1RG,1,0


In [72]:
cv_scores_2 = cross_val_score(multioutput_classifier, X_train_2, Y_train, cv=5, scoring=multi_output_scorer)

# Output the cross-validation results
print("Cross-validation scores for each fold:", cv_scores_2)
print("Mean CV score:", np.mean(cv_scores_2))

Cross-validation scores for each fold: [0.79012346 0.77572016 0.70576132 0.64876033 0.44008264]
Mean CV score: 0.6720895826956432


## SAVE RESULTS TO CSV

In [73]:
# save result_df_2 to csv
result_df_2.to_csv('data/RESULTS/RESULTS.csv', index=False)