## IMPORTS

In [267]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

from scipy.io import loadmat

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [268]:
# file_prefix = "/kaggle/input/widsdatathon2025/"
file_prefix = "data/"

## LOAD CAT DATA

In [269]:
file_path_trainC = file_prefix + "TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx"
train_cat = pd.read_excel(file_path_trainC)
print(train_cat.shape)
train_cat.head()

(1213, 10)


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,00aIpNTbG5uh,2019,4,1.0,0.0,3.0,21.0,45.0,,
1,00fV0OyyoLfw,2017,1,0.0,9.0,2.0,21.0,0.0,21.0,45.0
2,04X1eiS79T4B,2017,1,1.0,2.0,2.0,9.0,0.0,,
3,05ocQutkURd6,2018,1,3.0,8.0,2.0,18.0,10.0,18.0,0.0
4,06YUNBA9ZRLq,2018,1,0.0,1.0,2.0,12.0,0.0,,


In [270]:
file_path_testC = file_prefix + "TEST/TEST_CATEGORICAL.xlsx"
test_cat = pd.read_excel(file_path_testC)
test_cat.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,Cfwaf5FX7jWK,2022,4,0.0,0.0,4,21.0,30.0,18.0,30.0
1,vhGrzmvA3Hjq,2023,4,0.0,0.0,4,21.0,45.0,,30.0
2,ULliyEXjy4OV,2022,4,0.0,0.0,4,21.0,40.0,18.0,40.0
3,LZfeAb1xMtql,2022,4,0.0,0.0,3,21.0,45.0,21.0,45.0
4,EnFOUv0YK1RG,2022,4,2.0,0.0,4,18.0,0.0,21.0,45.0


In [271]:
file_path_trainS = file_prefix + "TRAIN/TRAINING_SOLUTIONS.xlsx"
train_Solutions = pd.read_excel(file_path_trainS)
print(train_Solutions.shape)
train_Solutions.head()

(1213, 3)


Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


## PREPROCESS CAT DATA:
`cat_train_final` and `cat_test_final`

In [272]:
train_cat.dtypes

participant_id                       object
Basic_Demos_Enroll_Year               int64
Basic_Demos_Study_Site                int64
PreInt_Demos_Fam_Child_Ethnicity    float64
PreInt_Demos_Fam_Child_Race         float64
MRI_Track_Scan_Location             float64
Barratt_Barratt_P1_Edu              float64
Barratt_Barratt_P1_Occ              float64
Barratt_Barratt_P2_Edu              float64
Barratt_Barratt_P2_Occ              float64
dtype: object

In [273]:
train_cat = train_cat.drop(columns=['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'MRI_Track_Scan_Location'])

In [274]:
train_cat.dtypes

participant_id                       object
PreInt_Demos_Fam_Child_Ethnicity    float64
PreInt_Demos_Fam_Child_Race         float64
Barratt_Barratt_P1_Edu              float64
Barratt_Barratt_P1_Occ              float64
Barratt_Barratt_P2_Edu              float64
Barratt_Barratt_P2_Occ              float64
dtype: object

In [275]:
train_cat.isnull().sum()

participant_id                        0
PreInt_Demos_Fam_Child_Ethnicity     43
PreInt_Demos_Fam_Child_Race          54
Barratt_Barratt_P1_Edu               15
Barratt_Barratt_P1_Occ               31
Barratt_Barratt_P2_Edu              198
Barratt_Barratt_P2_Occ              222
dtype: int64

In [276]:
# train_cat.fillna({'Barratt_Barratt_P1_Edu':train_cat['Barratt_Barratt_P1_Edu'].median()}, inplace = True)
# train_cat.fillna({'Barratt_Barratt_P2_Edu':train_cat['Barratt_Barratt_P2_Edu'].median()}, inplace = True)

In [277]:
for col in train_cat.select_dtypes(include='float').columns:
    train_cat[col] = train_cat[col].astype('category')
# Creating a list of all of the columns except the first
columns_to_encode = train_cat.columns[1:].tolist()
# columns_to_encode.remove('Barratt_Barratt_P1_Edu')
# columns_to_encode.remove('Barratt_Barratt_P2_Edu')

# Print the columns to encode
print("Columns to encode:", columns_to_encode)
# encoding categorical data
train_encoded = pd.get_dummies(train_cat[columns_to_encode], dummy_na=True, drop_first=True)
train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

# ethnicity_one_hot = pd.get_dummies(train_cat['PreInt_Demos_Fam_Child_Ethnicity'], prefix="PreInt_Demos_Fam_Child_Ethnicity", dummy_na=True)
# ethnicity_one_hot = ethnicity_one_hot.applymap(lambda x: 1 if x is True else (0 if x is False else x))
# ethnicity_one_hot = ethnicity_one_hot.rename(columns=lambda x: x.rstrip('.0'))
# ethnicity_one_hot = ethnicity_one_hot.rename(columns={"PreInt_Demos_Fam_Child_Ethnicity_": "PreInt_Demos_Fam_Child_Ethnicity_0"})

cat_train_final = pd.concat([train_cat.drop(columns=columns_to_encode), train_encoded], axis=1)
# cat_train_final = pd.concat([cat_train_final, ethnicity_one_hot], axis=1)


Columns to encode: ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']


  train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [278]:
cat_train_final.columns

Index(['participant_id', 'PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan',
       'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0',
       'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0',
       'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0',
       'PreInt_Demos_Fam_Child_Race_9.0', 'PreInt_Demos_Fam_Child_Race_10.0',
       'PreInt_Demos_Fam_Child_Race_11.0', 'PreInt_Demos_Fam_Child_Race_nan',
       'Barratt_Barratt_P1_Edu_6.0', 'Barratt_Barratt_P1_Edu_9.0',
       'Barratt_Barratt_P1_Edu_12.0', 'Barratt_Barratt_P1_Edu_15.0',
       'Barratt_Barratt_P1_Edu_18.0', 'Barratt_Barratt_P1_Edu_21.0',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5.0',
       'Barratt_Barratt_P1_Occ_10.0', 'Barratt_Barratt_P1_Occ_15.0',
       'Barratt_Barratt_P1_Occ_20.0', 'Barratt_Barratt_P1_Occ_25.0

In [279]:
cat_train_final.isnull().sum().sum()

np.int64(0)

In [280]:
test_cat.dtypes

participant_id                       object
Basic_Demos_Enroll_Year               int64
Basic_Demos_Study_Site                int64
PreInt_Demos_Fam_Child_Ethnicity    float64
PreInt_Demos_Fam_Child_Race         float64
MRI_Track_Scan_Location               int64
Barratt_Barratt_P1_Edu              float64
Barratt_Barratt_P1_Occ              float64
Barratt_Barratt_P2_Edu              float64
Barratt_Barratt_P2_Occ              float64
dtype: object

In [281]:
test_cat = test_cat.drop(columns=['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'MRI_Track_Scan_Location'])

In [282]:
# test_cat.fillna({'Barratt_Barratt_P1_Edu':test_cat['Barratt_Barratt_P1_Edu'].median()}, inplace = True)
# test_cat.fillna({'Barratt_Barratt_P2_Edu':test_cat['Barratt_Barratt_P2_Edu'].median()}, inplace = True)

In [283]:
test_cat.isnull().sum() 

participant_id                       0
PreInt_Demos_Fam_Child_Ethnicity     3
PreInt_Demos_Fam_Child_Race          6
Barratt_Barratt_P1_Edu               1
Barratt_Barratt_P1_Occ               1
Barratt_Barratt_P2_Edu              36
Barratt_Barratt_P2_Occ              42
dtype: int64

In [284]:
# convert our float variables to categories
for col in test_cat.select_dtypes(include='float').columns:
    test_cat[col] = test_cat[col].astype('category')

# Creating a list of all of the columns except the first
columns_to_encode = test_cat.columns[1:].tolist()
# columns_to_encode.remove('Barratt_Barratt_P1_Edu')
# columns_to_encode.remove('Barratt_Barratt_P2_Edu')

# Print the columns to encode
print("Columns to encode:", columns_to_encode)
# encoding categorical data
test_encoded = pd.get_dummies(test_cat[columns_to_encode], dummy_na=True, drop_first=True)
test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

cat_test_final = pd.concat([test_cat.drop(columns=columns_to_encode), test_encoded], axis=1)
# cat_train_final = pd.concat([cat_train_final, ethnicity_one_hot], axis=1)


# # Encode categorical variables in test
# test_encoded = pd.get_dummies(test_cat[columns_to_encode], dummy_na=True, drop_first=True)
# test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))
# test_encoded = test_encoded.rename(columns=lambda x: x.rstrip('.0'))
# cat_train_final_cols = cat_train_final.columns.tolist()
# cat_train_final_cols.remove('participant_id')

# missing_cols = set(cat_train_final_cols) - set(test_encoded.columns)
# print(len(missing_cols), "MISSING COLS")
# print(missing_cols)
# for col in missing_cols:
#     if col in test_encoded.columns:
#         print("COL IN TEST ENCODED")
#         print(col)
#     else:
#         test_encoded[col] = 0
# # Ensure test_encoded columns are in the same order as train_encoded
# test_encoded = test_encoded.reindex(columns=cat_train_final_cols, fill_value=0)

# # Combine encoded columns with the rest of the DataFrame
# cat_test_final = pd.concat([test_cat.drop(columns=columns_to_encode), test_encoded], axis=1)

Columns to encode: ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']


  test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [285]:
cat_test_final.columns

Index(['participant_id', 'PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan',
       'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0',
       'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0',
       'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0',
       'PreInt_Demos_Fam_Child_Race_9.0', 'PreInt_Demos_Fam_Child_Race_11.0',
       'PreInt_Demos_Fam_Child_Race_nan', 'Barratt_Barratt_P1_Edu_6.0',
       'Barratt_Barratt_P1_Edu_9.0', 'Barratt_Barratt_P1_Edu_12.0',
       'Barratt_Barratt_P1_Edu_15.0', 'Barratt_Barratt_P1_Edu_18.0',
       'Barratt_Barratt_P1_Edu_21.0', 'Barratt_Barratt_P1_Edu_nan',
       'Barratt_Barratt_P1_Occ_5.0', 'Barratt_Barratt_P1_Occ_15.0',
       'Barratt_Barratt_P1_Occ_20.0', 'Barratt_Barratt_P1_Occ_25.0',
       'Barratt_Barratt_P1_Occ_30.0', 'Barratt_Barratt_P1_Occ_35.0',
  

In [286]:
cat_test_final

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Ethnicity_nan,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,PreInt_Demos_Fam_Child_Race_3.0,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Race_7.0,...,Barratt_Barratt_P2_Occ_5.0,Barratt_Barratt_P2_Occ_10.0,Barratt_Barratt_P2_Occ_15.0,Barratt_Barratt_P2_Occ_20.0,Barratt_Barratt_P2_Occ_25.0,Barratt_Barratt_P2_Occ_30.0,Barratt_Barratt_P2_Occ_35.0,Barratt_Barratt_P2_Occ_40.0,Barratt_Barratt_P2_Occ_45.0,Barratt_Barratt_P2_Occ_nan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,EnFOUv0YK1RG,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,UadZfjdEg7eG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
300,IUEHiLmQAqCi,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
301,cRySmCadYFRO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
302,E3MvDUtJadc5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [287]:
cat_train_final.isnull().sum()

participant_id                          0
PreInt_Demos_Fam_Child_Ethnicity_1.0    0
PreInt_Demos_Fam_Child_Ethnicity_2.0    0
PreInt_Demos_Fam_Child_Ethnicity_3.0    0
PreInt_Demos_Fam_Child_Ethnicity_nan    0
PreInt_Demos_Fam_Child_Race_1.0         0
PreInt_Demos_Fam_Child_Race_2.0         0
PreInt_Demos_Fam_Child_Race_3.0         0
PreInt_Demos_Fam_Child_Race_4.0         0
PreInt_Demos_Fam_Child_Race_7.0         0
PreInt_Demos_Fam_Child_Race_8.0         0
PreInt_Demos_Fam_Child_Race_9.0         0
PreInt_Demos_Fam_Child_Race_10.0        0
PreInt_Demos_Fam_Child_Race_11.0        0
PreInt_Demos_Fam_Child_Race_nan         0
Barratt_Barratt_P1_Edu_6.0              0
Barratt_Barratt_P1_Edu_9.0              0
Barratt_Barratt_P1_Edu_12.0             0
Barratt_Barratt_P1_Edu_15.0             0
Barratt_Barratt_P1_Edu_18.0             0
Barratt_Barratt_P1_Edu_21.0             0
Barratt_Barratt_P1_Edu_nan              0
Barratt_Barratt_P1_Occ_5.0              0
Barratt_Barratt_P1_Occ_10.0       

In [288]:
cat_test_final.isnull().sum().sum()

np.int64(0)

## LOAD QUANT DATA

In [289]:
file_path_trainQ = file_prefix + "TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx"
file_path_testQ = file_prefix + "TEST/TEST_QUANTITATIVE_METADATA.xlsx"
quant_train_df = pd.read_excel(file_path_trainQ)
quant_test_df = pd.read_excel(file_path_testQ)

## PREPROCESS QUANT DATA:
`quant_train_df` and `quant_test_df`

In [290]:
quant_train_df.fillna({'MRI_Track_Age_at_Scan':quant_train_df['MRI_Track_Age_at_Scan'].median()}, inplace = True)

In [291]:
quant_train_df.isnull().sum()

participant_id                 0
EHQ_EHQ_Total                 13
ColorVision_CV_Score          23
APQ_P_APQ_P_CP                12
APQ_P_APQ_P_ID                12
APQ_P_APQ_P_INV               12
APQ_P_APQ_P_OPD               12
APQ_P_APQ_P_PM                12
APQ_P_APQ_P_PP                12
SDQ_SDQ_Conduct_Problems       9
SDQ_SDQ_Difficulties_Total     9
SDQ_SDQ_Emotional_Problems     9
SDQ_SDQ_Externalizing          9
SDQ_SDQ_Generating_Impact      9
SDQ_SDQ_Hyperactivity          9
SDQ_SDQ_Internalizing          9
SDQ_SDQ_Peer_Problems          9
SDQ_SDQ_Prosocial              9
MRI_Track_Age_at_Scan          0
dtype: int64

In [292]:
null_columns_to_fill = [
    "EHQ_EHQ_Total",
    "ColorVision_CV_Score",
    "APQ_P_APQ_P_CP",
    "APQ_P_APQ_P_ID",
    "APQ_P_APQ_P_INV",
    "APQ_P_APQ_P_OPD",
    "APQ_P_APQ_P_PM",
    "APQ_P_APQ_P_PP",
    "SDQ_SDQ_Conduct_Problems",
    "SDQ_SDQ_Difficulties_Total",
    "SDQ_SDQ_Emotional_Problems",
    "SDQ_SDQ_Externalizing",
    "SDQ_SDQ_Generating_Impact",
    "SDQ_SDQ_Hyperactivity",
    "SDQ_SDQ_Internalizing",
    "SDQ_SDQ_Peer_Problems",
    "SDQ_SDQ_Prosocial",
]

for column in null_columns_to_fill:
    quant_train_df[column].fillna(quant_train_df[column].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  quant_train_df[column].fillna(quant_train_df[column].mean(), inplace=True)


In [293]:
assert quant_train_df.isnull().sum().sum() == 0

In [294]:
knn_columns = quant_test_df.columns[quant_test_df.isna().any()].tolist()

imputer = KNNImputer(n_neighbors=5)

# Apply imputation to selected columns
quant_test_df[knn_columns] = imputer.fit_transform(quant_test_df[knn_columns])

# Verify missing values are handled
print(quant_test_df.isnull().sum())

participant_id                0
EHQ_EHQ_Total                 0
ColorVision_CV_Score          0
APQ_P_APQ_P_CP                0
APQ_P_APQ_P_ID                0
APQ_P_APQ_P_INV               0
APQ_P_APQ_P_OPD               0
APQ_P_APQ_P_PM                0
APQ_P_APQ_P_PP                0
SDQ_SDQ_Conduct_Problems      0
SDQ_SDQ_Difficulties_Total    0
SDQ_SDQ_Emotional_Problems    0
SDQ_SDQ_Externalizing         0
SDQ_SDQ_Generating_Impact     0
SDQ_SDQ_Hyperactivity         0
SDQ_SDQ_Internalizing         0
SDQ_SDQ_Peer_Problems         0
SDQ_SDQ_Prosocial             0
MRI_Track_Age_at_Scan         0
dtype: int64


## LOAD FCM DATA AND SOLUTIONS

In [295]:
train_fcm_df = pd.read_csv(file_prefix + 'TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv')
test_fcm_df = pd.read_csv(file_prefix + 'TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')

train_Solutions = pd.read_excel(file_prefix + 'TRAIN/TRAINING_SOLUTIONS.xlsx')

In [296]:
train_fcm_df.isnull().sum().sum()

np.int64(0)

## MERGE DATASETS

In [297]:
cat_train_final.isnull().sum().sum()

np.int64(0)

In [298]:
train_fcm_df.isnull().sum().sum()

np.int64(0)

In [299]:
quant_train_df.isnull().sum().sum()

np.int64(0)

In [300]:
train_cat_FCM = pd.merge(cat_train_final, train_fcm_df, on = 'participant_id')
train_df = pd.merge(train_cat_FCM, quant_train_df, on = 'participant_id')
train_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Ethnicity_nan,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,PreInt_Demos_Fam_Child_Race_3.0,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Race_7.0,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,00aIpNTbG5uh,1,0,0,0,0,0,0,0,0,...,3.0,17.0,4.0,11.0,5.0,8.0,6.0,2.0,9.0,14.274127
1,00fV0OyyoLfw,0,0,0,0,0,0,0,0,0,...,5.0,20.0,4.0,13.0,5.0,8.0,7.0,3.0,8.0,10.739219
2,04X1eiS79T4B,1,0,0,0,0,1,0,0,0,...,3.0,24.0,7.0,10.0,10.0,7.0,14.0,7.0,7.0,13.463381
3,05ocQutkURd6,0,0,1,0,0,0,0,0,0,...,0.0,5.0,0.0,3.0,0.0,3.0,2.0,2.0,6.0,9.572553
4,06YUNBA9ZRLq,0,0,0,0,1,0,0,0,0,...,6.0,23.0,7.0,15.0,8.0,9.0,8.0,1.0,4.0,6.654574


In [301]:
train_df.isnull().sum().sum()

np.int64(0)

In [302]:
test_cat_FCM = pd.merge(cat_test_final, test_fcm_df, on = 'participant_id')
test_df = pd.merge(test_cat_FCM, quant_test_df, on = 'participant_id')
test_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Ethnicity_nan,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,PreInt_Demos_Fam_Child_Race_3.0,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Race_7.0,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,0,1,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [303]:
X_train = train_df.drop(columns = ['participant_id'])
Y_train = train_Solutions.drop(columns = ['participant_id'])
participant_id = test_df['participant_id']
X_test = test_df.drop(columns = 'participant_id')

In [304]:
X_train.isnull().sum().sum()

np.int64(0)

## FEATURE SELECTION WITH LOGISTIC REGRESSION

In [305]:
# model = LogisticRegression(penalty='l1', solver='liblinear')
# model.fit(train_df.drop(columns='participant_id'), train_Solutions['Sex_F'])
# selected_features_Sex = train_df.drop(columns='participant_id').columns[model.coef_[0] != 0]
# print(selected_features_Sex)

In [306]:
# model = LogisticRegression(penalty='l1', solver='liblinear')
# model.fit(train_df.drop(columns='participant_id'), train_Solutions['ADHD_Outcome'])
# selected_features_ADHD = train_df.drop(columns='participant_id').columns[model.coef_[0] != 0]
# print(selected_features_ADHD)

In [307]:
# common_features = list(set(selected_features_ADHD) & set(selected_features_Sex))
# X_train_2 = X_train[common_features]
# X_test_2 = X_test[common_features]

In [308]:
Y_train.columns

Index(['ADHD_Outcome', 'Sex_F'], dtype='object')

In [309]:
X_train.columns

Index(['PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan',
       'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0',
       'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0',
       'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0',
       ...
       'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
       'MRI_Track_Age_at_Scan'],
      dtype='object', length=19966)

# TEST MODELING

In [310]:
# Sample 1100 randomw rows from X_train
random_indices = np.random.choice(X_train.index, 1100, replace=False)
X_train_temp = X_train.loc[random_indices]
Y_train_temp = Y_train.loc[random_indices]

# get all the rows in X_train that arent in X_train_temp
X_test_temp = X_train.drop(X_train_temp.index)
Y_test_temp = Y_train.drop(Y_train_temp.index)

scaler = StandardScaler()
X_train_temp_scaled = scaler.fit_transform(X_train_temp)
X_test_temp_scaled = scaler.fit_transform(X_test_temp)

In [311]:
X_test_temp.isnull().sum().sum()

np.int64(0)

In [312]:
X_train_temp.isnull().sum().sum()

np.int64(0)

**Parameter definitions:**
1. C - Used to prevent underfitting/overfitting
    - The default, which works well mostly, is C = 1
    - Small C prevents overfitting
    - Large C prevents underfitting
    - Going too high with C can cause overfitting while going too low causes underfitting
2. Solver - Specifies the optimization algorithm used to minimize the loss function (there are many solvers, each used in different cases depending on what the dataset looks like)
    - lbfgs (default) -> Good for small to medium data sets, doesn't support l1
    - liblinear -> Good for small, sparse (many zeros/missing data) datasets
    - saga -> Good for large datasets
    - etc (there are more, but they all deal with large datasets)
    - Since our dataset is very small, we can test the first 2
3. Penalty - Adds a constraint on the model to prevent overfitting
    - l2 (default)
    - l1 -> Used when feature selection occured (essentially when there's many zeros)

### Not Scaled - Sex

In [314]:
# Grid Search for Opimizied Parameters
model = LogisticRegression()  # Increased iterations for convergence

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs'],  # Optimization solvers
    'penalty': ['l2'],  # Regularization type (l1 only works with liblinear)
    'max_iter': [200, 400, 800]  # Maximum number of iterations
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp, Y_train_temp["Sex_F"])

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy Score: 0.6272727272727272


### Scaled - Sex

In [315]:
# Grid Search for Opimizied Parameters
model = LogisticRegression()

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization solvers
    'penalty': ['l2'],  # Regularization type (l1 only works with liblinear)
    'max_iter': [200, 400, 800]  # Maximum number of iterations
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp_scaled, Y_train_temp["Sex_F"]) # Testing with scaled data

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)



Best Parameters: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Accuracy Score: 0.5809090909090909


In [318]:
X_test.columns

Index(['PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan',
       'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0',
       'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0',
       'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0',
       ...
       'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
       'MRI_Track_Age_at_Scan'],
      dtype='object', length=19964)

In [265]:
sex_f1_score = f1_score(Y_test_temp["Sex_F"], Sex_pred)
sex_f1_score

0.40860215053763443

### ADHD - Not Scaled

In [322]:
# Grid Search for Opimizied Parameters
model = LogisticRegression()  # Increased iterations for convergence

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization solvers
    'penalty': ['l2'],  # Regularization type (l1 only works with liblinear)
    'max_iter': [200, 400, 800]  # Maximum number of iterations
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp, Y_train_temp["ADHD_Outcome"])

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy Score: 0.6372727272727272


### ADHD - Scaled

In [323]:
# Grid Search for Opimizied Parameters
model = LogisticRegression()  # Increased iterations for convergence

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization solvers
    'penalty': ['l2'],  # Regularization type (l1 only works with liblinear)
    'max_iter': [200, 400, 800]  # Maximum number of iterations
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp_scaled, Y_train_temp["ADHD_Outcome"])

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)



Best Parameters: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Accuracy Score: 0.6063636363636364


Best Parameters: {'C': 0.01, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}

In [325]:
# X_test['Barratt_Barratt_P1_Occ_10.0'] = 0
# X_test['PreInt_Demos_Fam_Child_Race_10.0'] = 0

In [333]:
occ_10_index = X_test.columns.get_loc('Barratt_Barratt_P1_Occ_15.0')
race_10_index = X_test.columns.get_loc('PreInt_Demos_Fam_Child_Race_11.0')

In [334]:
occ_10_index

21

In [335]:
race_10_index

11

In [338]:
X_test

Unnamed: 0,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Ethnicity_nan,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,PreInt_Demos_Fam_Child_Race_3.0,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Race_7.0,PreInt_Demos_Fam_Child_Race_8.0,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,0,0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,0,0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,0,0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,0,0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,0,1,0,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.261350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,0,0,0,0,0,0,0,0,0,0,...,1.0,18.0,7.0,7.0,5.0,6.0,11.0,4.0,7.0,7.546999
300,0,0,0,0,1,0,0,0,0,0,...,2.0,16.0,2.0,11.0,5.0,9.0,5.0,3.0,8.0,10.531143
301,0,0,0,0,0,0,0,0,0,1,...,1.0,11.0,4.0,4.0,4.0,3.0,7.0,3.0,10.0,7.210586
302,0,0,0,0,0,0,0,0,0,1,...,5.0,21.0,2.0,10.0,6.0,5.0,11.0,9.0,0.0,12.212183


In [339]:
new_columns = X_test.columns.tolist()
new_columns.insert(occ_10_index, 'Barratt_Barratt_P1_Occ_10.0')
new_columns.insert(race_10_index, 'PreInt_Demos_Fam_Child_Race_10.0')
X_test = X_test.reindex(columns=new_columns, fill_value=0)

In [340]:
X_test.head()

Unnamed: 0,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Ethnicity_nan,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,PreInt_Demos_Fam_Child_Race_3.0,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Race_7.0,PreInt_Demos_Fam_Child_Race_8.0,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,0,0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,0,0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,0,0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,0,0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,0,1,0,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [341]:
model_Sex = LogisticRegression(C = 0.01, penalty='l2', max_iter=200, solver = 'liblinear')
model_Sex.fit(X_train, Y_train["Sex_F"])
Sex_pred = model_Sex.predict(X_test)

Best Parameters: {'C': 0.01, 'penalty': 'l2', max_iter = 100, 'solver': 'saga'}

In [342]:
model_ADHD = LogisticRegression(C = 0.01, penalty='l2', max_iter=100, solver = 'saga')
model_ADHD.fit(X_train, Y_train["ADHD_Outcome"])
ADHD_pred = model_ADHD.predict(X_test)



In [343]:
# ADHD_accuracy = accuracy_score(Y_test_temp["ADHD_Outcome"], ADHD_pred)
# Sex_accuracy = accuracy_score(Y_test_temp["Sex_F"], Sex_pred)

# print("ADHD Accuracy: ", ADHD_accuracy)
# print("Sex Accuracy: ", Sex_accuracy)

ValueError: Found input variables with inconsistent numbers of samples: [113, 304]

In [None]:
# Generating F1 scores

ADHD_f1 = f1_score(Y_test_temp["ADHD_Outcome"], ADHD_pred)
Sex_f1 = f1_score(Y_test_temp["Sex_F"], Sex_pred)

print("ADHD F1 Score: ", ADHD_f1)
print("Sex F1 Score: ", Sex_f1)

ADHD F1 Score:  0.8765432098765432
Sex F1 Score:  0.6458333333333334


### Accuracy With Scaling:

ADHD Accuracy:  0.3893805309734513

Sex Accuracy:  0.7699115044247787


### Accuracy Without Scaling:

ADHD Accuracy:  0.6017699115044248

Sex Accuracy:  0.6460176991150443

## MODELING

In [None]:
test_df_numeric = test_df.select_dtypes(include=['number'])
test_df_scaled = scaler.fit_transform(test_df_numeric)

ADHD_pred = model_ADHD.predict(test_df_numeric)
Sex_pred = model_Sex.predict(test_df_scaled)



In [344]:
final = pd.concat([test_df["participant_id"], pd.Series(ADHD_pred), pd.Series(Sex_pred)], axis = 1)
final.columns = ["participant_id", "ADHD_Outcome", "Sex_F"]
final

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,Cfwaf5FX7jWK,1,0
1,vhGrzmvA3Hjq,1,1
2,ULliyEXjy4OV,1,0
3,LZfeAb1xMtql,1,1
4,EnFOUv0YK1RG,1,0
...,...,...,...
299,UadZfjdEg7eG,1,0
300,IUEHiLmQAqCi,1,0
301,cRySmCadYFRO,1,1
302,E3MvDUtJadc5,1,0


In [345]:
final.to_csv("results.csv", index = False)