In [12]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score


# Folders holding the train and test splits, relative to the working directory.
train_path = "data/TRAIN"
test_path = "data/TEST"

# Read each training table. The connectome matrices ship as CSV; the other
# three tables are Excel workbooks.
train_metadata = pd.read_excel(
    os.path.join(train_path, "TRAIN_QUANTITATIVE_METADATA.xlsx")
)
train_categorical = pd.read_excel(
    os.path.join(train_path, "TRAIN_CATEGORICAL_METADATA.xlsx")
)
train_functional = pd.read_csv(
    os.path.join(train_path, "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv")
)
train_solution = pd.read_excel(
    os.path.join(train_path, "TRAINING_SOLUTIONS.xlsx")
)

# Join everything on participant_id, one table at a time. merge() defaults to
# an inner join, so only participants present in every table are kept.
train = train_metadata
for extra_table in (train_categorical, train_functional, train_solution):
    train = train.merge(extra_table, on="participant_id")

In [15]:
# Preview the first rows of the merged training table.
train.head()
# test.head()  -- disabled: `test` is only created in a later cell

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,...,-0.058396,-0.041544,0.142806,-0.006377,0.108005,0.148327,0.09323,-0.004984,1,1
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,...,-0.025624,-0.031863,0.162011,0.067439,0.017155,0.088893,0.064094,0.194381,1,0
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,...,0.010771,-0.044341,0.128386,0.047282,0.087678,0.146221,-0.009425,0.03515,1,0
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,...,-0.007152,0.032584,0.121726,0.045089,0.154464,0.106817,0.065336,0.234708,1,1
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,...,-0.010196,0.035638,0.074978,0.030579,0.02564,0.118199,0.112522,0.143666,1,1


In [21]:
# The two labels to predict; every remaining column except the participant id
# is used as a feature.
target_columns = ["ADHD_Outcome", "Sex_F"]
y = train[target_columns]
X = train.drop(columns=target_columns + ["participant_id"])

# Replace missing values with each column's median.
# NOTE(review): the medians and the scaling statistics below are computed on
# all rows, including the rows the later train_test_split cell holds out for
# validation. Consider fitting them on the training split only.
X = X.fillna(X.median())

# Standardise every feature; `scaler` is kept so the same transformation can be
# applied to the test set later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [24]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# One random forest per target column, wrapped so both targets are fitted and
# predicted through a single estimator.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
multi_output_model = MultiOutputClassifier(rf).fit(X_train, y_train)

# Predictions come back as one column per target, in the column order of `y`
# (ADHD_Outcome first, Sex_F second).
y_pred = multi_output_model.predict(X_val)
adhd_pred, sex_pred = y_pred.T

adhd_acc = accuracy_score(y_val["ADHD_Outcome"], adhd_pred)
sex_acc = accuracy_score(y_val["Sex_F"], sex_pred)

print(f"ADHD Prediction Accuracy: {adhd_acc:.4f}")
print(f"Sex Prediction Accuracy: {sex_acc:.4f}")


ADHD Prediction Accuracy: 0.7366
Sex Prediction Accuracy: 0.7037


In [None]:
# Read the test tables. As with the training data, the connectome matrices are
# CSV and the metadata tables are Excel workbooks.
test_metadata = pd.read_excel(
    os.path.join(test_path, "TEST_QUANTITATIVE_METADATA.xlsx")
)
test_categorical = pd.read_excel(
    os.path.join(test_path, "TEST_CATEGORICAL.xlsx")
)
test_functional = pd.read_csv(
    os.path.join(test_path, "TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")
)

# Inner-join the tables on participant_id (merge's default), one at a time.
test = test_metadata
for extra_table in (test_categorical, test_functional):
    test = test.merge(extra_table, on="participant_id")

In [None]:
# Keep the ids for the submission file. `test` itself is left unmodified so
# this cell can be re-run without a KeyError from an already-dropped column.
participant_ids = test["participant_id"]

# Select exactly the feature columns the scaler and model were fitted on, in
# the same order. This also leaves out participant_id, and raises a KeyError
# up front if the test set is missing any training feature.
test_features = test[X.columns]

# Impute with the medians of the training features rather than the test set's
# own medians, so test rows are preprocessed the same way as the training rows.
test_features = test_features.fillna(X.median())
test_scaled = scaler.transform(test_features)

# Column 0 is ADHD_Outcome and column 1 is Sex_F, matching the column order of
# the targets the model was fitted with.
test_predictions = multi_output_model.predict(test_scaled)

submission = pd.DataFrame({
    "participant_id": participant_ids,
    "ADHD_Outcome": test_predictions[:, 0],
    "Sex_F": test_predictions[:, 1],
})

submission.to_csv("submission.csv", index=False)
