## IMPORTS

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

from scipy.io import loadmat

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Root folder of the competition data. Every dataset path below is built by
# plain string concatenation onto this prefix, so it must end with "/".
# Kaggle-hosted alternative, kept for switching environments:
# file_prefix = "/kaggle/input/widsdatathon2025/"
file_prefix = "data/"

## LOAD CAT DATA

In [None]:
# Categorical metadata for the training participants (Excel workbook).
file_path_trainC = file_prefix + "TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx"
train_cat = pd.read_excel(io=file_path_trainC)

# Quick sanity check of the table size, then preview the first rows.
print(train_cat.shape)
train_cat.head(n=5)

(1213, 10)


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0


In [None]:
# Categorical metadata for the test participants (Excel workbook).
file_path_testC = file_prefix + "TEST/TEST_CATEGORICAL.xlsx"
test_cat = pd.read_excel(io=file_path_testC)

# Preview the first rows.
test_cat.head(n=5)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,Cfwaf5FX7jWK,2022,4,0.0,0.0,4,21.0,30.0,18.0,30.0
1,vhGrzmvA3Hjq,2023,4,0.0,0.0,4,21.0,45.0,,30.0
2,ULliyEXjy4OV,2022,4,0.0,0.0,4,21.0,40.0,18.0,40.0
3,LZfeAb1xMtql,2022,4,0.0,0.0,3,21.0,45.0,21.0,45.0
4,EnFOUv0YK1RG,2022,4,2.0,0.0,4,18.0,0.0,21.0,45.0


In [None]:
# Training labels: one row per participant with ADHD_Outcome and Sex_F.
file_path_trainS = file_prefix + "TRAIN/TRAINING_SOLUTIONS.xlsx"
train_Solutions = pd.read_excel(io=file_path_trainS)

# Quick sanity check of the table size, then preview the first rows.
print(train_Solutions.shape)
train_Solutions.head(n=5)

(1213, 3)


Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


## PREPROCESS CAT DATA:
`cat_train_final` and `cat_test_final`

In [None]:
# Build `cat_train_final`: participant_id plus 0/1 indicator columns for every
# categorical training feature.

# Remove the enrolment-year, study-site and scan-location columns before encoding.
train_cat = train_cat.drop(columns=['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'MRI_Track_Scan_Location'])

# Integer-typed columns hold category codes, so mark them as categorical;
# get_dummies only expands object/category columns by default.
for col in train_cat.select_dtypes(include='int').columns:
    train_cat[col] = train_cat[col].astype('category')

# Every column except the first (participant_id) is encoded. The test-set cell
# reuses this list.
columns_to_encode = train_cat.columns[1:].tolist()
print("Columns to encode:", columns_to_encode)

# Ethnicity is encoded separately below as a full one-hot (no dropped
# baseline), so it is excluded here instead of being passed through and
# dropped again afterwards.
ethnicity_col = 'PreInt_Demos_Fam_Child_Ethnicity'
baseline_dropped_cols = [col for col in columns_to_encode if col != ethnicity_col]

train_encoded = pd.get_dummies(train_cat[baseline_dropped_cols], dummy_na=True, drop_first=True)
# Cast boolean indicators to 0/1 integers. This replaces the deprecated
# DataFrame.applymap and leaves any non-boolean column untouched.
bool_indicator_cols = train_encoded.select_dtypes(include='bool').columns
train_encoded = train_encoded.astype({col: int for col in bool_indicator_cols})

ethnicity_one_hot = pd.get_dummies(train_cat[ethnicity_col], prefix=ethnicity_col, dummy_na=True).astype(int)
# Float category values yield names such as "..._1.0". Remove only a literal
# ".0" suffix: str.rstrip('.0') strips *any* trailing '.'/'0' characters and
# would turn "..._0.0" into "..._" and "..._10.0" into "..._1".
ethnicity_one_hot = ethnicity_one_hot.rename(
    columns=lambda name: name[:-2] if name.endswith('.0') else name
)

# Final layout: participant_id, baseline-dropped indicators, ethnicity one-hot.
cat_train_final = pd.concat(
    [train_cat.drop(columns=columns_to_encode), train_encoded, ethnicity_one_hot],
    axis=1,
)

Columns to encode: ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']


  train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))
  ethnicity_one_hot = ethnicity_one_hot.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [None]:
# Build `cat_test_final` with exactly the indicator columns of `cat_train_final`.

# Remove the enrolment-year, study-site and scan-location columns before encoding.
test_cat = test_cat.drop(columns=['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'MRI_Track_Scan_Location'])

# Treat every encoded column as categorical regardless of whether it was read
# as int or float, so get_dummies expands all of them.
for col in columns_to_encode:
    test_cat[col] = test_cat[col].astype('category')

# Full one-hot with NO drop_first: the baseline category is decided by the
# training columns via the reindex below. Dropping "the first category seen in
# test" can remove a different level than training did, and it also removed
# Ethnicity_0, which training keeps.
test_encoded = pd.get_dummies(test_cat[columns_to_encode], dummy_na=True).astype(int)

# Float category values yield names such as "..._10.0". Remove only a literal
# ".0" suffix: str.rstrip('.0') strips *any* trailing '.'/'0' characters, which
# renamed "_10.0/_20.0/_30.0/_40.0" to "_1/_2/_3/_4" and silently zeroed those
# features after alignment.
test_encoded = test_encoded.rename(
    columns=lambda name: name[:-2] if name.endswith('.0') else name
)

cat_train_final_cols = cat_train_final.columns.tolist()
cat_train_final_cols.remove('participant_id')

# Report training indicators that never occur in the test data; they are
# created as all-zero columns by the reindex.
missing_cols = set(cat_train_final_cols) - set(test_encoded.columns)
print(len(missing_cols), "MISSING COLS")
print(sorted(missing_cols))

# Align to the training layout: same columns, same order. Indicators that only
# exist in test (including the training baseline levels) are discarded here.
test_encoded = test_encoded.reindex(columns=cat_train_final_cols, fill_value=0)

# Combine encoded columns with the rest of the DataFrame (participant_id).
cat_test_final = pd.concat([test_cat.drop(columns=columns_to_encode), test_encoded], axis=1)

12 MISSING COLS
{'Barratt_Barratt_P2_Edu_3', 'Barratt_Barratt_P1_Occ_40', 'PreInt_Demos_Fam_Child_Race_10', 'Barratt_Barratt_P1_Edu_3', 'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P2_Occ_40', 'Barratt_Barratt_P2_Occ_20', 'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Occ_10', 'PreInt_Demos_Fam_Child_Ethnicity_0', 'Barratt_Barratt_P2_Occ_10', 'Barratt_Barratt_P2_Occ_30'}


  test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [None]:
# Display the encoded categorical training table (participant_id plus indicator columns).
cat_train_final

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45,Barratt_Barratt_P2_Occ_nan,PreInt_Demos_Fam_Child_Ethnicity_0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan
0,UmrK0vMLopoR,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,CPaeQkhcjg7d,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Nb4EetVPm3gs,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,p4vPhVu91o4b,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,M09PXs7arQ5E,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208,Atx7oub96GXS,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1209,groSbUfkQngM,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1210,zmxGvIrOD0bt,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1211,rOmWFuJCud5G,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0


## LOAD QUANT DATA

In [None]:
# Quantitative metadata (questionnaire scores and age at scan), training split.
file_path_trainQ = file_prefix + "TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx"
quant_train_df = pd.read_excel(io=file_path_trainQ)

# Same table for the test split.
file_path_testQ = file_prefix + "TEST/TEST_QUANTITATIVE_METADATA.xlsx"
quant_test_df = pd.read_excel(io=file_path_testQ)

## PREPROCESS QUANT DATA:
`quant_train_df` and `quant_test_df`

In [None]:
# Replace missing scan ages in the training table with the column median.
age_col = 'MRI_Track_Age_at_Scan'
quant_train_df[age_col] = quant_train_df[age_col].fillna(quant_train_df[age_col].median())

In [None]:
# Columns of the test table that contain at least one missing value
# (kept for inspection).
knn_columns = quant_test_df.columns[quant_test_df.isna().any()].tolist()

# Use every quantitative feature as context for the neighbour search, not only
# the columns that happen to contain gaps.
# NOTE(review): assumes quant_train_df has the same feature columns as
# quant_test_df -- confirm against the two workbooks.
quant_feature_columns = [col for col in quant_test_df.columns if col != 'participant_id']

imputer = KNNImputer(n_neighbors=5)

# Fit on the TRAINING features and only transform the test features, so the
# imputed test values come from training neighbours and do not depend on the
# composition of the test set itself.
imputer.fit(quant_train_df[quant_feature_columns])
quant_test_df[quant_feature_columns] = imputer.transform(quant_test_df[quant_feature_columns])

# Verify missing values are handled
print(quant_test_df.isnull().sum())

participant_id                0
EHQ_EHQ_Total                 0
ColorVision_CV_Score          0
APQ_P_APQ_P_CP                0
APQ_P_APQ_P_ID                0
APQ_P_APQ_P_INV               0
APQ_P_APQ_P_OPD               0
APQ_P_APQ_P_PM                0
APQ_P_APQ_P_PP                0
SDQ_SDQ_Conduct_Problems      0
SDQ_SDQ_Difficulties_Total    0
SDQ_SDQ_Emotional_Problems    0
SDQ_SDQ_Externalizing         0
SDQ_SDQ_Generating_Impact     0
SDQ_SDQ_Hyperactivity         0
SDQ_SDQ_Internalizing         0
SDQ_SDQ_Peer_Problems         0
SDQ_SDQ_Prosocial             0
MRI_Track_Age_at_Scan         0
dtype: int64


## LOAD FCM DATA AND SOLUTIONS

In [None]:
# Functional connectome matrices, one CSV per split.
train_fcm_path = file_prefix + 'TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv'
test_fcm_path = file_prefix + 'TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv'
train_fcm_df = pd.read_csv(train_fcm_path)
test_fcm_df = pd.read_csv(test_fcm_path)

# Training labels (re-read from the same workbook that was loaded into
# train_Solutions earlier in the notebook).
solutions_path = file_prefix + 'TRAIN/TRAINING_SOLUTIONS.xlsx'
train_Solutions = pd.read_excel(solutions_path)

In [None]:
# Total number of missing values across the whole training connectome table.
train_fcm_df.isnull().sum().sum()

np.int64(0)

## MERGE DATASETS

In [None]:
# Join the three training tables on participant_id (default inner join):
# encoded categoricals -> connectome features -> quantitative features.
train_cat_FCM = cat_train_final.merge(train_fcm_df, on='participant_id')
train_df = train_cat_FCM.merge(quant_train_df, on='participant_id')
train_df.head(n=5)

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,0,0,0,0,0,0,0,0,0,...,0,6,1,5,0,5,1,0,10,10.739219
1,CPaeQkhcjg7d,0,1,0,0,0,0,0,0,0,...,0,18,6,8,7,8,10,4,5,10.739219
2,Nb4EetVPm3gs,0,0,0,0,0,1,0,0,0,...,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,0,0,0,0,0,1,0,0,0,...,6,24,4,16,9,10,8,4,6,10.739219
4,M09PXs7arQ5E,1,0,0,0,0,0,0,0,0,...,1,18,4,11,4,10,7,3,9,8.940679


In [None]:
# Join the three test tables on participant_id (default inner join):
# encoded categoricals -> connectome features -> quantitative features.
test_cat_FCM = cat_test_final.merge(test_fcm_df, on='participant_id')
test_df = test_cat_FCM.merge(quant_test_df, on='participant_id')
test_df.head(n=5)

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,0,0,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [None]:
# Feature matrix: everything in the merged training table except the id.
X_train = train_df.drop(columns = ['participant_id'])

# Align the labels to the feature rows explicitly by participant_id instead of
# relying on both tables happening to share the same row order and length
# (the inner merges that build train_df can drop or reorder participants).
# validate='one_to_one' raises if an id is duplicated on either side.
labels_aligned = train_df[['participant_id']].merge(
    train_Solutions, on='participant_id', how='left', validate='one_to_one'
)
assert not labels_aligned.isna().any().any(), "some training participants have no label"
Y_train = labels_aligned.drop(columns = ['participant_id'])

# Keep the test ids for the submission file; the features exclude the id.
participant_id = test_df['participant_id']
X_test = test_df.drop(columns = 'participant_id')

## FEATURE SELECTION WITH LOGISTIC REGRESSION

In [None]:
# model = LogisticRegression(penalty='l1', solver='liblinear')
# model.fit(train_df.drop(columns='participant_id'), train_Solutions['Sex_F'])
# selected_features_Sex = train_df.drop(columns='participant_id').columns[model.coef_[0] != 0]
# print(selected_features_Sex)

In [None]:
# model = LogisticRegression(penalty='l1', solver='liblinear')
# model.fit(train_df.drop(columns='participant_id'), train_Solutions['ADHD_Outcome'])
# selected_features_ADHD = train_df.drop(columns='participant_id').columns[model.coef_[0] != 0]
# print(selected_features_ADHD)

In [None]:
# common_features = list(set(selected_features_ADHD) & set(selected_features_Sex))
# X_train_2 = X_train[common_features]
# X_test_2 = X_test[common_features]

In [None]:
# Names of the target columns left after removing participant_id.
Y_train.columns

Index(['ADHD_Outcome', 'Sex_F'], dtype='object')

In [None]:
# Feature names of the merged training matrix (participant_id already removed).
X_train.columns

Index(['PreInt_Demos_Fam_Child_Race_1', 'PreInt_Demos_Fam_Child_Race_2',
       'PreInt_Demos_Fam_Child_Race_3', 'PreInt_Demos_Fam_Child_Race_4',
       'PreInt_Demos_Fam_Child_Race_7', 'PreInt_Demos_Fam_Child_Race_8',
       'PreInt_Demos_Fam_Child_Race_9', 'PreInt_Demos_Fam_Child_Race_10',
       'PreInt_Demos_Fam_Child_Race_11', 'PreInt_Demos_Fam_Child_Race_nan',
       ...
       'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
       'MRI_Track_Age_at_Scan'],
      dtype='object', length=19969)

# TEST MODELING

In [None]:
# Hold-out split for quick model checks: 1100 randomly chosen rows are used for
# fitting, the remaining rows for evaluation.
# A seeded generator makes the split (and every score derived from it)
# reproducible across kernel restarts.
rng = np.random.default_rng(42)
random_indices = rng.choice(X_train.index.to_numpy(), 1100, replace=False)
X_train_temp = X_train.loc[random_indices]
Y_train_temp = Y_train.loc[random_indices]

# All rows of X_train / Y_train that were not sampled above.
X_test_temp = X_train.drop(X_train_temp.index)
Y_test_temp = Y_train.drop(Y_train_temp.index)

# Learn the scaling statistics from the fitting rows only and re-use them for
# the held-out rows. Calling fit_transform on the held-out rows would re-fit
# the scaler and put the two sets on different scales.
scaler = StandardScaler()
X_train_temp_scaled = scaler.fit_transform(X_train_temp)
X_test_temp_scaled = scaler.transform(X_test_temp)

In [None]:
# Count of missing values remaining in the held-out feature rows.
X_test_temp.isnull().sum().sum()

np.int64(0)

**Parameter definitions:**
1. C - Inverse of the regularization strength; it controls the balance between underfitting and overfitting
    - The default, which works well mostly, is C = 1
    - Small C prevents overfitting
    - Large C prevents underfitting
    - Going too high with C can cause overfitting while going too low causes underfitting
2. Solver - Specifies the optimization algorithm used to minimize the loss function (there are many solvers, each used in different cases depending on what the dataset looks like)
    - lbfgs (default) -> Good for small to medium data sets, doesn't support l1
    - liblinear -> Good for small, sparse (many zeros/missing data) datasets
    - saga -> Good for large datasets
    - etc (there are more, but they all deal with large datasets)
    - Since our dataset is very small, we can test the first 2
3. Penalty - Adds a constraint on the model to prevent overfitting
    - l2 (default)
    - l1 -> Pushes some coefficients to exactly zero, so it acts as built-in feature selection (useful when many features are irrelevant)

In [None]:
# Grid search for optimized logistic-regression parameters: Sex_F, unscaled features.
# random_state pins the data shuffling done by the liblinear solver, so
# repeated runs select the same parameters.
# NOTE(review): max_iter is left at the library default; raise it if
# convergence warnings appear.
model = LogisticRegression(random_state=42)

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'solver': ['lbfgs', 'liblinear'],  # Optimization solvers
    'penalty': ['l2']  # Supported by every solver listed (lbfgs does not support l1)
}

# 5-fold cross-validated accuracy, using all available cores.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp, Y_train_temp["Sex_F"])

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

KeyboardInterrupt: 

In [None]:
# Grid search for optimized logistic-regression parameters: Sex_F, standardized features.
# random_state pins the data shuffling done by the liblinear and saga solvers,
# so repeated runs select the same parameters.
# NOTE(review): max_iter is left at the library default; raise it if
# convergence warnings appear.
# NOTE(review): the features were standardized before cross-validation, so the
# validation folds contributed to the scaling statistics.
model = LogisticRegression(random_state=42)

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization solvers
    'penalty': ['l2']  # Supported by every solver listed (lbfgs does not support l1)
}

# 5-fold cross-validated accuracy, using all available cores.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp_scaled, Y_train_temp["Sex_F"]) # Testing with scaled data

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

In [None]:
# Grid search for optimized logistic-regression parameters: ADHD_Outcome, unscaled features.
# random_state pins the data shuffling done by the liblinear and saga solvers,
# so repeated runs select the same parameters.
# NOTE(review): max_iter is left at the library default; raise it if
# convergence warnings appear.
model = LogisticRegression(random_state=42)

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization solvers
    'penalty': ['l2']  # Supported by every solver listed (lbfgs does not support l1)
}

# 5-fold cross-validated accuracy, using all available cores.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp, Y_train_temp["ADHD_Outcome"])

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

In [None]:
# Grid search for optimized logistic-regression parameters: ADHD_Outcome, standardized features.
# random_state pins the data shuffling done by the liblinear and saga solvers,
# so repeated runs select the same parameters.
# NOTE(review): max_iter is left at the library default; raise it if
# convergence warnings appear.
# NOTE(review): the features were standardized before cross-validation, so the
# validation folds contributed to the scaling statistics.
model = LogisticRegression(random_state=42)

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],  # Optimization solvers
    'penalty': ['l2']  # Supported by every solver listed (lbfgs does not support l1)
}

# 5-fold cross-validated accuracy, using all available cores.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_temp_scaled, Y_train_temp["ADHD_Outcome"])

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

In [None]:
# Sex_F classifier evaluated on the hold-out rows.
# The model must be fitted and queried in the same feature space: it predicts
# on standardized features, so it is fitted on the standardized training
# features as well (it was previously fitted on the raw ones).
# random_state makes the stochastic saga solver reproducible.
# NOTE(review): max_iter is left at the library default; raise it if a
# convergence warning appears.
model_Sex = LogisticRegression(C = 1, penalty='l2', solver = 'saga', random_state=42)
model_Sex.fit(X_train_temp_scaled, Y_train_temp["Sex_F"])
Sex_pred = model_Sex.predict(X_test_temp_scaled)



In [None]:
# ADHD_Outcome classifier evaluated on the hold-out rows, fitted and queried on
# the unscaled features.
# random_state makes the stochastic saga solver reproducible.
# NOTE(review): max_iter is left at the library default; raise it if a
# convergence warning appears.
model_ADHD = LogisticRegression(C = 0.01, penalty='l2', solver = 'saga', random_state=42)
model_ADHD.fit(X_train_temp, Y_train_temp["ADHD_Outcome"])
ADHD_pred = model_ADHD.predict(X_test_temp)



In [None]:
# Plain accuracy on the held-out rows, one score per target.
ADHD_accuracy, Sex_accuracy = (
    accuracy_score(Y_test_temp[target], predicted)
    for target, predicted in (("ADHD_Outcome", ADHD_pred), ("Sex_F", Sex_pred))
)

print("ADHD Accuracy: ", ADHD_accuracy)
print("Sex Accuracy: ", Sex_accuracy)

ADHD Accuracy:  0.8230088495575221
Sex Accuracy:  0.6991150442477876


In [None]:
# F1 on the held-out rows, one score per target.
ADHD_f1 = f1_score(y_true=Y_test_temp["ADHD_Outcome"], y_pred=ADHD_pred)
Sex_f1 = f1_score(y_true=Y_test_temp["Sex_F"], y_pred=Sex_pred)

print("ADHD F1 Score: ", ADHD_f1)
print("Sex F1 Score: ", Sex_f1)

ADHD F1 Score:  0.8765432098765432
Sex F1 Score:  0.6458333333333334


### Accuracy With Scaling:

ADHD Accuracy:  0.3893805309734513

Sex Accuracy:  0.7699115044247787


### Accuracy Without Scaling:

ADHD Accuracy:  0.6017699115044248

Sex Accuracy:  0.6460176991150443

## MODELING

In [None]:
# Predictions for the competition test set.
# Numeric feature columns only (this excludes participant_id).
test_df_numeric = test_df.select_dtypes(include=['number'])

# Standardize the test features with statistics learned from the training
# rows. Calling fit_transform on the test set would re-fit the scaler on test
# data and put it on a different scale from the one the Sex model is meant
# to see.
scaler = StandardScaler().fit(X_train_temp)
test_df_scaled = scaler.transform(test_df_numeric)

# ADHD model takes the unscaled features, Sex model the standardized ones.
# NOTE(review): both models were fitted on the 1100-row subset only; consider
# refitting on all training rows before submitting.
ADHD_pred = model_ADHD.predict(test_df_numeric)
Sex_pred = model_Sex.predict(test_df_scaled)



In [None]:
# Submission table: test participant ids next to both predicted targets.
submission_parts = [test_df["participant_id"], pd.Series(ADHD_pred), pd.Series(Sex_pred)]
final = pd.concat(submission_parts, axis=1).set_axis(
    ["participant_id", "ADHD_Outcome", "Sex_F"], axis=1
)
final

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,Cfwaf5FX7jWK,1,0
1,vhGrzmvA3Hjq,0,1
2,ULliyEXjy4OV,0,1
3,LZfeAb1xMtql,1,1
4,EnFOUv0YK1RG,1,1
...,...,...,...
299,UadZfjdEg7eG,1,1
300,IUEHiLmQAqCi,1,0
301,cRySmCadYFRO,0,1
302,E3MvDUtJadc5,1,0


In [None]:
# Write the predictions to results.csv without the DataFrame index column.
final.to_csv("results.csv", index = False)