## Imports

In [77]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV

from scipy.io import loadmat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

## Load in dataframes

In [78]:
# categorical variable train dataframe

file_path_trainC = "data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx"
train_cat = pd.read_excel(file_path_trainC)
print(train_cat.shape)
train_cat.head()

(1213, 10)


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0


In [79]:
train_cat.columns

Index(['participant_id', 'Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site',
       'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
       'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu',
       'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu',
       'Barratt_Barratt_P2_Occ'],
      dtype='object')

In [80]:
# ADHD and Sex solutions dataframe for model training

file_path_trainS = "data/TRAIN/TRAINING_SOLUTIONS.xlsx"
train_Solutions = pd.read_excel(file_path_trainS)
print(train_Solutions.shape)
train_Solutions.head()

(1213, 3)


Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


In [81]:
train_Solutions.columns

Index(['participant_id', 'ADHD_Outcome', 'Sex_F'], dtype='object')

# Preprocessing

## Pre-processed Categorical Columns


Our categorical columns include demographic data about the adolescent and parental information, such as occupation and education level.

In the provided dataset, these categorical columns have been preprocessed by assigning numerical values to the categories within each variable. For instance, in the Parent 1 Occupation column, the number 35 might represent roles such as nurse, skilled technician, medical technician, or counselor. This grouping reduces the number of unique responses for each variable, simplifying the dataset.

The integers currently representing categorical values can be cross-referenced in the provided data dictionary in [kaggle](https://www.kaggle.com/competitions/widsdatathon2025/data) to understand their corresponding categories.


## Dropping Irrelevent Columns

Not all columns in the categorical dataset are indicators of a person's gender or whether or not they have ADHD. For example, the columns 'Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', and 'MRI_Track_Scan_Location' have no real correlation to our labels.

In [82]:
train_cat = train_cat.drop(columns=['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'MRI_Track_Scan_Location'])

## What does One - Hot Encoding do ?

One hot encoding creates a new binary column for each unique category within a variable. For example, in the Parent 1 Occupation column with 10 possible categories (e.g., 0, 5, 10, 15, 20, etc.), one-hot encoding will generate 10 new columns: Parent_1_Occupation_0, Parent_1_Occupation_5, Parent_1_Occupation_10, and so on.

Each of these new columns will contain boolean values (True or False). For instance, if a participant's parent 1 occupation falls into category 0, the Parent_1_Occupation_0 column will have a value of True, while the other columns for this variable will be False. The same logic applies to the other categories, ensuring each participant is appropriately represented in the dataset.


## Why use one hot encoding

Avoid Implying Ordinal Relationships


1.  If you encode categories using numbers directly (e.g., 0, 1, 2), the algorithm might interpret these numbers as having a meaningful order or scale, which could lead to incorrect assumptions.
2.  One-hot encoding eliminates this problem by assigning each category its own binary column, ensuring no ordinal relationship is implied.


Improve Algorithm Performance


*  Algorithms like logistic regression, decision trees, and neural networks often perform better with one-hot encoded data because it provides clear distinctions between categories.
*  Without one-hot encoding, algorithms might struggle to learn from categorical variables or produce biased results.





One hot encoding is only possible on categorical variables so the first step is to switch our integer representations into category type variables.

In [83]:
for col in train_cat.select_dtypes(include='int').columns:
    train_cat[col] = train_cat[col].astype('category')

Our first column of our dataset is our participant id. This is an indicator variable that identifies each patient. This column is very important as it will be the one used to merge all of our data frames together, but, we do not want to encode this column. So, we will create a list of all the columns except the first one and label those `columns_to_encode`.

In [84]:
# Creating a list of all of the columns except the first
columns_to_encode = train_cat.columns[1:].tolist()

# Print the columns to encode
print("Columns to encode:", columns_to_encode)

Columns to encode: ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']


`pd.get_dummies(train_cat[columns_to_enccode], drop_first=True)`:
* Converts selected categorical columns in train_cat into one-hot encoded columns creating binary (0 or 1) columns for each category.
* The `drop_first=True` parameter avoids the "dummy variable trap" by dropping the first category for each feature,reducing redundancy in the encoded data.

`data_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))`:

*   Iterates over every element in the data_encoded DataFrame and:
  - Converts `True` to 1 and `False` to 0.
  - Leaves all other values unchanged.

In [85]:
train_cat['PreInt_Demos_Fam_Child_Ethnicity'].value_counts()

PreInt_Demos_Fam_Child_Ethnicity
0.0    809
1.0    296
2.0     77
3.0     20
Name: count, dtype: int64

In [86]:
# encoding categorical data
train_encoded = pd.get_dummies(train_cat[columns_to_encode], dummy_na=True, drop_first=True)
train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

  train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [87]:
train_encoded.columns

Index(['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race_1',
       'PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_4', 'PreInt_Demos_Fam_Child_Race_7',
       'PreInt_Demos_Fam_Child_Race_8', 'PreInt_Demos_Fam_Child_Race_9',
       'PreInt_Demos_Fam_Child_Race_10', 'PreInt_Demos_Fam_Child_Race_11',
       'PreInt_Demos_Fam_Child_Race_nan', 'Barratt_Barratt_P1_Edu_3',
       'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15',
       'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_10', 'Barratt_Barratt_P1_Occ_15',
       'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25',
       'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Occ_35',
       'Barratt_Barratt_P1_Occ_40', 'Barratt_Barratt_P1_Occ_45',
       'Barratt_Barratt_P1_Occ_nan', 'Barr

Reintroducing participant ID after encoding:

After encoding the categorical columns we will add back in the participant id column. The get dummies function will align the participants correctly with their respective feature variables because it respects the original index of the DataFrame, so row alignment is consistent. When concatenating `data_encoded` with the rest of the DataFrame `train_cat.drop(columns=columns_to_encode)`, the rows align because pandas automatically matches by index.

In [88]:
# get dummies for PreInt_Demos_Fam_Child_Ethnicity column
ethnicity_one_hot = pd.get_dummies(train_cat['PreInt_Demos_Fam_Child_Ethnicity'], prefix="PreInt_Demos_Fam_Child_Ethnicity", dummy_na=True)
ethnicity_one_hot = ethnicity_one_hot.applymap(lambda x: 1 if x is True else (0 if x is False else x))
ethnicity_one_hot

  ethnicity_one_hot = ethnicity_one_hot.applymap(lambda x: 1 if x is True else (0 if x is False else x))


Unnamed: 0,PreInt_Demos_Fam_Child_Ethnicity_0.0,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Ethnicity_nan
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
1208,1,0,0,0,0
1209,1,0,0,0,0
1210,0,1,0,0,0
1211,1,0,0,0,0


In [89]:
# rename ethnicity_one_hot column names to get rid of the ".0" at the end
ethnicity_one_hot = ethnicity_one_hot.rename(columns=lambda x: x.rstrip('.0'))
ethnicity_one_hot = ethnicity_one_hot.rename(columns={"PreInt_Demos_Fam_Child_Ethnicity_": "PreInt_Demos_Fam_Child_Ethnicity_0"})
ethnicity_one_hot

Unnamed: 0,PreInt_Demos_Fam_Child_Ethnicity_0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
1208,1,0,0,0,0
1209,1,0,0,0,0
1210,0,1,0,0,0
1211,1,0,0,0,0


In [90]:
# Combine encoded columns with the rest of the DataFrame
cat_train_final = pd.concat([train_cat.drop(columns=columns_to_encode), train_encoded], axis=1)
cat_train_final = pd.concat([cat_train_final, ethnicity_one_hot], axis=1)

# ensure it looks correct
cat_train_final.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,...,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45,Barratt_Barratt_P2_Occ_nan,PreInt_Demos_Fam_Child_Ethnicity_0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan
0,UmrK0vMLopoR,0.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,CPaeQkhcjg7d,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Nb4EetVPm3gs,1.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,p4vPhVu91o4b,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,M09PXs7arQ5E,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [91]:
cat_train_final.drop(columns=['PreInt_Demos_Fam_Child_Ethnicity'], inplace=True)

In [92]:
cat_train_final.isna().sum()

participant_id                          0
PreInt_Demos_Fam_Child_Race_1           0
PreInt_Demos_Fam_Child_Race_2           0
PreInt_Demos_Fam_Child_Race_3           0
PreInt_Demos_Fam_Child_Race_4           0
PreInt_Demos_Fam_Child_Race_7           0
PreInt_Demos_Fam_Child_Race_8           0
PreInt_Demos_Fam_Child_Race_9           0
PreInt_Demos_Fam_Child_Race_10          0
PreInt_Demos_Fam_Child_Race_11          0
PreInt_Demos_Fam_Child_Race_nan         0
Barratt_Barratt_P1_Edu_3                0
Barratt_Barratt_P1_Edu_6                0
Barratt_Barratt_P1_Edu_9                0
Barratt_Barratt_P1_Edu_12               0
Barratt_Barratt_P1_Edu_15               0
Barratt_Barratt_P1_Edu_18               0
Barratt_Barratt_P1_Edu_21               0
Barratt_Barratt_P1_Edu_nan              0
Barratt_Barratt_P1_Occ_5                0
Barratt_Barratt_P1_Occ_10               0
Barratt_Barratt_P1_Occ_15               0
Barratt_Barratt_P1_Occ_20               0
Barratt_Barratt_P1_Occ_25         

In [93]:
cat_train_final.columns

Index(['participant_id', 'PreInt_Demos_Fam_Child_Race_1',
       'PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_4', 'PreInt_Demos_Fam_Child_Race_7',
       'PreInt_Demos_Fam_Child_Race_8', 'PreInt_Demos_Fam_Child_Race_9',
       'PreInt_Demos_Fam_Child_Race_10', 'PreInt_Demos_Fam_Child_Race_11',
       'PreInt_Demos_Fam_Child_Race_nan', 'Barratt_Barratt_P1_Edu_3',
       'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15',
       'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_10', 'Barratt_Barratt_P1_Occ_15',
       'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25',
       'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Occ_35',
       'Barratt_Barratt_P1_Occ_40', 'Barratt_Barratt_P1_Occ_45',
       'Barratt_Barratt_P1_Occ_nan', 'Barratt_Barratt_P2_Edu

### Train and Test Dataframes

For any machine learning model you need training data and test data. On our Kaggle data page, you'll find both the training and testing dataframes. We have just encoded the categorical dataframe for our training data. Now, we need to apply the same encoding steps to the categorical dataframe in the testing data. It's essential to ensure that any preprocessing done on the training data is also applied to the test data to ensure accurate model predictions.

Our test categorical dataframe is preprocessed the same way as our training data so we will follow the same steps to encode the dataframe.

In [94]:
# load in test categorical dataframe

file_path_testC = "data/TEST/TEST_CATEGORICAL.xlsx"
test_cat = pd.read_excel(file_path_testC)
test_cat.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,Cfwaf5FX7jWK,2022,4,0.0,0.0,4,21.0,30.0,18.0,30.0
1,vhGrzmvA3Hjq,2023,4,0.0,0.0,4,21.0,45.0,,30.0
2,ULliyEXjy4OV,2022,4,0.0,0.0,4,21.0,40.0,18.0,40.0
3,LZfeAb1xMtql,2022,4,0.0,0.0,3,21.0,45.0,21.0,45.0
4,EnFOUv0YK1RG,2022,4,2.0,0.0,4,18.0,0.0,21.0,45.0


In [95]:
test_cat = test_cat.drop(columns=['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'MRI_Track_Scan_Location'])

In [96]:
test_cat['PreInt_Demos_Fam_Child_Ethnicity'].value_counts()
test_cat.isna().sum()

participant_id                       0
PreInt_Demos_Fam_Child_Ethnicity     3
PreInt_Demos_Fam_Child_Race          6
Barratt_Barratt_P1_Edu               1
Barratt_Barratt_P1_Occ               1
Barratt_Barratt_P2_Edu              36
Barratt_Barratt_P2_Occ              42
dtype: int64

In [97]:
test_cat['Barratt_Barratt_P1_Edu'].value_counts()

Barratt_Barratt_P1_Edu
21.0    170
18.0     95
15.0     27
12.0      7
9.0       2
6.0       1
3.0       1
Name: count, dtype: int64

In [98]:
test_cat.columns

Index(['participant_id', 'PreInt_Demos_Fam_Child_Ethnicity',
       'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu',
       'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu',
       'Barratt_Barratt_P2_Occ'],
      dtype='object')

Let's encode the `test_cat` dataset first.

In [99]:
# convert our int variables to categories
for col in test_cat.select_dtypes(include='float').columns:
    test_cat[col] = test_cat[col].astype('category')

In [100]:
# Encode categorical variables in test
test_encoded = pd.get_dummies(test_cat[columns_to_encode], dummy_na=True, drop_first=True)
test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

  test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [101]:
# rename column to get rid of the ".0" at the end
print(test_encoded.columns)
# test_encoded.drop(columns=['PreInt_Demos_Fam_Child_Ethnicity_1.0', 'PreInt_Demos_Fam_Child_Ethnicity_2.0', 'PreInt_Demos_Fam_Child_Ethnicity_3.0'], inplace=True)
test_encoded = test_encoded.rename(columns=lambda x: x.rstrip('.0'))
test_encoded.columns

Index(['PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan',
       'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0',
       'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0',
       'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0',
       'PreInt_Demos_Fam_Child_Race_9.0', 'PreInt_Demos_Fam_Child_Race_11.0',
       'PreInt_Demos_Fam_Child_Race_nan', 'Barratt_Barratt_P1_Edu_6.0',
       'Barratt_Barratt_P1_Edu_9.0', 'Barratt_Barratt_P1_Edu_12.0',
       'Barratt_Barratt_P1_Edu_15.0', 'Barratt_Barratt_P1_Edu_18.0',
       'Barratt_Barratt_P1_Edu_21.0', 'Barratt_Barratt_P1_Edu_nan',
       'Barratt_Barratt_P1_Occ_5.0', 'Barratt_Barratt_P1_Occ_15.0',
       'Barratt_Barratt_P1_Occ_20.0', 'Barratt_Barratt_P1_Occ_25.0',
       'Barratt_Barratt_P1_Occ_30.0', 'Barratt_Barratt_P1_Occ_35.0',
       'Barratt_Barr

Index(['PreInt_Demos_Fam_Child_Ethnicity_1',
       'PreInt_Demos_Fam_Child_Ethnicity_2',
       'PreInt_Demos_Fam_Child_Ethnicity_3',
       'PreInt_Demos_Fam_Child_Ethnicity_nan', 'PreInt_Demos_Fam_Child_Race_1',
       'PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_4', 'PreInt_Demos_Fam_Child_Race_7',
       'PreInt_Demos_Fam_Child_Race_8', 'PreInt_Demos_Fam_Child_Race_9',
       'PreInt_Demos_Fam_Child_Race_11', 'PreInt_Demos_Fam_Child_Race_nan',
       'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15',
       'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_15', 'Barratt_Barratt_P1_Occ_2',
       'Barratt_Barratt_P1_Occ_25', 'Barratt_Barratt_P1_Occ_3',
       'Barratt_Barratt_P1_Occ_35', 'Barratt_Barratt_P1_Occ_4',
       'Barratt_Barratt_P1_Occ_45', 'Bar

In [102]:
test_encoded.head()

Unnamed: 0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,...,Barratt_Barratt_P2_Occ_5,Barratt_Barratt_P2_Occ_1,Barratt_Barratt_P2_Occ_15,Barratt_Barratt_P2_Occ_2,Barratt_Barratt_P2_Occ_25,Barratt_Barratt_P2_Occ_3,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_4,Barratt_Barratt_P2_Occ_45,Barratt_Barratt_P2_Occ_nan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [103]:
print("TRAIN COLS", cat_train_final.columns)

TRAIN COLS Index(['participant_id', 'PreInt_Demos_Fam_Child_Race_1',
       'PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_4', 'PreInt_Demos_Fam_Child_Race_7',
       'PreInt_Demos_Fam_Child_Race_8', 'PreInt_Demos_Fam_Child_Race_9',
       'PreInt_Demos_Fam_Child_Race_10', 'PreInt_Demos_Fam_Child_Race_11',
       'PreInt_Demos_Fam_Child_Race_nan', 'Barratt_Barratt_P1_Edu_3',
       'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15',
       'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_10', 'Barratt_Barratt_P1_Occ_15',
       'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25',
       'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Occ_35',
       'Barratt_Barratt_P1_Occ_40', 'Barratt_Barratt_P1_Occ_45',
       'Barratt_Barratt_P1_Occ_nan', 'Barratt_Bar

In [104]:
print("TEST COLS", test_encoded.columns)
if "PreInt_Demos_Fam_Child_Ethnicity_2" in test_encoded.columns:
    print("HIIIII")

TEST COLS Index(['PreInt_Demos_Fam_Child_Ethnicity_1',
       'PreInt_Demos_Fam_Child_Ethnicity_2',
       'PreInt_Demos_Fam_Child_Ethnicity_3',
       'PreInt_Demos_Fam_Child_Ethnicity_nan', 'PreInt_Demos_Fam_Child_Race_1',
       'PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_4', 'PreInt_Demos_Fam_Child_Race_7',
       'PreInt_Demos_Fam_Child_Race_8', 'PreInt_Demos_Fam_Child_Race_9',
       'PreInt_Demos_Fam_Child_Race_11', 'PreInt_Demos_Fam_Child_Race_nan',
       'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15',
       'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_15', 'Barratt_Barratt_P1_Occ_2',
       'Barratt_Barratt_P1_Occ_25', 'Barratt_Barratt_P1_Occ_3',
       'Barratt_Barratt_P1_Occ_35', 'Barratt_Barratt_P1_Occ_4',
       'Barratt_Barratt_P1_Occ

In [105]:
# Ensure test_encoded has the same columns as train_encoded
# select all cat_train_final columns but participant id
cat_train_final_cols = cat_train_final.columns.tolist()
cat_train_final_cols.remove('participant_id')

missing_cols = set(cat_train_final_cols) - set(test_encoded.columns)
print(len(missing_cols), "MISSING COLS")
print(missing_cols)
for col in missing_cols:
    if col in test_encoded.columns:
        print("COL IN TEST ENCODED")
        print(col)
    else:
        test_encoded[col] = 0

12 MISSING COLS
{'Barratt_Barratt_P1_Occ_40', 'PreInt_Demos_Fam_Child_Ethnicity_0', 'Barratt_Barratt_P2_Occ_30', 'Barratt_Barratt_P2_Occ_20', 'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Edu_3', 'Barratt_Barratt_P2_Occ_40', 'PreInt_Demos_Fam_Child_Race_10', 'Barratt_Barratt_P1_Occ_10', 'Barratt_Barratt_P2_Edu_3', 'Barratt_Barratt_P2_Occ_10', 'Barratt_Barratt_P1_Occ_20'}


In [106]:

# Ensure test_encoded columns are in the same order as train_encoded
test_encoded = test_encoded.reindex(columns=cat_train_final_cols, fill_value=0)

# Combine encoded columns with the rest of the DataFrame
cat_test_final = pd.concat([test_cat.drop(columns=columns_to_encode), test_encoded], axis=1)

cat_test_final.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45,Barratt_Barratt_P2_Occ_nan,PreInt_Demos_Fam_Child_Ethnicity_0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,EnFOUv0YK1RG,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [107]:
cat_test_final.columns

Index(['participant_id', 'PreInt_Demos_Fam_Child_Race_1',
       'PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_3',
       'PreInt_Demos_Fam_Child_Race_4', 'PreInt_Demos_Fam_Child_Race_7',
       'PreInt_Demos_Fam_Child_Race_8', 'PreInt_Demos_Fam_Child_Race_9',
       'PreInt_Demos_Fam_Child_Race_10', 'PreInt_Demos_Fam_Child_Race_11',
       'PreInt_Demos_Fam_Child_Race_nan', 'Barratt_Barratt_P1_Edu_3',
       'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9',
       'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15',
       'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21',
       'Barratt_Barratt_P1_Edu_nan', 'Barratt_Barratt_P1_Occ_5',
       'Barratt_Barratt_P1_Occ_10', 'Barratt_Barratt_P1_Occ_15',
       'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25',
       'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Occ_35',
       'Barratt_Barratt_P1_Occ_40', 'Barratt_Barratt_P1_Occ_45',
       'Barratt_Barratt_P1_Occ_nan', 'Barratt_Barratt_P2_Edu

Now we have nearly complete train and test dataframes. There is one more step before we can perform machine learning, imputing NA values. Again, you will need to impute NA values the same way for both your training and test dataframes for accurate model performance.

## NA values

Before performing machine learning we must adress missing values to ensure optimal model performance.

We will demonstrate how to fill the missing (`NA`) values using the mean of each column as an example.

Note: There are many approaches to handle missing values, and the best method often depends on your dataset and chosen machine learning model. You can explore how your model deals with missing data and try alternative techniques. For example, this [website](https://www.widsworldwide.org/get-inspired/blog/a-data-scientists-deep-dive-into-the-wids-datathon/) has various ways to handle missing data.

Filling NA values is a key challenge in this datathon to get accurate model performance. Experiment with different methods and evaluate which approach gives the best results for your model!

In [108]:
# check how many NA values we have
print("NA TRAIN", cat_train_final.isna().sum().sum())
print("NA TEST", cat_test_final.isna().sum().sum())
# 371 NANs values
# 360 in MRI_Track_age_at_Scan
# 11 in PreInt_Demos_Fam_Child_Ethnicity

NA TRAIN 0
NA TEST 0


We can fill the missing (`NA`) values in the columns `MRI_Track_Age_at_Scan` and `PreInt_Demos_Fam_Child_Ethnicity` individually by replacing them with the mean of their respective columns.

In [109]:
# cat_train_final.fillna({'PreInt_Demos_Fam_Child_Ethnicity':int(cat_train_final['PreInt_Demos_Fam_Child_Ethnicity'].mean())}, inplace = True)

print(cat_train_final.isna().sum().sum()) # should now be zero

0


We can also use backward and forward fills. This indicates using previous or the following data row in order to replace the missing values.


In [110]:
# cat_train_final.ffill(inplace=True)
# print(cat_train_final.isna().sum().sum())

In [111]:
# # Fill NAs of test data

# for col in test_df.columns:
#     if test_df[col].isna().sum() > 0:  # Check if the column has NaN values
#         if test_df[col].dtype in ['float64', 'int64']:  # Ensure it's numeric
#             test_df[col] = test_df[col].fillna(test_df[col].mean())  # Avoid inplace
#         else:
#             print(f"Skipping non-numeric column: {col}")

In [112]:
# cat_train_final['PreInt_Demos_Fam_Child_Ethnicity'].value_counts()

Now that we have filled in our NA values our data frame is ready to be trained for machine learning. Before, we do that, let's see what the dataset looks like now.

In [113]:
cat_train_final

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45,Barratt_Barratt_P2_Occ_nan,PreInt_Demos_Fam_Child_Ethnicity_0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan
0,UmrK0vMLopoR,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,CPaeQkhcjg7d,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Nb4EetVPm3gs,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,p4vPhVu91o4b,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,M09PXs7arQ5E,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208,Atx7oub96GXS,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1209,groSbUfkQngM,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1210,zmxGvIrOD0bt,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1211,rOmWFuJCud5G,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [114]:
cat_test_final

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Race_1,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_3,PreInt_Demos_Fam_Child_Race_4,PreInt_Demos_Fam_Child_Race_7,PreInt_Demos_Fam_Child_Race_8,PreInt_Demos_Fam_Child_Race_9,PreInt_Demos_Fam_Child_Race_10,PreInt_Demos_Fam_Child_Race_11,...,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45,Barratt_Barratt_P2_Occ_nan,PreInt_Demos_Fam_Child_Ethnicity_0,PreInt_Demos_Fam_Child_Ethnicity_1,PreInt_Demos_Fam_Child_Ethnicity_2,PreInt_Demos_Fam_Child_Ethnicity_3,PreInt_Demos_Fam_Child_Ethnicity_nan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,EnFOUv0YK1RG,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,UadZfjdEg7eG,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
300,IUEHiLmQAqCi,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
301,cRySmCadYFRO,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
302,E3MvDUtJadc5,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [115]:
cat_train_final.to_csv("./data/TRAIN/PREPROCESSING_TRAIN_CATEGORICAL.csv", index=False)

In [116]:
cat_test_final.to_csv("./data/TEST/PREPROCESSING_TEST_CATEGORICAL.csv", index=False)