# NUMERICAL PREPROCESSING

## Imports

In [12]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import KNNImputer

from scipy.io import loadmat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

## Load in Quant Data

In [9]:
file_path_trainQ = "data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx"
file_path_testQ = "data/TEST/TEST_QUANTITATIVE_METADATA.xlsx"
quant_train_df = pd.read_excel(file_path_trainQ)
quant_test_df = pd.read_excel(file_path_testQ)
print(quant_train_df.shape)
quant_train_df.head()

(1213, 19)


Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,40.0,13,3,10,47,13,11,28,0,6,1,5,0,5,1,0,10,
1,CPaeQkhcjg7d,-94.47,14,3,13,34,18,23,30,0,18,6,8,7,8,10,4,5,
2,Nb4EetVPm3gs,-46.67,14,4,10,35,16,10,29,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,-26.68,10,5,12,39,19,16,28,6,24,4,16,9,10,8,4,6,
4,M09PXs7arQ5E,0.0,14,5,15,40,20,24,28,1,18,4,11,4,10,7,3,9,8.940679


In [10]:
print(quant_test_df.shape)
quant_test_df.head()

(304, 19)


Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,60.03,14.0,5.0,16.0,41.0,19.0,11.0,26.0,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,86.71,12.0,3.0,13.0,43.0,18.0,15.0,28.0,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,26.68,13.0,3.0,14.0,36.0,16.0,14.0,25.0,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,93.38,13.0,3.0,19.0,41.0,17.0,18.0,27.0,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,-93.38,14.0,3.0,13.0,42.0,19.0,16.0,28.0,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [4]:
quant_train_df.isna().sum()

participant_id                  0
EHQ_EHQ_Total                   0
ColorVision_CV_Score            0
APQ_P_APQ_P_CP                  0
APQ_P_APQ_P_ID                  0
APQ_P_APQ_P_INV                 0
APQ_P_APQ_P_OPD                 0
APQ_P_APQ_P_PM                  0
APQ_P_APQ_P_PP                  0
SDQ_SDQ_Conduct_Problems        0
SDQ_SDQ_Difficulties_Total      0
SDQ_SDQ_Emotional_Problems      0
SDQ_SDQ_Externalizing           0
SDQ_SDQ_Generating_Impact       0
SDQ_SDQ_Hyperactivity           0
SDQ_SDQ_Internalizing           0
SDQ_SDQ_Peer_Problems           0
SDQ_SDQ_Prosocial               0
MRI_Track_Age_at_Scan         360
dtype: int64

In [11]:
quant_test_df.isna().sum()

participant_id                 0
EHQ_EHQ_Total                  1
ColorVision_CV_Score           9
APQ_P_APQ_P_CP                15
APQ_P_APQ_P_ID                15
APQ_P_APQ_P_INV               15
APQ_P_APQ_P_OPD               15
APQ_P_APQ_P_PM                15
APQ_P_APQ_P_PP                15
SDQ_SDQ_Conduct_Problems      30
SDQ_SDQ_Difficulties_Total    30
SDQ_SDQ_Emotional_Problems    30
SDQ_SDQ_Externalizing         30
SDQ_SDQ_Generating_Impact     30
SDQ_SDQ_Hyperactivity         30
SDQ_SDQ_Internalizing         30
SDQ_SDQ_Peer_Problems         30
SDQ_SDQ_Prosocial             30
MRI_Track_Age_at_Scan          0
dtype: int64

## Replace Null Values in Training Dataset with Median
Since the age distribution is slightly right skewed, we might be better off using a median imputation (the mean might be influenced by outliers).
Forward Fill is not ideal unless the data is strictly sequential (e.g., same subjects scanned multiple times).

In [5]:
quant_train_df.fillna({'MRI_Track_Age_at_Scan':quant_train_df['MRI_Track_Age_at_Scan'].median()}, inplace = True)

In [6]:
quant_train_df.isna().sum()

participant_id                0
EHQ_EHQ_Total                 0
ColorVision_CV_Score          0
APQ_P_APQ_P_CP                0
APQ_P_APQ_P_ID                0
APQ_P_APQ_P_INV               0
APQ_P_APQ_P_OPD               0
APQ_P_APQ_P_PM                0
APQ_P_APQ_P_PP                0
SDQ_SDQ_Conduct_Problems      0
SDQ_SDQ_Difficulties_Total    0
SDQ_SDQ_Emotional_Problems    0
SDQ_SDQ_Externalizing         0
SDQ_SDQ_Generating_Impact     0
SDQ_SDQ_Hyperactivity         0
SDQ_SDQ_Internalizing         0
SDQ_SDQ_Peer_Problems         0
SDQ_SDQ_Prosocial             0
MRI_Track_Age_at_Scan         0
dtype: int64

## Replace Null Values in Test Dataset with KNN Imputer

In [15]:
# Count missing values per row
quant_test_df["missing_count"] = quant_test_df.isnull().sum(axis=1)

# Display rows with high missingness
high_missingness = quant_test_df[quant_test_df["missing_count"] > 5]
print(len(high_missingness))
high_missingness


35


Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,missing_count
5,3VbkvJ22j9Fu,95.65,12.0,,,,,,,,,,,,,,,,13.903718,15
8,uM4etVLZrgMg,100.05,14.0,3.0,13.0,42.0,20.0,13.0,26.0,,,,,,,,,,7.516541,9
20,9CH7UxXuznUa,72.3,14.0,3.0,17.0,40.0,19.0,14.0,25.0,,,,,,,,,,7.96281,9
32,GYQvyFy7QF7z,92.31,14.0,4.0,17.0,32.0,13.0,28.0,19.0,,,,,,,,,,18.54255,9
43,pJXKagbu6k5J,86.71,13.0,3.0,16.0,43.0,17.0,11.0,29.0,,,,,,,,,,8.794775,9
45,HynLNt7eOUu6,93.38,14.0,,,,,,,,,,,,,,,,10.342003,15
49,z0TcjaAHc8af,-75.64,14.0,,,,,,,,,,,,,,,,21.430412,15
52,LbvK4T5h6Bgg,100.05,14.0,3.0,14.0,34.0,25.0,10.0,24.0,,,,,,,,,,8.565708,9
61,S2aUm7iSCh8K,66.7,5.0,,,,,,,1.0,11.0,2.0,9.0,2.0,8.0,2.0,0.0,10.0,17.819415,6
63,WswtU2xjkwrJ,4.47,13.0,3.0,21.0,48.0,16.0,11.0,30.0,,,,,,,,,,7.139402,9


In [18]:
# merge quant_test_df and cat_test_df and see if it's at a specific location this happened
cat_test_df = pd.read_excel("data/TEST/TEST_CATEGORICAL.xlsx")

# merge the two dataframes
test_df = pd.merge(quant_test_df, cat_test_df, on="participant_id")

In [20]:
test_df['missing_count'] = test_df.isnull().sum(axis=1)
high_missingness = test_df[test_df["missing_count"] > 5]
print(len(high_missingness))
high_missingness

35


Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,missing_count,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
5,3VbkvJ22j9Fu,95.65,12.0,,,,,,,,...,15,2023,4,0.0,1.0,4,21.0,45.0,18.0,45.0
8,uM4etVLZrgMg,100.05,14.0,3.0,13.0,42.0,20.0,13.0,26.0,,...,9,2023,4,1.0,2.0,4,18.0,40.0,18.0,5.0
20,9CH7UxXuznUa,72.3,14.0,3.0,17.0,40.0,19.0,14.0,25.0,,...,9,2022,4,0.0,3.0,4,21.0,45.0,21.0,35.0
32,GYQvyFy7QF7z,92.31,14.0,4.0,17.0,32.0,13.0,28.0,19.0,,...,9,2022,4,0.0,3.0,4,18.0,0.0,18.0,45.0
43,pJXKagbu6k5J,86.71,13.0,3.0,16.0,43.0,17.0,11.0,29.0,,...,9,2023,4,0.0,0.0,4,21.0,45.0,21.0,45.0
45,HynLNt7eOUu6,93.38,14.0,,,,,,,,...,15,2022,4,0.0,8.0,4,21.0,45.0,18.0,30.0
49,z0TcjaAHc8af,-75.64,14.0,,,,,,,,...,21,2022,4,,,4,,,,
52,LbvK4T5h6Bgg,100.05,14.0,3.0,14.0,34.0,25.0,10.0,24.0,,...,11,2023,4,0.0,0.0,4,18.0,15.0,,
61,S2aUm7iSCh8K,66.7,5.0,,,,,,,1.0,...,6,2023,4,1.0,0.0,4,18.0,30.0,18.0,30.0
63,WswtU2xjkwrJ,4.47,13.0,3.0,21.0,48.0,16.0,11.0,30.0,,...,9,2023,4,0.0,8.0,4,18.0,30.0,15.0,45.0


In [24]:
test_df['Basic_Demos_Study_Site'].value_counts()

Basic_Demos_Study_Site
4    301
5      3
Name: count, dtype: int64

In [23]:
high_missingness['Basic_Demos_Study_Site'].value_counts()

Basic_Demos_Study_Site
4    35
Name: count, dtype: int64

In [26]:
# Replace null values with knn imputer
knn_columns = quant_test_df.columns[quant_test_df.isna().any()].tolist()

imputer = KNNImputer(n_neighbors=5)

# Apply imputation to selected columns
quant_test_df[knn_columns] = imputer.fit_transform(quant_test_df[knn_columns])

# Verify missing values are handled
print(quant_test_df.isnull().sum())


participant_id                0
EHQ_EHQ_Total                 0
ColorVision_CV_Score          0
APQ_P_APQ_P_CP                0
APQ_P_APQ_P_ID                0
APQ_P_APQ_P_INV               0
APQ_P_APQ_P_OPD               0
APQ_P_APQ_P_PM                0
APQ_P_APQ_P_PP                0
SDQ_SDQ_Conduct_Problems      0
SDQ_SDQ_Difficulties_Total    0
SDQ_SDQ_Emotional_Problems    0
SDQ_SDQ_Externalizing         0
SDQ_SDQ_Generating_Impact     0
SDQ_SDQ_Hyperactivity         0
SDQ_SDQ_Internalizing         0
SDQ_SDQ_Peer_Problems         0
SDQ_SDQ_Prosocial             0
MRI_Track_Age_at_Scan         0
missing_count                 0
dtype: int64


In [27]:
quant_test_df

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,missing_count
0,Cfwaf5FX7jWK,60.03,14.0,5.0,16.0,41.0,19.0,11.0,26.0,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813,0
1,vhGrzmvA3Hjq,86.71,12.0,3.0,13.0,43.0,18.0,15.0,28.0,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093,0
2,ULliyEXjy4OV,26.68,13.0,3.0,14.0,36.0,16.0,14.0,25.0,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933,0
3,LZfeAb1xMtql,93.38,13.0,3.0,19.0,41.0,17.0,18.0,27.0,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814,0
4,EnFOUv0YK1RG,-93.38,14.0,3.0,13.0,42.0,19.0,16.0,28.0,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.261350,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,UadZfjdEg7eG,86.71,14.0,3.0,17.0,41.0,17.0,11.0,25.0,1.0,18.0,7.0,7.0,5.0,6.0,11.0,4.0,7.0,7.546999,0
300,IUEHiLmQAqCi,73.37,14.0,5.0,12.0,38.0,22.0,9.0,29.0,2.0,16.0,2.0,11.0,5.0,9.0,5.0,3.0,8.0,10.531143,0
301,cRySmCadYFRO,87.84,13.0,3.0,14.0,42.0,15.0,10.0,28.0,1.0,11.0,4.0,4.0,4.0,3.0,7.0,3.0,10.0,7.210586,0
302,E3MvDUtJadc5,46.76,14.0,3.0,16.0,43.0,18.0,12.0,22.0,5.0,21.0,2.0,10.0,6.0,5.0,11.0,9.0,0.0,12.212183,0


## Save to CSV File

In [28]:
# save to csv file
quant_train_df.to_csv("data/TRAIN/PREPROCESSED_QUANT_TRAIN_DATA.csv", index=False)
quant_test_df.to_csv("data/TEST/PREPROCESSED_QUANT_TEST_DATA.csv", index=False)