# 1. Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import LabelEncoder

import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation/future warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set(style="whitegrid")

# 2. Data Preprocessing

In [None]:
# Random seed; passed as random_state to estimators later in the notebook.
SEED = 42

# Raw metadata: one quantitative and one categorical sheet per split.
train_q = pd.read_excel("data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx")
train_c = pd.read_excel("data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx")
test_q = pd.read_excel("data/TEST/TEST_QUANTITATIVE_METADATA.xlsx")
test_c = pd.read_excel("data/TEST/TEST_CATEGORICAL_METADATA.xlsx")

# Join the two sheets on participant and key each frame by participant_id.
train_combined = pd.merge(train_q, train_c, on='participant_id', how='left').set_index("participant_id")
test_combined = pd.merge(test_q, test_c, on='participant_id', how='left').set_index("participant_id")

# Targets, keyed the same way as the features.
labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Sort both by participant_id so features and targets line up row by row.
train_combined = train_combined.sort_index()
labels = labels.sort_index()

# Index.equals is length-safe: an elementwise `==` between indexes of different
# lengths raises ValueError before the assertion message can be shown.
assert train_combined.index.equals(labels.index), "Label IDs do not match train IDs"

In [None]:
# Preview the merged training frame (features only; targets live in `labels`).
train_combined

In [None]:
# List every column available in the merged training frame.
train_combined.columns

In [None]:
# Work on a copy so the plotting code never touches the original targets.
label_df = labels.copy()

# Plot: ADHD_Outcome — tick labels spell out what 0 and 1 mean.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=label_df["ADHD_Outcome"], palette="Set2", ax=ax)
ax.set_title("Distribution of ADHD_Outcome")
ax.set_xticks([0, 1])
ax.set_xticklabels(['No ADHD (0)', 'ADHD (1)'])
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# Plot: Sex_F — same layout, different palette.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=label_df["Sex_F"], palette="pastel", ax=ax)
ax.set_title("Distribution of Sex_F")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Male (0)', 'Female (1)'])
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


This section explains the features present in the dataset.

## Numerical features: Total of 18 features
+ EHQ_EHQ_Total: laterality index (LI) score (float) || -100 = 10th left, -28 ≤ LI < 48 = middle, 100 = 10th right
+ ColorVision_CV_Score: color vision test score (int)
+ MRI_Track_Age_at_Scan: Age at time of MRI scan (float)

### ALABAMA PARENTING QUESTIONNAIRE - PARENT REPORT (INT)
+ APQ_P_APQ_P_CP: Reflects the frequency or severity of corporal punishment used by parents
+ APQ_P_APQ_P_ID: Measures inconsistency in parental discipline
+ APQ_P_APQ_P_INV: Indicates the level of parental involvement in the child’s life
+ APQ_P_APQ_P_OPD: Other Discipline Practices Score (Not factored into total score but provides item level information)
+ APQ_P_APQ_P_PM: Reflects how well a parent monitors and supervises their child
+ APQ_P_APQ_P_PP: Captures the extent of positive reinforcement and supportive parenting

### Strength and Difficulties Questionnaire (INT)
+ SDQ_SDQ_Conduct_Problems: Measures behavioral issues related to rule-breaking or aggression (higher score = more prone to ADHD)
+ SDQ_SDQ_Difficulties_Total: A composite measure summarizing overall difficulties across several behavioral domains
+ SDQ_SDQ_Emotional_Problems: Focuses on internal emotional difficulties such as anxiety or depression (social related)
+ SDQ_SDQ_Externalizing: Captures outward-directed behaviors such as hyperactivity, impulsivity, and conduct issues
+ SDQ_SDQ_Generating_Impact: This might reflect the overall impact of the child’s behavioral problems on their social and academic life
+ SDQ_SDQ_Hyperactivity: Directly measures the hyperactive and impulsive behaviors central to many ADHD diagnoses (HIGHLY CORRELATED FEATURE)
+ SDQ_SDQ_Internalizing: Reflects inward-focused behaviors such as social withdrawal and anxiety
+ SDQ_SDQ_Peer_Problems: Assesses difficulties in interacting with peers
+ SDQ_SDQ_Prosocial: Evaluates positive social behaviors like empathy and cooperation

## Features Informative For Predicting Sex (Based on domain knowledge only) (17 FEATURES)
+ EHQ_EHQ_Total (VERY IMPORTANT)
+ ColorVision_CV_Score (VERY IMPORTANT)

!! (APQ_ features) Differences in discipline or supervision could interact with gender expectations and thus be indirectly predictive of sex.
+ APQ_P_APQ_P_CP
+ APQ_P_APQ_P_ID
+ APQ_P_APQ_P_INV
+ APQ_P_APQ_P_OPD
+ APQ_P_APQ_P_PM
+ APQ_P_APQ_P_PP
+ SDQ_SDQ_Conduct_Problems
+ SDQ_SDQ_Difficulties_Total
+ SDQ_SDQ_Emotional_Problems
+ SDQ_SDQ_Externalizing
+ SDQ_SDQ_Generating_Impact
+ SDQ_SDQ_Hyperactivity
+ SDQ_SDQ_Internalizing
+ SDQ_SDQ_Peer_Problems
+ SDQ_SDQ_Prosocial

ALL APQ_ AND SDQ_ FEATURES ARE SOMEWHAT IMPORTANT

## Features Informative For Predicting ADHD (Based on domain knowledge only) (18 FEATURES)
+ EHQ_EHQ_Total (SOMEWHAT IMPORTANT)
+ ColorVision_CV_Score
+ MRI_Track_Age_at_Scan (VERY IMPORTANT)
+ APQ_P_APQ_P_CP
+ APQ_P_APQ_P_ID (SOMEWHAT IMPORTANT)
+ APQ_P_APQ_P_INV (VERY IMPORTANT)
+ APQ_P_APQ_P_OPD
+ APQ_P_APQ_P_PM (SOMEWHAT IMPORTANT)
+ APQ_P_APQ_P_PP (VERY IMPORTANT)
+ SDQ_SDQ_Conduct_Problems (SOMEWHAT IMPORTANT)
+ SDQ_SDQ_Difficulties_Total (SOMEWHAT IMPORTANT)
+ SDQ_SDQ_Emotional_Problems
+ SDQ_SDQ_Externalizing (VERY IMPORTANT)
+ SDQ_SDQ_Generating_Impact
+ SDQ_SDQ_Hyperactivity (VERY IMPORTANT)
+ SDQ_SDQ_Internalizing (VERY IMPORTANT)
+ SDQ_SDQ_Peer_Problems
+ SDQ_SDQ_Prosocial

## 2.1 Numerical Features Visualization

In [None]:
# Numerical columns, in the order used for every column selection below.
numerical_features = [
    # Handedness and color vision
    'EHQ_EHQ_Total',
    'ColorVision_CV_Score',
    # Alabama Parenting Questionnaire (parent report)
    'APQ_P_APQ_P_CP',
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    # Strengths and Difficulties Questionnaire
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Prosocial',
    # Age at MRI scan
    'MRI_Track_Age_at_Scan',
]

print(len(numerical_features))

In [None]:
# Training frame restricted to the numerical feature columns.
numerical_features_df = train_combined[numerical_features]

def nan_summary(df, name):
    """Print the percentage of missing values per column, highest first.

    Args:
        df: DataFrame to inspect.
        name: Label shown in the report header.

    Returns:
        None. The report is written to stdout.
    """
    header = f"---- {name} NAN SUMMARY ----"
    # Share of missing cells per column, expressed in percent.
    missing_pct = df.isna().sum().div(len(df)).mul(100)
    print(header)
    print(missing_pct.sort_values(ascending=False), '\n')

# Missing-value report for the numerical training features.
nan_summary(numerical_features_df, "TRAIN NUMERICAL FEATURES")

### 2.1.1 EHQ_EHQ_Total

In [None]:
# Median-impute the laterality score; each split is filled with its own median.
#
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the column selection: the in-place form acts on an intermediate object,
# is deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
median_val = train_combined["EHQ_EHQ_Total"].median()
train_combined["EHQ_EHQ_Total"] = train_combined["EHQ_EHQ_Total"].fillna(median_val)
print("Missing values after fill:", train_combined["EHQ_EHQ_Total"].isna().sum())

median_test_val = test_combined["EHQ_EHQ_Total"].median()
test_combined["EHQ_EHQ_Total"] = test_combined["EHQ_EHQ_Total"].fillna(median_test_val)
print("Missing values after fill:", test_combined["EHQ_EHQ_Total"].isna().sum())

In [None]:
# Box plot of the laterality score in the training data.
plt.figure(figsize=(10, 4))
sns.boxplot(data=train_combined, x='EHQ_EHQ_Total')
# Title previously contained a stray "|" inside the column name.
plt.title("Box Plot of EHQ_EHQ_Total")
plt.xlabel("EHQ_EHQ_Total")
plt.show()

In [None]:
def categorize_laterality(li):
    """Bucket a laterality index (LI) into left / middle / right.

    The cut-offs follow the rule documented for EHQ_EHQ_Total in this
    notebook: -28 <= LI < 48 is the middle band, anything below is
    left-lateralized and anything at or above 48 is right-lateralized.
    The previous version put exactly -28 in the left band and values
    strictly between 47 and 48 in the right band.

    Args:
        li: Laterality index as a number (may be NaN).

    Returns:
        "Left-Lateralized", "Middle" or "Right-Lateralized"; None when the
        score is NaN or outside the accepted [-110, 110] window.
    """
    # NaN fails every comparison, so it is rejected here together with
    # out-of-window scores.
    if not (-110 <= li <= 110):
        return None
    if li < -28:
        return "Left-Lateralized"
    if li < 48:
        return 'Middle'
    return 'Right-Lateralized'

# Derive the categorical laterality column from the numeric score in both splits.
for frame in (train_combined, test_combined):
    frame['Laterality_Category'] = frame['EHQ_EHQ_Total'].apply(categorize_laterality)

In [None]:
# Count of training participants per laterality bucket, in left-to-right order.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(
    data=train_combined,
    x="Laterality_Category",
    order=["Left-Lateralized", "Middle", "Right-Lateralized"],
    ax=ax,
)
ax.set_title("Distribution of Laterality Index Categories")
ax.set_xlabel("Brain Laterality")
ax.set_ylabel("Number of Participants")
fig.tight_layout()
plt.show()

### 2.1.2 ColorVision_CV_Score

In [None]:
# Fill missing color-vision scores with the fixed value 14 in both splits.
# NOTE(review): the choice of 14 is not explained in this notebook — confirm.
#
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the column selection: the in-place form acts on an intermediate object,
# is deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
train_combined['ColorVision_CV_Score'] = train_combined['ColorVision_CV_Score'].fillna(14)
print("Missing values after fill:", train_combined["ColorVision_CV_Score"].isna().sum())

test_combined['ColorVision_CV_Score'] = test_combined['ColorVision_CV_Score'].fillna(14)
print("Missing values after fill:", test_combined["ColorVision_CV_Score"].isna().sum())

In [None]:
# Box plot of the color-vision score in the training data.
fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined['ColorVision_CV_Score'], color="skyblue", ax=ax)
ax.set_title('Boxplot: ColorVision_CV_Score')
ax.set_xlabel('ColorVision_CV_Score')
ax.grid(True)
plt.show()

In [None]:
# train_combined['ColorVision_CV_Score'] = train_combined['ColorVision_CV_Score'].astype('category')
# test_combined['ColorVision_CV_Score'] = test_combined['ColorVision_CV_Score'].astype('category')

In [None]:
# def categorize_vision(score):
#     if score <= 13:
#         return 'not_strong'
#     else:
#         return 'strong'

# train_combined['ColorVision_Level'] = train_combined['ColorVision_CV_Score'].astype(float).apply(categorize_vision)
# test_combined['ColorVision_Level'] = test_combined['ColorVision_CV_Score'].astype(float).apply(categorize_vision)

In [None]:
# plt.figure(figsize=(6, 4))
# sns.countplot(data=train_combined, x='ColorVision_Level', order=['not_strong', 'strong'], palette='pastel')
# plt.title('Distribution of ColorVision Levels')
# plt.xlabel('Color Vision Category')
# plt.ylabel('Count')
# plt.grid(True)
# plt.show()

### 2.1.3 APQ_P_APQ_P_CP

In [None]:
# How many training rows lack the corporal-punishment score?
nan_count = train_combined['APQ_P_APQ_P_CP'].isnull().sum()
print(f"Number of NaN values in APQ_P_APQ_P_CP: {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_cp = train_combined['APQ_P_APQ_P_CP'].median()
median_test_cp = test_combined['APQ_P_APQ_P_CP'].median()

train_combined['APQ_P_APQ_P_CP'] = train_combined['APQ_P_APQ_P_CP'].fillna(value=median_cp)
print(f"Filled NaNs with median: {median_cp}")

test_combined['APQ_P_APQ_P_CP'] = test_combined['APQ_P_APQ_P_CP'].fillna(value=median_test_cp)
print(f"Filled NaNs with median: {median_test_cp}")

In [None]:
cp_scores = train_combined['APQ_P_APQ_P_CP']

# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = cp_scores.quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Here `outliers` is the flagged scores themselves (a Series), not full rows.
outliers = cp_scores[cp_scores.lt(lower_bound) | cp_scores.gt(upper_bound)]
print(f"Number of outliers in APQ_P_APQ_P_CP: {len(outliers)}")
print("Outlier values:")
print(outliers.values)

fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=cp_scores, orient='h', color='lightgreen', ax=ax)
ax.set_title('Boxplot of APQ_P_APQ_P_CP (After NaN Imputation)')
ax.set_xlabel('Corporal Punishment Score')
fig.tight_layout()
plt.show()

In [None]:
# train_combined['APQ_CP_is_high'] = (train_combined['APQ_P_APQ_P_CP'] > 5).astype(int)
# test_combined['APQ_CP_is_high'] = (test_combined['APQ_P_APQ_P_CP'] > 5).astype(int)

In [None]:
# plt.figure(figsize=(6, 4))
# sns.countplot(x=train_combined['APQ_CP_is_high'], palette="Set2")
# plt.title('Distribution of High Corporal Punishment Flag (APQ_CP_is_high)')
# plt.xlabel('Is High Corporal Punishment? (0 = No, 1 = Yes)')
# plt.ylabel('Count')
# plt.xticks([0, 1], ['0 = No', '1 = Yes'])
# plt.tight_layout()
# plt.show()

### 2.1.4: APQ_P_APQ_P_ID

In [None]:
feature = 'APQ_P_APQ_P_ID'
nan_count = train_combined[feature].isna().sum()
print(f"Number of NaN rows in {feature}: {nan_count}")

# Median-impute; each split is filled with its own median.
#
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the column selection: the in-place form acts on an intermediate object,
# is deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
median_value = train_combined[feature].median()
train_combined[feature] = train_combined[feature].fillna(median_value)
print(f"Filled NaNs in {feature} with median value: {median_value}")

median_test_value = test_combined[feature].median()
test_combined[feature] = test_combined[feature].fillna(median_test_value)
print(f"Filled NaNs in {feature} with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in {feature}: {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 2))
sns.boxplot(x=train_combined[feature], orient='h', ax=ax)
ax.set_title(f'Boxplot of {feature}')
plt.show()

### 2.1.5: APQ_P_APQ_P_INV

In [None]:
feature = 'APQ_P_APQ_P_INV'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaNs in '{feature}' with median: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaNs in '{feature}' with median: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")
print(f"Outlier threshold: lower < {lower_bound:.2f}, upper > {upper_bound:.2f}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="skyblue", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.6: APQ_P_APQ_P_OPD

In [None]:
feature = 'APQ_P_APQ_P_OPD'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaNs in '{feature}' with median: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaNs in '{feature}' with median: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")
print(f"Outlier threshold: lower < {lower_bound:.2f}, upper > {upper_bound:.2f}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightgreen", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.7: APQ_P_APQ_P_PM

In [None]:
feature = 'APQ_P_APQ_P_PM'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaNs in '{feature}' with median: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaNs in '{feature}' with median: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")
print(f"Outlier threshold: lower < {lower_bound:.2f}, upper > {upper_bound:.2f}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="salmon", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.8 APQ_P_APQ_P_PP

In [None]:
feature = 'APQ_P_APQ_P_PP'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaNs in '{feature}' with median: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaNs in '{feature}' with median: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")
print(f"Outlier threshold: lower < {lower_bound:.2f}, upper > {upper_bound:.2f}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="plum", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.9 SDQ_SDQ_Conduct_Problems

In [None]:
feature = 'SDQ_SDQ_Conduct_Problems'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaNs in '{feature}' with median: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaNs in '{feature}' with median: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")
print(f"Outlier threshold: lower < {lower_bound:.2f}, upper > {upper_bound:.2f}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.10 SDQ_SDQ_Difficulties_Total

In [None]:
feature = 'SDQ_SDQ_Difficulties_Total'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.11: SDQ_SDQ_Emotional_Problems

In [None]:
feature = 'SDQ_SDQ_Emotional_Problems'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.12: SDQ_SDQ_Externalizing

In [None]:
feature = 'SDQ_SDQ_Externalizing'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.13: SDQ_SDQ_Generating_Impact

In [None]:
feature = 'SDQ_SDQ_Generating_Impact'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.14: SDQ_SDQ_Hyperactivity

In [None]:
feature = 'SDQ_SDQ_Hyperactivity'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.15: SDQ_SDQ_Internalizing

In [None]:
feature = 'SDQ_SDQ_Internalizing'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.16: SDQ_SDQ_Peer_Problems

In [None]:
feature = 'SDQ_SDQ_Peer_Problems'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.17: SDQ_SDQ_Prosocial

In [None]:
feature = 'SDQ_SDQ_Prosocial'

# How many training rows lack this score?
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

# Each split is filled with its own median; both medians are taken before any fill.
median_value = train_combined[feature].median()
median_test_value = test_combined[feature].median()

train_combined[feature] = train_combined[feature].fillna(value=median_value)
print(f"Filled NaN with median value: {median_value}")

test_combined[feature] = test_combined[feature].fillna(value=median_test_value)
print(f"Filled NaN with median value: {median_test_value}")

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

### 2.1.18: MRI_Track_Age_at_Scan

In [None]:
feature = 'MRI_Track_Age_at_Scan'

# Only counted here; no median fill is applied to this column in this cell.
nan_count = train_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")

In [None]:
# Model-based imputation fitted on the numerical training columns.
numeric_data = train_combined[numerical_features].copy()
imputer = IterativeImputer(
    estimator=LassoCV(random_state=SEED),
    max_iter=100,
    random_state=SEED,
)

# fit_transform returns a bare array; re-attach the index and column labels.
imputed_values = imputer.fit_transform(numeric_data)
numeric_data_imputed = pd.DataFrame(
    imputed_values,
    index=numeric_data.index,
    columns=numerical_features,
)

# Update MRI_Track_Age_at_Scan in train (the only column written back)
train_combined['MRI_Track_Age_at_Scan'] = numeric_data_imputed['MRI_Track_Age_at_Scan']

In [None]:
# Prepare test numeric data with same columns
# Test numeric data with the same columns, in the same order, as in train.
test_numeric_data = test_combined[numerical_features].copy()

# Reuse the imputer fitted on train: transform only, no refit on test.
test_imputed_values = imputer.transform(test_numeric_data)
test_numeric_data_imputed = pd.DataFrame(
    test_imputed_values,
    index=test_numeric_data.index,
    columns=numerical_features,
)

# Update only this feature in the test frame.
test_combined['MRI_Track_Age_at_Scan'] = test_numeric_data_imputed['MRI_Track_Age_at_Scan']

In [None]:
# Tukey fences: values more than 1.5 * IQR beyond the quartiles are flagged.
Q1, Q3 = train_combined[feature].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the full rows of the flagged participants, not just the score.
outliers = train_combined.loc[
    train_combined[feature].lt(lower_bound) | train_combined[feature].gt(upper_bound)
]
print(f"Number of outliers in '{feature}': {len(outliers)}")

fig, ax = plt.subplots(figsize=(8, 1.5))
sns.boxplot(x=train_combined[feature], color="lightcoral", ax=ax)
ax.set_title(f'Boxplot of {feature}')
ax.set_xlabel(feature)
fig.tight_layout()
plt.show()

In [None]:
# Remove the two raw score columns from both splits (in place).
# NOTE(review): `numerical_features` still lists these two names — confirm
# nothing downstream indexes the frames with that list.
for frame in (train_combined, test_combined):
    frame.drop(columns=['EHQ_EHQ_Total', 'ColorVision_CV_Score'], inplace=True)

## 2.2 Categorical Features Visualization

Total of 12 features

### Explanations based on domain knowledge
+ Basic_Demos_Enroll_Year: the year when the participant enrolled in the study (int) (NOT VERY IMPORTANT)
+ Basic_Demos_Study_Site: Location/site where the subject was assessed (NOT VERY IMPORTANT)
+ PreInt_Demos_Fam_Child_Ethnicity: Ethnic background of the child
+ PreInt_Demos_Fam_Child_Race: Race of the child
+ MRI_Track_Scan_Location: Where the MRI was performed
+ Barratt_Barratt_P1_Edu: education of the parent 1 (ORDINAL)
+ Barratt_Barratt_P1_Occ: occupation of parent 1 (ORDINAL)
+ Barratt_Barratt_P2_Edu: education of the parent 2 (ORDINAL)
+ Barratt_Barratt_P2_Occ: occupation of parent 2 (ORDINAL)
+ Laterality_Category: Categorical brain lateralization: left, middle, or right
+ ColorVision_Level: Categorical encoding of color vision test (BINARY)
+ APQ_CP_is_high: Is Corporal Punishment score high (>6) (BINARY)

### Features Important in Predicting Sex 
+ PreInt_Demos_Fam_Child_Ethnicity (SOMEWHAT IMPORTANT)
+ PreInt_Demos_Fam_Child_Race (SOMEWHAT IMPORTANT)
+ Barratt_Barratt_P1_Edu (SOMEWHAT IMPORTANT)
+ Barratt_Barratt_P1_Occ (SOMEWHAT IMPORTANT)
+ Barratt_Barratt_P2_Edu (SOMEWHAT IMPORTANT)
+ Barratt_Barratt_P2_Occ (SOMEWHAT IMPORTANT)
+ Laterality_Category (SUPER IMPORTANT)
+ ColorVision_Level (SUPER IMPORTANT)
+ APQ_CP_is_high (SOMEWHAT IMPORTANT)

### Features Important in Predicting ADHD
+ PreInt_Demos_Fam_Child_Ethnicity (SUPER IMPORTANT)
+ PreInt_Demos_Fam_Child_Race (SUPER IMPORTANT)
+ Barratt_Barratt_P1_Edu (SUPER IMPORTANT)
+ Barratt_Barratt_P1_Occ (SUPER IMPORTANT)
+ Barratt_Barratt_P2_Edu (SOMEWHAT IMPORTANT)
+ Barratt_Barratt_P2_Occ (SOMEWHAT IMPORTANT)
+ Laterality_Category (SUPER IMPORTANT)
+ ColorVision_Level (SOMEWHAT IMPORTANT)
+ APQ_CP_is_high (SUPER IMPORTANT)

In [None]:
# Categorical columns handled in section 2.2, in processing order.
# NOTE(review): 'ColorVision_Level' and 'APQ_CP_is_high' are only created in
# cells that are currently commented out — confirm they exist before using
# this list to index the frames.
categorical_features = [
    # Enrollment / site
    'Basic_Demos_Enroll_Year',
    'Basic_Demos_Study_Site',
    # Child demographics
    'PreInt_Demos_Fam_Child_Ethnicity',
    'PreInt_Demos_Fam_Child_Race',
    # Scan site
    'MRI_Track_Scan_Location',
    # Parent education / occupation
    'Barratt_Barratt_P1_Edu',
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Edu',
    'Barratt_Barratt_P2_Occ',
    # Engineered categories
    'Laterality_Category',
    'ColorVision_Level',
    'APQ_CP_is_high',
]

print(len(categorical_features))

### 2.2.1 Basic_Demos_Enroll_Year

In [None]:
feature = 'Basic_Demos_Enroll_Year'

# Count missing values in both splits before any cleaning.
nan_count_train = train_combined[feature].isnull().sum()
nan_count_test = test_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count_train}")
print(f"Number of NaN rows in test feature '{feature}': {nan_count_test}")

In [None]:
# Raw category counts in the training data.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette="Set2", ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Categories seen fewer than `threshold` times in TRAIN are pooled into 'Other'.
threshold = 50

# Step 1: rare categories are determined from TRAIN only; every other value
# is stringified so the column ends up holding strings throughout.
category_counts = train_combined[feature].value_counts()
rare_categories = category_counts.loc[category_counts < threshold].index
train_combined[feature] = train_combined[feature].apply(
    lambda year: str(year) if year not in rare_categories else 'Other'
)

# Step 2: TEST is stringified first so its values are comparable with the
# train categories, then anything train does not contain (rare or unseen)
# is sent to 'Other'.
allowed_categories = set(train_combined[feature].unique())
test_combined[feature] = (
    test_combined[feature]
    .astype(str)
    .apply(lambda year: 'Other' if year not in allowed_categories else year)
)

# Train distribution after grouping, most frequent category first.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(
    x=train_combined[feature],
    palette="Set2",
    order=train_combined[feature].value_counts().index,
    ax=ax,
)
ax.set_title(f"Distribution of {feature} (after grouping rare categories)")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# NOTE(review): this looks like a no-op. The preceding cell rewrote this
# column as strings in both frames, so the integer key 4 should not match
# any value — confirm whether this step is still needed.
train_combined[feature] = train_combined[feature].replace({4: 0})
test_combined[feature] = test_combined[feature].replace({4: 0})

### 2.2.2: Basic_Demos_Study_Site

In [None]:
feature = 'Basic_Demos_Study_Site'

# Count missing values in both splits before any cleaning.
nan_count = train_combined[feature].isnull().sum()
nan_count_test = test_combined[feature].isnull().sum()
print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_count_test}")

In [None]:
# Raw category counts in the training data, most frequent first.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(
    x=train_combined[feature],
    palette='pastel',
    order=train_combined[feature].value_counts().index,
    ax=ax,
)
ax.set_title(f"Distribution of {feature}")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
def _group_rare(value):
    """Return 'Other' for a rare TRAIN category, otherwise the value as a string."""
    return 'Other' if value in rare_categories else str(value)


# Step 1: fold codes 3 and 4 into code 2 in both splits.
merged_codes = {3: 2, 4: 2}
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].replace(merged_codes)

# Step 2: find categories with fewer than `threshold` TRAIN rows.
threshold = 25
category_counts = train_combined[feature].value_counts()
rare_categories = category_counts[category_counts < threshold].index

print(f"Rare categories in '{feature}' (fewer than {threshold} instances):")
print(rare_categories)

# Step 3: relabel the rare categories as 'Other' and stringify the rest.
# The same TRAIN-derived rare list is applied to both splits.
train_combined[feature] = train_combined[feature].apply(_group_rare)
test_combined[feature] = test_combined[feature].apply(_group_rare)

# Step 4: plot the updated TRAIN distribution, most frequent level first.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature} (after combining 2/3/4 and cleaning)")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### 2.2.3: PreInt_Demos_Fam_Child_Ethnicity

In [None]:
feature = 'PreInt_Demos_Fam_Child_Ethnicity'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_count_test = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_count_test}")

In [None]:
# Missing entries receive the code 3.0 in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].fillna(3.0)

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Fold code 3 into code 2 in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].replace({3: 2})

# Re-plot the TRAIN distribution after the merge, most frequent level first.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature} (2 and 3 combined)")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### 2.2.4: PreInt_Demos_Fam_Child_Race

In [None]:
feature = 'PreInt_Demos_Fam_Child_Race'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_test_count = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_test_count}")

In [None]:
# Missing entries receive the code 10.0 in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].fillna(10.0)

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
def _race_group(code):
    """Label the code 0.0 as "White" and every other value as "Non-White"."""
    return "White" if code == 0.0 else "Non-White"


# Step 1: reduce the column to two string labels in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].apply(_race_group)

# Step 2: plot the resulting TRAIN distribution, most frequent label first.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature} (White vs Non-White)")
fig.tight_layout()
plt.show()

### 2.2.5: MRI_Track_Scan_Location

In [None]:
feature = 'MRI_Track_Scan_Location'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_test_count = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_test_count}")

In [None]:
# Missing entries receive the code 4.0 in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].fillna(4.0)

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Fold code 4 into code 1 in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].replace({4: 1})

# Re-plot the TRAIN distribution after the merge, most frequent level first.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### 2.2.6: Barratt_Barratt_P1_Edu

In [None]:
feature = 'Barratt_Barratt_P1_Edu'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_test_count = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_test_count}")

In [None]:
# Missing entries receive the code 18.0 in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].fillna(18.0)

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
def collapse_edu(value):
    """Collapse an education code into one of three string labels.

    Returns "Upper_College" for 21.0, "College" for 18.0 and
    "Under_College" for every other value (a missing value included).
    """
    if value == 21.0:
        return "Upper_College"
    if value == 18.0:
        return "College"
    return "Under_College"
    
# Replace the education codes with their collapsed labels in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].apply(collapse_edu)

In [None]:
# TRAIN counts per collapsed label, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=train_combined[feature], palette='Set2', order=level_order, ax=ax)
ax.set_title(f"Collapsed Distribution of {feature}")
fig.tight_layout()
plt.show()

### 2.2.7: Barratt_Barratt_P1_Occ

In [None]:
feature = 'Barratt_Barratt_P1_Occ'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_test_count = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_test_count}")

In [None]:
# Impute missing values with the TRAIN median in BOTH splits. The fill value
# is a statistic learned during preprocessing, so it must come from training
# data only; using the test set's own median would preprocess the two splits
# differently and fit a statistic on test data.
median_value = train_combined[feature].median()
train_combined[feature] = train_combined[feature].fillna(median_value)
print(f"Filled NaN in '{feature}' with median value: {median_value}")

# `median_test_value` is kept as a name for backward compatibility; it now
# holds the TRAIN median rather than a median computed on the test split.
median_test_value = median_value
test_combined[feature] = test_combined[feature].fillna(median_test_value)
print(f"Filled NaN in test '{feature}' with train median value: {median_test_value}")

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
def collapse_occupation(val):
    """Collapse an occupation code into a coarse string label.

    0/5/10/15 -> "Low", 20/25 -> "LowerMid", 30/35/40 -> "UpperMid",
    45 -> "High". Any other value (a missing value included) -> "Other".
    """
    if val in (0.0, 5.0, 10.0, 15.0):
        return "Low"
    if val in (20.0, 25.0):
        return "LowerMid"
    if val in (30.0, 35.0, 40.0):
        return "UpperMid"
    if val == 45.0:
        return "High"
    # Fallback for every value not listed above.
    return "Other"

feature = "Barratt_Barratt_P1_Occ"

# Replace the occupation codes with their collapsed labels in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].apply(collapse_occupation)

### 2.2.8: Barratt_Barratt_P2_Edu

In [None]:
feature = 'Barratt_Barratt_P2_Edu'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_test_count = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_test_count}")

In [None]:
def collapse_edu(value):
    """Collapse an education code into one of three string labels.

    Missing values (NaN / None) are returned unchanged instead of being
    labelled "Under_College". This column is imputed from the parent-1
    column in the next cell, and that step can only act on rows that are
    still missing after the collapse.

    Returns "Upper_College" for 21.0, "College" for 18.0, the input itself
    when it is missing, and "Under_College" for every other value.
    """
    if pd.isna(value):
        # Keep the gap visible for the fallback imputation that follows.
        return value
    if value == 21.0:
        return "Upper_College"
    elif value == 18.0:
        return "College"
    else:
        return "Under_College"
    
# Run collapse_edu over every entry of the column in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].apply(collapse_edu)

In [None]:
def impute_p2_edu(row):
    """Return the row's parent-2 education value, or parent-1's when it is missing.

    `row` must support lookup by the keys 'Barratt_Barratt_P2_Edu' and
    'Barratt_Barratt_P1_Edu' (e.g. a DataFrame row passed by apply(axis=1)).
    """
    p2_value = row['Barratt_Barratt_P2_Edu']
    return row['Barratt_Barratt_P1_Edu'] if pd.isna(p2_value) else p2_value

# Row-wise fallback: fill a missing parent-2 education value from parent 1.
for frame in (train_combined, test_combined):
    frame['Barratt_Barratt_P2_Edu'] = frame.apply(impute_p2_edu, axis=1)

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### 2.2.9: Barratt_Barratt_P2_Occ

In [None]:
feature = 'Barratt_Barratt_P2_Occ'

# Count the missing entries of the column in each split, then report both.
nan_count = train_combined[feature].isnull().sum()
nan_test_count = test_combined[feature].isnull().sum()

print(f"Number of NaN rows in feature '{feature}': {nan_count}")
print(f"Number of NaN rows in test feature '{feature}': {nan_test_count}")

In [None]:
def collapse_occupation(val):
    """Collapse an occupation code into a coarse string label.

    Missing values (NaN / None) are returned unchanged instead of being
    labelled "Other". This column is imputed from the parent-1 column in a
    later cell, and that step can only act on rows that are still missing
    after the collapse.

    0/5/10/15 -> "Low", 20/25 -> "LowerMid", 30/35/40 -> "UpperMid",
    45 -> "High", missing -> returned as-is, anything else -> "Other".
    """
    if pd.isna(val):
        # Keep the gap visible for the fallback imputation that follows.
        return val
    if val in [0.0, 5.0, 10.0, 15.0]:
        return "Low"
    elif val in [20.0, 25.0]:
        return "LowerMid"
    elif val in [30.0, 35.0, 40.0]:
        return "UpperMid"
    elif val == 45.0:
        return "High"
    else:
        return "Other"  # Safety fallback for non-missing, unlisted codes

feature = "Barratt_Barratt_P2_Occ"

# Run collapse_occupation over every entry of the column in both splits.
for frame in (train_combined, test_combined):
    frame[feature] = frame[feature].apply(collapse_occupation)

In [None]:
def impute_p2_edu(row):
    """Return the row's parent-2 occupation value, or parent-1's when it is missing.

    Despite the "edu" in its name, this definition reads the occupation keys
    'Barratt_Barratt_P2_Occ' and 'Barratt_Barratt_P1_Occ'. It replaces the
    earlier function of the same name.
    """
    p2_value = row['Barratt_Barratt_P2_Occ']
    return row['Barratt_Barratt_P1_Occ'] if pd.isna(p2_value) else p2_value

# Row-wise fallback: fill a missing parent-2 occupation value from parent 1.
for frame in (train_combined, test_combined):
    frame['Barratt_Barratt_P2_Occ'] = frame.apply(impute_p2_edu, axis=1)

In [None]:
# TRAIN counts per level, bars ordered from most to least frequent.
level_order = train_combined[feature].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=train_combined[feature], palette='pastel', order=level_order, ax=ax)
ax.set_title(f"Distribution of {feature}")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Remove the two site/location columns from both splits, in place.
# Re-running this cell raises KeyError because the columns are already gone.
dropped_columns = ['MRI_Track_Scan_Location', 'Basic_Demos_Study_Site']
for frame in (train_combined, test_combined):
    frame.drop(columns=dropped_columns, inplace=True)

### 2.2.10: Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose levels are converted to integer codes.
label_encode_features = [
    'Basic_Demos_Enroll_Year',
    'Barratt_Barratt_P1_Edu',
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Edu',
    'Barratt_Barratt_P2_Occ',
    'Laterality_Category',
]

# Fitted encoder per column, kept in case inverse_transform is needed later.
encoders = {}

for feature in label_encode_features:
    # The encoder is fitted on TRAIN only and then applied to both splits.
    # NOTE(review): LabelEncoder.transform is expected to reject labels that
    # were absent at fit time — confirm TEST has no level missing from TRAIN.
    le = LabelEncoder().fit(train_combined[feature])
    train_combined[feature] = le.transform(train_combined[feature])
    test_combined[feature] = le.transform(test_combined[feature])
    encoders[feature] = le

In [None]:
# Integer codes for the two race labels: White = 1, Non-White = 0.
binary_map = {
    'White': 1,
    'Non-White': 0,
}

# Translate the labels of 'PreInt_Demos_Fam_Child_Race' through the map in both splits.
for frame in (train_combined, test_combined):
    frame['PreInt_Demos_Fam_Child_Race'] = frame['PreInt_Demos_Fam_Child_Race'].map(binary_map)

# 3. Final

In [None]:
# Display the column labels of the processed training frame.
train_combined.columns

In [None]:
# Display the processed test frame.
test_combined

In [None]:
# Final check: print the number of missing values left in each TRAIN column.
for feature, num_missing in train_combined.isna().sum().items():
    print(f"{feature}: {num_missing} missing values")

In [None]:
# Final check: print the number of missing values left in each TEST column.
for feature, num_missing in test_combined.isna().sum().items():
    print(f"{feature}: {num_missing} missing values")

In [None]:
# Move participant_id from the index back into an ordinary column, in place,
# so that the CSV export below (written with index=False) still contains it.
for frame in (train_combined, test_combined):
    frame.reset_index(inplace=True)

In [None]:
# Write both processed splits to CSV files in the working directory.
# The row index is not written.
outputs = (
    (train_combined, 'train_processed.csv'),
    (test_combined, 'test_processed.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)

print("Complete!")