# 1. Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif, mutual_info_classif
from scipy.stats import pointbiserialr
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by pandas / scipy / scikit-learn — consider narrowing it.
warnings.filterwarnings("ignore")
# Apply seaborn's "whitegrid" theme to all subsequent plots
sns.set_theme(style="whitegrid")

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Shared random seed, used as `random_state` further down
SEED  = 42

In [2]:
# Load the processed feature tables, indexed by participant.
train = pd.read_csv('train_processed.csv').set_index("participant_id")
test = pd.read_csv('test_processed.csv').set_index("participant_id")

# Training targets (Sex_F, ADHD_Outcome), also indexed by participant.
labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Put both tables in the same participant order so rows line up positionally.
train = train.sort_index()
labels = labels.sort_index()

# Compare lengths first: an element-wise `==` between indexes of different
# lengths raises ValueError, which would hide the assertion message below.
assert len(train) == len(labels) and (train.index == labels.index).all(), \
    "Label IDs do not match train IDs"

# Work on a copy so `train` itself stays untouched by later cells.
X_train = train.copy()
y_train_sex = labels['Sex_F']
y_train_adhd = labels['ADHD_Outcome']

In [3]:
# Disabled exploratory cell: count plots of the two targets (ADHD_Outcome and
# Sex_F). Everything below is commented out; uncomment to draw the plots.
# label_df = labels.copy()

# # Plot: ADHD_Outcome
# plt.figure(figsize=(4, 4))
# sns.countplot(x=label_df["ADHD_Outcome"], palette="Set2")
# plt.title("Distribution of ADHD_Outcome")
# plt.xticks([0, 1], ['No ADHD (0)', 'ADHD (1)'])
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

# # Plot: Sex_F
# plt.figure(figsize=(4, 4))
# sns.countplot(x=label_df["Sex_F"], palette="pastel")
# plt.title("Distribution of Sex_F")
# plt.xticks([0, 1], ['Male (0)', 'Female (1)'])
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

# 2. Feature Correlation

In [4]:
def _point_biserial_r(values, target):
    """Return the point-biserial correlation between `values` and `target`.

    Falls back to NaN when scipy rejects the input (ValueError / TypeError),
    so one unusable column does not abort the whole scan. Other exceptions,
    including KeyboardInterrupt, propagate instead of being swallowed.
    """
    try:
        r, _ = pointbiserialr(values, target)
    except (ValueError, TypeError):
        return float('nan')
    return r


# F-test statistics / p-values and mutual information of every feature
# against the sex target.
F_sex, p_sex = f_classif(X_train, y_train_sex)
mi_sex = mutual_info_classif(X_train, y_train_sex, random_state=SEED)

# Same statistics against the ADHD target.
F_adhd, p_adhd = f_classif(X_train, y_train_adhd)
mi_adhd = mutual_info_classif(X_train, y_train_adhd, random_state=SEED)

# One correlation per feature, in X_train column order. Each target is computed
# independently, so a failure for one target no longer blanks out the other.
corr_sex = [_point_biserial_r(X_train[feature], y_train_sex) for feature in X_train.columns]
corr_adhd = [_point_biserial_r(X_train[feature], y_train_adhd) for feature in X_train.columns]

In [6]:
# Number of feature columns available for selection
X_train.shape[1]

26

In [5]:
features = X_train.columns


def _stats_table(mi, p_value, correlation):
    """Assemble one row per feature and order rows by ascending p-value.

    The F statistic itself is deliberately left out of the table; only its
    p-value is kept.
    """
    table = pd.DataFrame({
        "feature": features,
        "MI": mi,
        "p_value": p_value,
        "correlation": correlation,
    })
    return table.sort_values(by="p_value", ascending=True)


# Per-target summary tables, most significant feature first
df_sex = _stats_table(mi_sex, p_sex, corr_sex)
df_adhd = _stats_table(mi_adhd, p_adhd, corr_adhd)

In [7]:
# Per-feature statistics for the Sex_F target, sorted by ascending p-value
df_sex

Unnamed: 0,feature,MI,p_value,correlation
13,SDQ_SDQ_Hyperactivity,0.033005,8.508961e-07,-0.140801
16,SDQ_SDQ_Prosocial,0.017117,1.686387e-05,0.1232
11,SDQ_SDQ_Externalizing,0.00216,2.73256e-05,-0.120127
10,SDQ_SDQ_Emotional_Problems,0.016073,0.000169272,0.107782
1,ColorVision_CV_Score,0.0,0.0002374002,0.105347
7,APQ_P_APQ_P_PP,0.0,0.0154967,-0.069488
4,APQ_P_APQ_P_INV,0.037198,0.02605546,-0.063897
5,APQ_P_APQ_P_OPD,0.0,0.03451433,-0.060705
14,SDQ_SDQ_Internalizing,0.001086,0.0478766,0.056819
8,SDQ_SDQ_Conduct_Problems,0.0,0.0824584,-0.049882


In [None]:
# Per-feature statistics for the ADHD_Outcome target, sorted by ascending p-value
df_adhd

# 3. Feature Engineering

## 3.1 Arbitrary Threshold

In [None]:
# Selection thresholds for the sex target. The correlation cut-off used here
# is 0.05 (the ADHD filter uses 0.1); the previous comment wrongly stated 0.1.
SEX_P_VALUE_MAX = 0.05    # keep features with p-value below this
SEX_MIN_ABS_CORR = 0.05   # keep features with |correlation| above this

# Keep features that pass both the p-value and the |correlation| threshold
is_significant = df_sex["p_value"] < SEX_P_VALUE_MAX
is_correlated = df_sex["correlation"].abs() > SEX_MIN_ABS_CORR
df_sex_filtered = df_sex[is_significant & is_correlated]

print("Selected features for sex prediction based on p-value and correlation thresholds:")
print(df_sex_filtered['feature'].tolist())

In [None]:
# Selection thresholds for the ADHD target
ADHD_P_VALUE_MAX = 0.05   # keep features with p-value below this
ADHD_MIN_ABS_CORR = 0.1   # keep features with |correlation| above this

# Keep features that pass both the p-value and the |correlation| threshold
is_significant = df_adhd["p_value"] < ADHD_P_VALUE_MAX
is_correlated = df_adhd["correlation"].abs() > ADHD_MIN_ABS_CORR
df_adhd_filtered = df_adhd[is_significant & is_correlated]

# The heading previously said "sex prediction" (copy-paste from the cell above)
print("Selected features for ADHD prediction based on p-value and correlation thresholds:")
print(df_adhd_filtered['feature'].tolist())

In [None]:
# For the ADHD target
top10_p_adhd = df_adhd.nsmallest(10, "p_value")

# Rank by the magnitude of the correlation: a strongly negative correlation is
# as informative as a strongly positive one, and the threshold filters above
# already compare |correlation|. Ranking the signed value would drop every
# negatively correlated feature. NaN correlations are excluded by nlargest.
strongest_corr_idx = df_adhd["correlation"].abs().nlargest(10).index
top10_corr_adhd = df_adhd.loc[strongest_corr_idx]

top10_mi_adhd = df_adhd.nlargest(10, "MI")

print("\nTop 10 Features for ADHD Prediction based on p-value:")
print(top10_p_adhd)
print("\nTop 10 Features for ADHD Prediction based on correlation:")
print(top10_corr_adhd)
print("\nTop 10 Features for ADHD Prediction based on Mutual Information:")
print(top10_mi_adhd)

## 3.2 Feature Selection Using a Tree-Based Algorithm

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a random forest on the ADHD target; its feature importances drive the
# selection below. Uses the notebook-wide SEED (42) for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(X_train, y_train_adhd)  # the same procedure is applied to y_train_sex in a later cell

# `rf` is already fitted, so pass it with prefit=True: the selector then uses
# the fitted forest directly and transform()/get_support() are valid without
# calling selector.fit(). Without prefit=True, calling transform() on an
# unfitted selector is version-dependent and may raise.
# threshold="median" keeps features whose importance is at least the median.
selector = SelectFromModel(rf, threshold="median", prefit=True)
X_train_selected = selector.transform(X_train)
selected_features = X_train.columns[selector.get_support()]

print(selected_features.tolist())

['EHQ_EHQ_Total', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'MRI_Track_Age_at_Scan', 'Basic_Demos_Enroll_Year']


In [21]:
# Frozen copy of the features selected for the ADHD target, one per line and
# grouped by column-name prefix.
features_adhd = [
    # EHQ
    'EHQ_EHQ_Total',
    # APQ_P
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    # SDQ
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    # MRI_Track / Basic_Demos
    'MRI_Track_Age_at_Scan',
    'Basic_Demos_Enroll_Year',
]

len(features_adhd)

13

In [19]:
# Fit a dedicated forest for the sex target. Previously `rf_sex` was created
# but `rf` was the one fitted and passed to the selector, which overwrote the
# ADHD model and left `rf_sex` unused.
rf_sex = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_sex.fit(X_train, y_train_sex)

# `rf_sex` is already fitted, so pass it with prefit=True: transform() and
# get_support() are then valid without calling selector.fit().
# threshold="median" keeps features whose importance is at least the median.
selector = SelectFromModel(rf_sex, threshold="median", prefit=True)
X_train_selected = selector.transform(X_train)
selected_features = X_train.columns[selector.get_support()]

print(selected_features.tolist())

['EHQ_EHQ_Total', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan']


In [20]:
# Frozen copy of the features selected for the sex target, one per line and
# grouped by column-name prefix.
features_sex = [
    # EHQ
    'EHQ_EHQ_Total',
    # APQ_P
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    # SDQ
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Prosocial',
    # MRI_Track
    'MRI_Track_Age_at_Scan',
]

len(features_sex)

13