# 1. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif, mutual_info_classif
from scipy.stats import pointbiserialr

# Set visual style
# Use seaborn's "whitegrid" theme for the plots in this notebook.
sns.set_theme(style="whitegrid")

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Shared random seed; presumably intended for the stochastic steps further
# down in the notebook — TODO confirm every such step actually uses it.
SEED  = 42

In [None]:
# Load the processed feature tables and the training labels, each indexed by
# participant_id.
train = pd.read_csv('train_processed.csv').set_index("participant_id")
test = pd.read_csv('test_processed.csv').set_index("participant_id")

labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Sort both tables by participant_id so feature rows and label rows line up
# positionally for the scoring functions used later.
train = train.sort_index()
labels = labels.sort_index()

# Index.equals copes with indexes of different length; an elementwise `==`
# would raise "Lengths must match to compare" before the assertion message
# could ever be shown.
assert train.index.equals(labels.index), "Label IDs do not match train IDs"

In [None]:
# Show which columns the labels table provides.
labels.columns

# 2. Feature Correlation

In [None]:
# Preview the first rows of the processed training features.
train.head()

In [None]:
# Names of the two target columns held in `labels`.
target_columns = ['Sex_F', 'ADHD_Outcome']

# The feature matrix is an independent copy of the training frame; each target
# is pulled out of `labels` as its own Series.
X_train = train.copy()
y_train_sex, y_train_adhd = labels['Sex_F'], labels['ADHD_Outcome']

# Quick shape check of the feature matrix and both targets.
for _caption, _obj in (
    ("Features shape:", X_train),
    ("Sex target shape:", y_train_sex),
    ("ADHD target shape:", y_train_adhd),
):
    print(_caption, _obj.shape)

In [None]:
# Univariate feature scores against each binary target:
#   * ANOVA F-statistic and its p-value (f_classif)
#   * mutual information (mutual_info_classif), seeded with the notebook SEED
#     instead of a repeated literal so the seed is controlled in one place
#   * point-biserial correlation (pointbiserialr)
F_sex, p_sex = f_classif(X_train, y_train_sex)
mi_sex = mutual_info_classif(X_train, y_train_sex, random_state=SEED)

F_adhd, p_adhd = f_classif(X_train, y_train_adhd)
mi_adhd = mutual_info_classif(X_train, y_train_adhd, random_state=SEED)


def _point_biserial_or_nan(values, target):
    """Return the point-biserial correlation between `values` and `target`.

    Args:
        values: one feature column.
        target: the binary target, aligned row-for-row with `values`.

    Returns:
        The correlation coefficient, or NaN when scipy rejects the input
        (ValueError / TypeError). Other exceptions, including
        KeyboardInterrupt, propagate instead of being silently swallowed.
    """
    try:
        coefficient, _ = pointbiserialr(values, target)
    except (ValueError, TypeError):
        return float('nan')
    return coefficient


# Each target is scored independently, so a failure for one target no longer
# blanks out the correlation for the other.
corr_sex = [_point_biserial_or_nan(X_train[col], y_train_sex) for col in X_train.columns]
corr_adhd = [_point_biserial_or_nan(X_train[col], y_train_adhd) for col in X_train.columns]

In [None]:
from scipy.stats import pointbiserialr

# Explicit list of numeric feature columns, one per line and grouped by
# column-name prefix.
numeric_features = [
    # APQ_P_* columns
    'APQ_P_APQ_P_CP',
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    # SDQ_* columns
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Prosocial',
    # MRI_Track_* column
    'MRI_Track_Age_at_Scan',
]

In [None]:
features = X_train.columns


def _score_table(mi, f_stat, p_value, correlation):
    """Build a one-row-per-feature table of scores for a single target.

    The rows are returned sorted by the signed correlation, largest first.
    """
    table = pd.DataFrame({
        "feature": features,
        "MI": mi,
        "F_stat": f_stat,
        "p_value": p_value,
        "correlation": correlation,
    })
    return table.sort_values(by="correlation", ascending=False)


# One score table per target.
df_sex = _score_table(mi_sex, F_sex, p_sex, corr_sex)
df_adhd = _score_table(mi_adhd, F_adhd, p_adhd, corr_adhd)

In [None]:
# Display the per-feature score table for the Sex_F target.
df_sex

In [None]:
# Display the per-feature score table for the ADHD_Outcome target.
df_adhd

In [None]:
# Selection thresholds: F-test significance level and the median MI score
# (alternative considered: df_adhd["MI"].quantile(0.75)).
p_value_thresh = 0.05
mi_thresh = df_adhd["MI"].median()

# Keep features that pass both criteria: p-value below the threshold and MI
# strictly above the median.
_adhd_significant = df_adhd["p_value"].lt(p_value_thresh)
_adhd_informative = df_adhd["MI"].gt(mi_thresh)
selected_features_adhd = df_adhd[_adhd_significant & _adhd_informative]

print("✅ Features selected for ADHD_Outcome based on thresholds:")
_adhd_by_mi = selected_features_adhd.sort_values(by="MI", ascending=False)
print(_adhd_by_mi)

In [None]:
# MI threshold for Sex_F: the median MI score. The p-value threshold is the
# one already defined as p_value_thresh.
mi_thresh_sex = df_sex["MI"].median()

# Keep features that pass both criteria: p-value below the threshold and MI
# strictly above the median.
_sex_significant = df_sex["p_value"].lt(p_value_thresh)
_sex_informative = df_sex["MI"].gt(mi_thresh_sex)
selected_features_sex = df_sex[_sex_significant & _sex_informative]

print("✅ Features selected for Sex_F based on thresholds:")
_sex_by_mi = selected_features_sex.sort_values(by="MI", ascending=False)
print(_sex_by_mi)

In [None]:
# Bar charts of the F-test p-values (lower is better): one figure per target,
# with features ordered from smallest to largest p-value.
for _title, _scores in (
    ("p-values from F-test for Sex_F", df_sex),
    ("p-values from F-test for ADHD_Outcome", df_adhd),
):
    _ordered = _scores.sort_values(by="p_value")
    plt.figure(figsize=(12,6))
    sns.barplot(x="p_value", y="feature", data=_ordered)
    plt.title(_title)
    plt.xlabel("p-value")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

In [None]:
# Bar charts of the mutual information scores: one figure per target.
# The score tables are stored in correlation order, so each is re-sorted by MI
# (highest first) here; this makes the bars read as a ranking, matching how the
# p-value charts sort by their own metric.
for _title, _scores in (
    ("Mutual Information Scores for Sex_F", df_sex),
    ("Mutual Information Scores for ADHD_Outcome", df_adhd),
):
    _ranked = _scores.sort_values(by="MI", ascending=False)
    plt.figure(figsize=(12,6))
    sns.barplot(x="MI", y="feature", data=_ranked)
    plt.title(_title)
    plt.xlabel("Mutual Information")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

# 3. Feature Engineering