# 1. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif, mutual_info_classif
from scipy.stats import pointbiserialr
import warnings
# Silence every Python warning for the rest of the session. This keeps the
# output tidy but also hides deprecation notices, so comment it out when
# debugging unexpected results.
warnings.filterwarnings("ignore")
# Set visual style
sns.set_theme(style="whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed passed as `random_state` to the stochastic estimators in this notebook.
SEED  = 42

In [None]:
def _load_fmri(csv_path):
    """Read an fMRI feature CSV indexed by participant_id, dropping the stray "Unnamed: 0" column."""
    return (
        pd.read_csv(csv_path)
        .set_index("participant_id")
        .drop(columns=["Unnamed: 0"])
    )


# Pre-processed questionnaire / demographic metadata, one row per participant.
train_metadata = pd.read_csv('train_processed.csv').set_index("participant_id")
test_metadata = pd.read_csv('test_processed.csv').set_index("participant_id")

# fMRI-derived features for the same participants.
train_fmri = _load_fmri("train_fMRI.csv")
test_fmri = _load_fmri("test_fMRI.csv")

# Ground-truth targets (Sex_F and ADHD_Outcome) for the training participants.
labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Column-wise concat aligns rows on participant_id (outer join by default),
# then everything is sorted by ID so features and labels share one row order.
train_combined = pd.concat([train_metadata, train_fmri], axis=1).sort_index()
test_combined = pd.concat([test_metadata, test_fmri], axis=1).sort_index()
labels = labels.sort_index()

# Index.equals compares length, order and values in one call, so a size
# mismatch triggers this assertion message instead of an unrelated
# "Lengths must match" ValueError from the element-wise comparison.
assert train_combined.index.equals(labels.index), "Label IDs do not match train_combined IDs"

y_train_sex = labels['Sex_F']
y_train_adhd = labels['ADHD_Outcome']

print("Merged Training Data Shape:", train_combined.shape)
print("Merged Test Data Shape:", test_combined.shape)

In [None]:
# Names of the fMRI columns averaged into the fMRI composite below.
pca_features = [f'feature_{i}' for i in range(1, 21)]

# Add every engineered column to both splits in a single pass so train and
# test always end up with the same set of features, in the same order.
for df in (train_combined, test_combined):
    # 1. Behaviour composites built from SDQ subscales
    male_score = df['SDQ_SDQ_Hyperactivity'] + df['SDQ_SDQ_Conduct_Problems']
    female_score = df['SDQ_SDQ_Emotional_Problems'] + df['SDQ_SDQ_Peer_Problems']
    df['Behavior_Male_Score'] = male_score
    df['Behavior_Female_Score'] = female_score
    df['Behavior_Imbalance'] = male_score - female_score

    # 2. Parenting composite scores built from APQ subscales
    negative = df['APQ_P_APQ_P_CP'] + df['APQ_P_APQ_P_ID'] + df['APQ_P_APQ_P_PM']
    positive = df['APQ_P_APQ_P_INV'] + df['APQ_P_APQ_P_PP']
    df['Parenting_Negative'] = negative
    df['Parenting_Positive'] = positive
    df['Parenting_Polarity'] = negative - positive

    # 3. Socio-economic status composite: mean of the four Barratt items
    barratt_total = (
        df['Barratt_Barratt_P1_Edu'] + df['Barratt_Barratt_P1_Occ'] +
        df['Barratt_Barratt_P2_Edu'] + df['Barratt_Barratt_P2_Occ']
    )
    df['SES_Score'] = barratt_total / 4.0

    # 4. Handedness flag: 1 when the category text contains "left", else 0.
    # NOTE(review): if Laterality_Category is numerically encoded this flag
    # is always 0 - confirm the column holds text labels.
    df['Handedness_Binary'] = df['Laterality_Category'].apply(
        lambda value: int('left' in str(value).lower())
    )

    # 5. fMRI composite (row-wise mean of the selected fMRI features) and
    #    its interactions with behaviour imbalance and age at scan
    df['fMRI_Composite'] = df[pca_features].mean(axis=1)
    df['fMRI_Behavior_Interaction'] = df['fMRI_Composite'] * df['Behavior_Imbalance']
    df['Age_Adjusted_fMRI'] = df['fMRI_Composite'] * df['MRI_Track_Age_at_Scan']

In [None]:
from sklearn.preprocessing import StandardScaler


def _as_frame(values, like):
    """Wrap a scaled array in a DataFrame carrying the labels of `like`."""
    return pd.DataFrame(values, columns=like.columns, index=like.index)


# Fit the scaler on the training split only, then apply the same
# mean/variance to the test split so no test statistics leak into training.
scaler = StandardScaler()
train_combined = _as_frame(scaler.fit_transform(train_combined), train_combined)
test_combined = _as_frame(scaler.transform(test_combined), test_combined)

In [None]:
# label_df = labels.copy()

# # Plot: ADHD_Outcome
# plt.figure(figsize=(4, 4))
# sns.countplot(x=label_df["ADHD_Outcome"], palette="Set2")
# plt.title("Distribution of ADHD_Outcome")
# plt.xticks([0, 1], ['No ADHD (0)', 'ADHD (1)'])
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

# # Plot: Sex_F
# plt.figure(figsize=(4, 4))
# sns.countplot(x=label_df["Sex_F"], palette="pastel")
# plt.title("Distribution of Sex_F")
# plt.xticks([0, 1], ['Male (0)', 'Female (1)'])
# plt.ylabel("Count")
# plt.tight_layout()
# plt.show()

# 2. Feature Correlation

In [None]:
def _safe_pointbiserial(values, target):
    """Return the point-biserial correlation of `values` with `target`, or NaN if it cannot be computed.

    Only ValueError/TypeError (bad shapes or non-numeric data) are treated as
    "not computable"; anything else, including KeyboardInterrupt, propagates.
    """
    try:
        coefficient, _ = pointbiserialr(values, target)
    except (ValueError, TypeError):
        return float('nan')
    return coefficient


# ANOVA F-test and mutual information of every feature against each target.
F_sex, p_sex = f_classif(train_combined, y_train_sex)
mi_sex = mutual_info_classif(train_combined, y_train_sex, random_state=SEED)

F_adhd, p_adhd = f_classif(train_combined, y_train_adhd)
mi_adhd = mutual_info_classif(train_combined, y_train_adhd, random_state=SEED)

# Point-biserial correlation per feature, in column order. Each target is
# handled independently so a failure for one target does not discard a valid
# correlation for the other.
corr_sex = [
    _safe_pointbiserial(train_combined[column], y_train_sex)
    for column in train_combined.columns
]
corr_adhd = [
    _safe_pointbiserial(train_combined[column], y_train_adhd)
    for column in train_combined.columns
]

In [None]:
features = train_combined.columns


def _ranking_table(mi_scores, p_values, correlations):
    """Assemble one per-feature summary table, ordered from lowest to highest p-value.

    The "correlation" column stores the absolute value of each coefficient;
    the F-statistic itself is not included in the table.
    """
    table = pd.DataFrame({
        "feature": features,
        "MI": mi_scores,
        "p_value": p_values,
        "correlation": [abs(coefficient) for coefficient in correlations],
    })
    return table.sort_values(by="p_value", ascending=True)


df_sex = _ranking_table(mi_sex, p_sex, corr_sex)
df_adhd = _ranking_table(mi_adhd, p_adhd, corr_adhd)

In [None]:
# First 20 rows of the Sex_F table, which is sorted by ascending p_value.
df_sex.head(20)

In [None]:
# First 20 rows of the ADHD table, which is sorted by ascending p_value.
df_adhd.head(20)

In [None]:
# Persist the engineered (and standardised) splits; participant_id is kept
# because it is the index of both frames.
for csv_name, frame in (
    ("train_feature_engineered.csv", train_combined),
    ("test_feature_engineered.csv", test_combined),
):
    frame.to_csv(csv_name)

# 3. Feature Selection


## 3.1 Arbitrary Threshold

In [None]:
# Keep features that are statistically significant (p-value < 0.05) and whose
# absolute correlation with Sex_F exceeds 0.05.
# NOTE(review): the ADHD filter in the next cell uses a 0.1 correlation
# cut-off; confirm the looser 0.05 threshold is intended for Sex_F.
SEX_P_VALUE_MAX = 0.05
SEX_CORRELATION_MIN = 0.05

sex_is_significant = df_sex["p_value"] < SEX_P_VALUE_MAX
sex_is_correlated = df_sex["correlation"].abs() > SEX_CORRELATION_MIN
df_sex_filtered = df_sex[sex_is_significant & sex_is_correlated]
print(df_sex_filtered['feature'].tolist())

In [None]:
# Keep features that are statistically significant (p-value < 0.05) and whose
# absolute correlation with ADHD_Outcome exceeds 0.1.
adhd_is_significant = df_adhd["p_value"] < 0.05
adhd_is_correlated = df_adhd["correlation"].abs() > 0.1
df_adhd_filtered = df_adhd[adhd_is_significant & adhd_is_correlated]
print(df_adhd_filtered['feature'].tolist())

In [None]:
# Top-10 ADHD features under each ranking criterion: smallest p-value,
# largest absolute correlation, and largest mutual information.
top10_p_adhd = df_adhd.nsmallest(10, "p_value")
top10_corr_adhd = df_adhd.nlargest(10, "correlation")
top10_mi_adhd = df_adhd.nlargest(10, "MI")

adhd_reports = [
    ("\nTop 10 Features for ADHD Prediction based on p-value:", top10_p_adhd),
    ("\nTop 10 Features for ADHD Prediction based on correlation:", top10_corr_adhd),
    ("\nTop 10 Features for ADHD Prediction based on Mutual Information:", top10_mi_adhd),
]
for heading, table in adhd_reports:
    print(heading)
    print(table)

## 3.2 Feature Selection Using Tree-based Algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a random forest on the ADHD target; its impurity-based feature
# importances drive the selection below. The Sex_F target is handled by a
# separate forest in a later cell.
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(train_combined, y_train_adhd)

# Keep the features whose importance is at or above the median importance.
# prefit=True tells SelectFromModel that `rf` is already trained, so
# transform()/get_support() can be used without calling selector.fit().
selector = SelectFromModel(rf, threshold="median", prefit=True)
X_train_selected = selector.transform(train_combined)
selected_features = train_combined.columns[selector.get_support()]

print(selected_features.tolist())

In [None]:
# Hand-picked feature subset for the ADHD target, one name per line so that
# additions and removals show up clearly in diffs.
features_adhd = [
    'EHQ_EHQ_Total',
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    'MRI_Track_Age_at_Scan',
    'Basic_Demos_Enroll_Year',
]

len(features_adhd)

In [None]:
# Fit a dedicated forest for the Sex_F target. The ADHD model `rf` from the
# earlier cell is left untouched so its importances stay valid.
rf_sex = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_sex.fit(train_combined, y_train_sex)

# Keep the features whose importance is at or above the median importance.
# prefit=True tells SelectFromModel that `rf_sex` is already trained, so
# transform()/get_support() can be used without calling selector.fit().
# Note: `selector`, `X_train_selected` and `selected_features` are rebound
# here and now describe the Sex_F selection.
selector = SelectFromModel(rf_sex, threshold="median", prefit=True)
X_train_selected = selector.transform(train_combined)
selected_features = train_combined.columns[selector.get_support()]

print(selected_features.tolist())