# 1. Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import brier_score_loss, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from scipy.stats import ks_2samp, mannwhitneyu
from catboost import CatBoostClassifier 

import warnings
warnings.filterwarnings("ignore")

# Global experiment configuration.
SEED = 42    # random seed passed to the repeated CV splitter and to both models
REPEATS = 5  # number of repetitions of the outer stratified K-fold
FOLDS = 5    # number of folds (outer repeated CV and the inner LogisticRegressionCV split)

In [None]:
# Load the tabular metadata and the fMRI feature tables (PCA components, per
# the file names), all keyed by participant_id.
train_metadata = pd.read_csv('train_processed.csv').set_index("participant_id")
test_metadata = pd.read_csv('test_processed.csv').set_index("participant_id")

# "Unnamed: 0" is presumably a row-number column left over from an earlier
# to_csv call -- it is dropped so it cannot be used as a feature.
train_fmri = pd.read_csv("TRAIN_fMRI_PCA.csv").set_index("participant_id")
train_fmri = train_fmri.drop(columns=["Unnamed: 0"])

test_fmri = pd.read_csv("TEST_fMRI_PCA.csv").set_index("participant_id")
test_fmri = test_fmri.drop(columns=["Unnamed: 0"])

labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Column-wise concatenation aligns the two tables on participant_id.
train_combined = pd.concat([train_metadata, train_fmri], axis=1)
test_combined = pd.concat([test_metadata, test_fmri], axis=1)

# Sort everything by participant_id so features and labels share one row order.
train_combined = train_combined.sort_index()
test_combined = test_combined.sort_index()
labels = labels.sort_index()

# Index.equals compares length, values and order in one call. An elementwise
# `==` would raise ValueError on a length mismatch instead of failing the
# assertion with the message below.
assert train_combined.index.equals(labels.index), "Label IDs do not match train IDs"

X_train = train_combined.copy()
y_train_sex = labels['Sex_F']
y_train_adhd = labels['ADHD_Outcome']

print("Merged Training Data Shape:", X_train.shape)
print("Merged Test Data Shape:", test_combined.shape)

In [None]:
# Preview only the first rows instead of rendering the whole merged frame.
train_combined.head()

# 2. Feature Selection

In [None]:
# Standardise every column. The scaler statistics are learned from the
# training frame only and then re-used, unchanged, for the test frame.
def _as_frame(values, like):
    """Wrap a scaled array in a DataFrame carrying `like`'s columns and index."""
    return pd.DataFrame(values, columns=like.columns, index=like.index)


scaler = StandardScaler()
train_combined = _as_frame(scaler.fit_transform(train_combined), train_combined)
test_combined = _as_frame(scaler.transform(test_combined), test_combined)

In [None]:
# Target vectors for the two prediction tasks.
y_adhd, y_sex = labels['ADHD_Outcome'], labels['Sex_F']

# Joint key "<ADHD><Sex_F>" built by string concatenation; it is used to
# stratify the CV folds and to pick out the "11" group for up-weighting.
# NOTE(review): the "11" comparison assumes the labels print as "0"/"1"
# (i.e. integer dtype) -- confirm against the solutions file.
combinations = y_adhd.astype(str) + y_sex.astype(str)

In [None]:
# Input columns of the ADHD model.
features_adhd = [
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Prosocial',
    'Basic_Demos_Enroll_Year',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_ID',
]

# Columns that get multiplied by the ADHD probability to form "I_<name>"
# interaction features.
interactions = [
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Peer_Problems',
    'Basic_Demos_Enroll_Year',
    'APQ_P_APQ_P_ID',
]

# Input columns of the Sex_F model used in cross-validation.
sex = [
    'SDQ_SDQ_Hyperactivity',
    'feature_15',
    'SDQ_SDQ_Prosocial',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Emotional_Problems',
    'ColorVision_CV_Score',
    'feature_8',
    'APQ_P_APQ_P_PP',
    'feature_14',
    'feature_16',
    'APQ_P_APQ_P_INV',
    'feature_18',
    'APQ_P_APQ_P_OPD',
    'feature_1',
    'SDQ_SDQ_Internalizing',
]

# Disabled alternative (not executed): sex features augmented with the ADHD probability and interactions.
# features_sex = ['SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Prosocial', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Emotional_Problems', 'ColorVision_CV_Score', 'APQ_P_APQ_P_PP', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'SDQ_SDQ_Internalizing'] + ['adhd_proba'] + [f"I_{feat}" for feat in interactions]

def eval_metrics(y_true, y_pred, weights, label="None", threshold=0.5):
    """Print and return the Brier score and the weighted F1 of one prediction set.

    Parameters
    ----------
    y_true : true binary labels.
    y_pred : predicted positive-class probabilities; must support ``>`` and
        ``.astype`` (e.g. a NumPy array).
    weights : per-sample weights, applied to the F1 score only (the Brier
        score is computed unweighted).
    label : tag printed in front of the metrics.
    threshold : probabilities strictly above this value count as positive.

    Returns
    -------
    (brier, f1) as a tuple.
    """
    hard_labels = (y_pred > threshold).astype(int)
    f1 = f1_score(y_true, hard_labels, sample_weight=weights)
    brier = brier_score_loss(y_true, y_pred)
    print(f"{label} -> Brier Score: {brier:.4f}, F1: {f1:.4f}")
    return brier, f1

In [None]:
# Out-of-fold probability accumulators (one slot per training row) and the
# per-fold (brier, f1) score logs for each task.
adhd_oof = np.zeros(len(y_adhd))
sex_oof = np.zeros(len(y_sex))
scores_adhd = []
scores_sex = []

# Decision thresholds used when reporting per-fold F1.
t_adhd = 0.4  # threshold for ADHD prediction
t_sex = 0.3   # threshold for sex prediction

# Outer splitter: repeated stratified K-fold. Inner splitter: plain stratified
# K-fold used by LogisticRegressionCV to pick the regularisation strength.
rskf = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=REPEATS, random_state=SEED)
skf = StratifiedKFold(n_splits=FOLDS)

# L1-penalised logistic regression, tuned on F1 over 10 candidate C values.
params = dict(
    penalty="l1",
    Cs=10,
    cv=skf,
    fit_intercept=True,
    scoring="f1",
    random_state=SEED,
    solver="saga",
    class_weight="balanced",
)
model_adhd = LogisticRegressionCV(**params)

# CatBoost settings for the sex model; the commented entries are disabled options.
catboost_params = dict(
    # loss_function="LogLoss",
    eval_metric="F1",
    random_seed=SEED,
    verbose=0,
    # class_weight=None,
)

In [None]:
# Start from clean accumulators so that re-running this cell does not add a
# second round of predictions on top of the previous run (the OOF arrays are
# filled with `+=` and the score lists with `.append`).
sex_oof = np.zeros(len(y_sex))
adhd_oof = np.zeros(len(y_adhd))
scores_sex = []
scores_adhd = []

# Repeated stratified CV, stratified on the joint ADHD/Sex key. `fold` counts
# every split across all repeats (1 .. FOLDS * REPEATS).
for fold, (train_idx, val_idx) in enumerate(rskf.split(train_combined, combinations), 1):
    print(f"\n=== Fold {fold} ===")
    X_train = train_combined.iloc[train_idx].copy()
    X_val = train_combined.iloc[val_idx].copy()
    y_train_adhd, y_val_adhd = y_adhd.iloc[train_idx], y_adhd.iloc[val_idx]
    y_train_sex, y_val_sex = y_sex.iloc[train_idx], y_sex.iloc[val_idx]

    # Set sample weights: upweight "11" cases (female with ADHD) with weight 2
    weights_train = np.where(combinations.iloc[train_idx] == "11", 2, 1)
    weights_val = np.where(combinations.iloc[val_idx] == "11", 2, 1)

    # --- Stage 1: ADHD Model Training ---
    # Train the ADHD model using selected features
    model_adhd.fit(X_train[features_adhd], y_train_adhd, sample_weight=weights_train)
    adhd_val = model_adhd.predict_proba(X_val[features_adhd])[:, 1]
    # Every row is validated once per repeat, so dividing by REPEATS makes the
    # accumulated value the mean OOF probability across repeats.
    adhd_oof[val_idx] += adhd_val / REPEATS
    brier, f1 = eval_metrics(y_val_adhd, adhd_val, weights_val, label="ADHD", threshold=t_adhd)
    scores_adhd.append((brier, f1))

    # # --- Stage 2 augmentation (disabled) ---
    # # Add ADHD probabilities as a new feature
    # adhd_train = model_adhd.predict_proba(X_train[features_adhd])[:, 1]
    # X_train["adhd_proba"] = adhd_train
    # X_val["adhd_proba"] = adhd_val
    # # Create interaction features by multiplying ADHD probability with each specified feature
    # for feat in interactions:
    #     X_train[f"I_{feat}"] = X_train[feat] * X_train["adhd_proba"]
    #     X_val[f"I_{feat}"] = X_val[feat] * X_val["adhd_proba"]

    # --- Sex Model Training (CatBoost) ---
    # A fresh classifier per fold, trained on the plain `sex` feature list
    # (the ADHD-probability augmentation above is switched off).
    model_sex = CatBoostClassifier(**catboost_params)
    model_sex.fit(X_train[sex], y_train_sex, sample_weight=weights_train)
    sex_val_pred = model_sex.predict_proba(X_val[sex])[:, 1]
    sex_oof[val_idx] += sex_val_pred / REPEATS
    brier_sex, f1_sex = eval_metrics(y_val_sex, sex_val_pred, weights_val, label="Sex_F", threshold=t_sex)
    scores_sex.append((brier_sex, f1_sex))
    

In [None]:
# Summarise the per-fold (brier, f1) tuples collected during cross-validation.
print("\n=== CV Results ===")
for task_name, fold_scores in (("Sex", scores_sex), ("ADHD", scores_adhd)):
    mean_brier = np.mean([pair[0] for pair in fold_scores])
    mean_f1 = np.mean([pair[1] for pair in fold_scores])
    print(f"{task_name} Mean Brier Score: {mean_brier:.4f}")
    print(f"{task_name} Mean F1: {mean_f1:.4f}")

In [None]:
# NOTE: disabled alternative feature configuration -- every line in this cell is commented out.
# y_adhd = labels['ADHD_Outcome']
# y_sex = labels['Sex_F']
# combinations = y_adhd.astype(str) + y_sex.astype(str)

# # Interaction terms (for ADHD-first -> then Sex model)
# interactions = [
#     "APQ_P_APQ_P_INV", "APQ_P_APQ_P_PP", "SDQ_SDQ_Hyperactivity", 
#     "MRI_Track_Age_at_Scan", "SDQ_SDQ_Generating_Impact", 'SDQ_SDQ_Emotional_Problems',
#     'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Conduct_Problems',
#     'SDQ_SDQ_Prosocial',
# ]

# # Features for the ADHD model (first stage)
# features_adhd = [
#     'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP',
#     'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD',
#     'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems',
#     'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems',
#     'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact',
#     'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing',
#     'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan',
#     'Basic_Demos_Enroll_Year', 'PreInt_Demos_Fam_Child_Ethnicity',
#     'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu',
#     'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu',
#     'Barratt_Barratt_P2_Occ', 'Laterality_Category'
# ]

# # Features for the Sex model (second stage), note we use the ADHD probability and corresponding interactions
# features_sex = features_adhd + ['adhd_proba'] + [f"I_{feat}" for feat in interactions]

In [None]:
def find_best_threshold(y_true, proba, grid, sample_weight=None):
    """Scan `grid` and return the cut-off with the highest (weighted) F1.

    Parameters
    ----------
    y_true : true binary labels.
    proba : predicted positive-class probabilities (NumPy array).
    grid : candidate thresholds; a probability strictly above the threshold
        is classified as positive.
    sample_weight : optional per-sample weights forwarded to ``f1_score``.

    Returns
    -------
    (best_threshold, best_score, scores) where ``scores`` holds the F1 of
    every candidate in ``grid`` order. Ties resolve to the first maximum.
    """
    scores = [
        f1_score(y_true, (proba > t).astype(int), sample_weight=sample_weight)
        for t in grid
    ]
    best = int(np.argmax(scores))
    return grid[best], scores[best], scores


# Weight 2 for rows with ADHD == 1 and Sex_F == 1, weight 1 otherwise
# (boolean mask + 1) -- the same weighting the per-fold evaluation uses.
weights_for_threshold = ((y_adhd == 1) & (y_sex == 1)) + 1

thresholds = np.linspace(0, 1, 100)

best_adhd_threshold, best_adhd_score, adhd_scores = find_best_threshold(
    y_adhd, adhd_oof, thresholds, sample_weight=weights_for_threshold
)
print(f"Best ADHD threshold: {best_adhd_threshold:.2f} with F1 score: {best_adhd_score:.4f}")

# The Sex_F search uses the same sample weights as the ADHD search and as the
# per-fold F1 evaluation, so the threshold is tuned on the metric that is
# actually reported.
best_sex_threshold, best_sex_score, sex_scores = find_best_threshold(
    y_sex, sex_oof, thresholds, sample_weight=weights_for_threshold
)
print(f"Best Sex threshold: {best_sex_threshold:.2f} with F1 score: {best_sex_score:.4f}")

# # Plot threshold optimization and OOF distribution for ADHD model
# fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# axs[0].plot(thresholds, adhd_scores, color='orange')
# axs[0].scatter(best_adhd_threshold, best_adhd_score, color='red')
# axs[0].set_title('F1 vs Threshold (ADHD)')
# axs[0].set_xlabel('Threshold')
# axs[0].set_ylabel('F1 Score')
# axs[1].hist(adhd_oof, bins=30, edgecolor='black')
# axs[1].set_title('ADHD OOF Distribution')
# plt.tight_layout()
# plt.show()

# 3. Threshold Optimization

In [None]:
# ----- Final ADHD Model on Full Training Data -----
model_adhd.fit(train_combined[features_adhd], y_adhd, sample_weight=weights_for_threshold)
# Get final ADHD predictions for both train and test sets
adhd_proba_train = model_adhd.predict_proba(train_combined[features_adhd])[:, 1]
adhd_proba_test = model_adhd.predict_proba(test_combined[features_adhd])[:, 1]

# Append ADHD predictions to training and test datasets
train_combined["adhd_proba"] = adhd_proba_train
test_combined["adhd_proba"] = adhd_proba_test

# Create interaction features for both train and test sets
for feat in interactions:
    train_combined[f"I_{feat}"] = train_combined["adhd_proba"] * train_combined[feat]
    test_combined[f"I_{feat}"] = test_combined["adhd_proba"] * test_combined[feat]

# ----- Final Sex Model on Full Training Data -----
# `features_sex` was only ever assigned in a commented-out line, so using it
# raised NameError on a fresh kernel. The cross-validation loop trains the sex
# model on the `sex` list, and the Sex_F threshold is tuned on those
# out-of-fold predictions, so the final model uses the same inputs. The
# adhd_proba / I_* columns created above are therefore not fed to this model.
features_sex = list(sex)

final_model_sex = CatBoostClassifier(**catboost_params)
final_model_sex.fit(train_combined[features_sex], y_sex, sample_weight=weights_for_threshold)
sex_proba_test = final_model_sex.predict_proba(test_combined[features_sex])[:, 1]

# 4. Inference

In [None]:
# # Display the top 15 most influential features for the ADHD model:
# coeffs = pd.DataFrame({"feature": features_adhd, "coeff": model_adhd.coef_[0]})
# print(coeffs.sort_values(by="coeff", key=np.abs, ascending=False).head(15))

# # Perform distribution comparison tests between out-of-fold predictions and test probabilities
# print("KS and Mann-Whitney U Tests:")
# print("ADHD KS:", ks_2samp(adhd_proba_test, adhd_oof))
# print("ADHD MWU:", mannwhitneyu(adhd_proba_test, adhd_oof))
# print("Sex KS:", ks_2samp(sex_proba_test, sex_oof))
# print("Sex MWU:", mannwhitneyu(sex_proba_test, sex_oof))

In [None]:
# Load the sample submission file and use it as the template for the output.
submission = pd.read_excel("data/SAMPLE_SUBMISSION.xlsx")

# Hard test-set labels, keyed by participant_id (the index of test_combined).
adhd_test_labels = pd.Series(
    (adhd_proba_test > best_adhd_threshold).astype(int), index=test_combined.index
)
sex_test_labels = pd.Series(
    (sex_proba_test > best_sex_threshold).astype(int), index=test_combined.index
)

if "participant_id" in submission.columns:
    # test_combined was sorted by participant_id, so its row order need not
    # match the template's. Align on the ID instead of on row position.
    template_ids = submission["participant_id"]
    unknown_ids = template_ids[~template_ids.isin(test_combined.index)]
    if len(unknown_ids) > 0:
        raise ValueError(
            f"{len(unknown_ids)} participant_id values in the sample submission "
            "have no test prediction"
        )
    submission["ADHD_Outcome"] = adhd_test_labels.reindex(template_ids).to_numpy()
    submission["Sex_F"] = sex_test_labels.reindex(template_ids).to_numpy()
else:
    # No ID column to align on: fall back to positional assignment.
    submission["ADHD_Outcome"] = adhd_test_labels.to_numpy()
    submission["Sex_F"] = sex_test_labels.to_numpy()

# Compare the share of positive predicted labels between out-of-fold (OOF) data and test predictions.
print(f"Share ADHD OOF: {np.mean(np.where(adhd_oof > best_adhd_threshold, 1, 0)):.4f} - Share ADHD Test: {submission.ADHD_Outcome.mean():.4f}")
print(f"Share Sex_F OOF: {np.mean(np.where(sex_oof > best_sex_threshold, 1, 0)):.4f} - Share Sex_F Test: {submission.Sex_F.mean():.4f}")

submission.to_csv("submission.csv", index=False)