# 1. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score, brier_score_loss
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from scipy.stats import ks_2samp, mannwhitneyu

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides solver convergence
# warnings from the models fitted below — confirm that is intended.
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to the repeated CV splitter and to both
# LogisticRegressionCV models.
SEED = 42
# Number of repetitions of the outer stratified K-fold; also the divisor used
# when averaging the out-of-fold predictions.
REPEATS = 5
# Number of splits for the outer repeated CV and for the inner StratifiedKFold
# handed to LogisticRegressionCV.
FOLDS = 5

In [None]:
# Load the pre-processed feature tables, indexed by participant.
train_combined = pd.read_csv("train_processed.csv").set_index("participant_id")
test_combined = pd.read_csv("test_processed.csv").set_index("participant_id")

# Forward slashes keep the path portable. The previous "data\TRAIN\..." literal
# relied on "\T" being an unrecognised escape sequence and only resolved on Windows.
labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Sort both tables by participant so rows line up positionally.
train_combined = train_combined.sort_index()
labels = labels.sort_index()

# Index.equals also copes with differing lengths, so a mismatch always surfaces
# as this assertion instead of an unrelated comparison error.
assert train_combined.index.equals(labels.index), "Label IDs do not match train IDs"

# Binary targets.
y_adhd = labels["ADHD_Outcome"]
y_sex = labels["Sex_F"]

# Two-character stratification key "<ADHD><Sex_F>" (e.g. "11"), used for the
# stratified CV splits and for the sample weights.
combinations = y_adhd.astype(str) + y_sex.astype(str)

def eval_metrics(y_true, y_pred, weights, label="None", threshold=0.6):
    """Print and return the Brier score and the weighted F1 of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : numpy array or pandas Series
        Predicted probabilities of the positive class.
    weights : array-like
        Per-sample weights. They are applied to the F1 score only; the Brier
        score is computed unweighted.
    label : str
        Prefix used in the printed line.
    threshold : float
        Probabilities strictly greater than this value count as positive for F1.

    Returns
    -------
    tuple
        ``(brier, f1)``.
    """
    brier = brier_score_loss(y_true, y_pred)
    hard_labels = (y_pred > threshold).astype(int)
    f1 = f1_score(y_true, hard_labels, sample_weight=weights)
    print(f"{label} -> Brier Score: {brier:.4f}, F1: {f1:.4f}")
    return brier, f1

In [None]:
# Standardise the features: the scaler is fitted on the training table only and
# the same fitted transform is then applied to the test table.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_combined)
test_scaled = scaler.transform(test_combined)

# Re-wrap the scaled arrays so column names and participant indices survive.
train_combined = pd.DataFrame(
    train_scaled,
    columns=train_combined.columns,
    index=train_combined.index,
)
test_combined = pd.DataFrame(
    test_scaled,
    columns=test_combined.columns,
    index=test_combined.index,
)

In [None]:
# Display the column names of the scaled training table.
train_combined.columns

# 2. Feature Selection

In [None]:
# Inputs to the first-stage (ADHD) model.
features_adhd = [
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Prosocial',
    'Basic_Demos_Enroll_Year',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_ID',
]

# Columns that get multiplied by the predicted ADHD probability to form the
# "I_<name>" interaction features.
interactions = [
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Peer_Problems',
    'Basic_Demos_Enroll_Year',
    'APQ_P_APQ_P_ID',
]

# Inputs to the second-stage (Sex_F) model: raw columns, then the stage-1
# probability, then one interaction column per entry of `interactions`.
features_sex = [
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Prosocial',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Emotional_Problems',
    'ColorVision_CV_Score',
    'APQ_P_APQ_P_PP',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'SDQ_SDQ_Internalizing',
    'adhd_proba',
    *(f"I_{feat}" for feat in interactions),
]

In [None]:
# Out-of-fold probability buffers (each sample is predicted once per repeat and
# the repeats are averaged) plus the per-fold (brier, f1) tuples.
adhd_oof = np.zeros(len(y_adhd))
sex_oof = np.zeros(len(y_sex))
scores_adhd, scores_sex = [], []

# Decision thresholds used only for the per-fold F1 printed during CV.
t_adhd = 0.4
t_sex = 0.3

# Inner CV (hyper-parameter search inside LogisticRegressionCV) and outer
# repeated CV (model evaluation / out-of-fold predictions).
skf = StratifiedKFold(n_splits=FOLDS)
rskf = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=REPEATS, random_state=SEED)

# Shared settings for both stages: L1-penalised logistic regression, with C
# chosen by inner CV on F1 and balanced class weights.
params = dict(
    penalty="l1",
    Cs=10,
    cv=skf,
    fit_intercept=True,
    scoring="f1",
    random_state=SEED,
    solver="saga",
    class_weight="balanced",
)

model_adhd = LogisticRegressionCV(**params)
model_sex = LogisticRegressionCV(**params)

outer_splits = rskf.split(train_combined, combinations)
for fold, (train_idx, val_idx) in enumerate(outer_splits, start=1):
    print(f"\n=== Fold {fold} ===")
    X_tr = train_combined.iloc[train_idx].copy()
    X_va = train_combined.iloc[val_idx].copy()
    y_tr_adhd, y_va_adhd = y_adhd.iloc[train_idx], y_adhd.iloc[val_idx]
    y_tr_sex, y_va_sex = y_sex.iloc[train_idx], y_sex.iloc[val_idx]

    # Samples whose combination key is "11" count double, both when fitting and
    # when scoring F1.
    w_tr = np.where(combinations.iloc[train_idx] == "11", 2, 1)
    w_va = np.where(combinations.iloc[val_idx] == "11", 2, 1)

    # ----- Stage 1: ADHD model -----
    model_adhd.fit(X_tr[features_adhd], y_tr_adhd, sample_weight=w_tr)
    # NOTE(review): the training-side probabilities are in-sample (predicted on
    # the rows the model was just fitted on) while the validation-side ones are
    # out-of-sample, so stage 2 trains and validates on differently-distributed
    # `adhd_proba` — confirm this is acceptable.
    adhd_tr = model_adhd.predict_proba(X_tr[features_adhd])[:, 1]
    adhd_va = model_adhd.predict_proba(X_va[features_adhd])[:, 1]
    adhd_oof[val_idx] += adhd_va / REPEATS
    scores_adhd.append(eval_metrics(y_va_adhd, adhd_va, w_va, "ADHD", t_adhd))

    # ----- Stage 2: Sex model -----
    # Attach the stage-1 probability and its interaction columns to both splits.
    for frame, proba in ((X_tr, adhd_tr), (X_va, adhd_va)):
        frame["adhd_proba"] = proba
        for feat in interactions:
            frame[f"I_{feat}"] = frame[feat] * frame["adhd_proba"]

    model_sex.fit(X_tr[features_sex], y_tr_sex, sample_weight=w_tr)
    sex_va = model_sex.predict_proba(X_va[features_sex])[:, 1]
    sex_oof[val_idx] += sex_va / REPEATS
    scores_sex.append(eval_metrics(y_va_sex, sex_va, w_va, "Sex_F", t_sex))

# Average each metric over all folds of all repeats.
sex_briers, sex_f1s = zip(*scores_sex)
adhd_briers, adhd_f1s = zip(*scores_adhd)
print("\n=== CV Results ===")
print(f"Sex Mean Brier Score: {np.mean(sex_briers):.4f}")
print(f"Sex Mean F1: {np.mean(sex_f1s):.4f}")
print(f"ADHD Mean Brier Score: {np.mean(adhd_briers):.4f}")
print(f"ADHD Mean F1: {np.mean(adhd_f1s):.4f}")

# 3. Threshold Optimization

In [None]:
# Per-sample weights: 2 where both targets equal 1, otherwise 1.
weights = ((y_adhd == 1) & (y_sex == 1)) + 1
# Candidate decision thresholds: 100 evenly spaced values covering [0, 1].
thresholds = np.linspace(0, 1, 100)


def _f1_per_threshold(y_true, oof_proba):
    """Weighted F1 of `oof_proba > t` against `y_true`, for every t in `thresholds`."""
    return [
        f1_score(y_true, (oof_proba > t).astype(int), sample_weight=weights)
        for t in thresholds
    ]


# ADHD: sweep the thresholds and keep the first one reaching the best F1.
adhd_scores = _f1_per_threshold(y_adhd, adhd_oof)
best_adhd_threshold = thresholds[np.argmax(adhd_scores)]
best_adhd_score = max(adhd_scores)

# Sex_F: same sweep on the second-stage out-of-fold probabilities.
sex_scores = _f1_per_threshold(y_sex, sex_oof)
best_sex_threshold = thresholds[np.argmax(sex_scores)]
best_sex_score = max(sex_scores)

# One row per target: F1-vs-threshold curve with the optimum marked (left) and
# a histogram of the out-of-fold probabilities (right).
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
panels = (
    ("ADHD", adhd_scores, best_adhd_threshold, best_adhd_score, adhd_oof, 'orange'),
    ("Sex", sex_scores, best_sex_threshold, best_sex_score, sex_oof, 'blue'),
)
for (curve_ax, hist_ax), (name, f1_curve, best_t, best_f1, oof, colour) in zip(axs, panels):
    curve_ax.plot(thresholds, f1_curve, color=colour)
    curve_ax.scatter(best_t, best_f1, color='red')
    curve_ax.set_title(f'F1 vs Threshold ({name})')
    hist_ax.hist(oof, bins=30, edgecolor='black')
    hist_ax.set_title(f'{name} OOF Distribution')
plt.tight_layout()
plt.show()

# 4. Inference

In [None]:
# Stage 1: refit the ADHD model on the full training table, then score both tables.
adhd_inputs_train = train_combined[features_adhd]
model_adhd.fit(adhd_inputs_train, y_adhd, sample_weight=weights)
adhd_proba_train = model_adhd.predict_proba(adhd_inputs_train)[:, 1]
adhd_proba_test = model_adhd.predict_proba(test_combined[features_adhd])[:, 1]

# Attach the stage-1 probability and its interaction columns to both tables
# (the tables are modified in place).
for frame, proba in ((train_combined, adhd_proba_train), (test_combined, adhd_proba_test)):
    frame["adhd_proba"] = proba
    for feat in interactions:
        frame[f"I_{feat}"] = frame["adhd_proba"] * frame[feat]

# Stage 2: refit the Sex_F model on the augmented training table and score the test table.
model_sex.fit(train_combined[features_sex], y_sex, sample_weight=weights)
sex_proba_test = model_sex.predict_proba(test_combined[features_sex])[:, 1]

In [None]:
# Rank the ADHD model's coefficients by absolute size and show up to 15 of them.
coeffs = pd.DataFrame({"feature": features_adhd, "coeff": model_adhd.coef_[0]})
ranked_coeffs = coeffs.sort_values(by="coeff", key=np.abs, ascending=False)
print(ranked_coeffs.head(15))

# Compare the distribution of test probabilities with the out-of-fold
# probabilities, for each target, using two two-sample tests.
print("KS and Mann-Whitney U Tests:")
comparisons = (
    ("ADHD", adhd_proba_test, adhd_oof),
    ("Sex", sex_proba_test, sex_oof),
)
for name, test_proba, oof_proba in comparisons:
    print(f"{name} KS:", ks_2samp(test_proba, oof_proba))
    print(f"{name} MWU:", mannwhitneyu(test_proba, oof_proba))

# 5. Submission

In [None]:
# Load the submission template; the predicted labels are written into it.
submission = pd.read_excel("data/SAMPLE_SUBMISSION.xlsx")

# Hard labels for every test participant, keyed by participant_id so they can
# be matched to the template rows by ID, not by row position.
test_labels = pd.DataFrame(
    {
        "ADHD_Outcome": np.where(adhd_proba_test > best_adhd_threshold, 1, 0),
        "Sex_F": np.where(sex_proba_test > best_sex_threshold, 1, 0),
    },
    index=test_combined.index,
)

if "participant_id" in submission.columns:
    # Align on ID: a template whose row order differs from test_processed.csv
    # would otherwise receive labels belonging to other participants.
    template_ids = submission["participant_id"]
    unknown_ids = template_ids[~template_ids.isin(test_labels.index)]
    assert unknown_ids.empty, f"{len(unknown_ids)} submission IDs are missing from the test set"
    test_labels = test_labels.loc[template_ids]
else:
    # No ID column to align on: fall back to positional assignment, but refuse
    # to proceed when the row counts disagree.
    assert len(submission) == len(test_labels), "Submission template and test set differ in length"

submission["ADHD_Outcome"] = test_labels["ADHD_Outcome"].to_numpy()
submission["Sex_F"] = test_labels["Sex_F"].to_numpy()

# Compare share of predicted labels at thresholds between OOF and Test
print(f"Share ADHD OOF: {np.mean(np.where(adhd_oof > best_adhd_threshold, 1, 0)):.4f} - Share ADHD Test: {submission.ADHD_Outcome.mean():.4f}")
print(f"Share Sex_F OOF: {np.mean(np.where(sex_oof > best_sex_threshold, 1, 0)):.4f} - Share Sex_F Test: {submission.Sex_F.mean():.4f}")

submission.to_csv("submission_ver2.csv", index=False)

Version 1
+ Share ADHD OOF: 0.8384 - Share ADHD Test: 0.8454
+ Share Sex_F OOF: 0.9563 - Share Sex_F Test: 0.9013