# 1. Setup

In [None]:
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers

import warnings
warnings.filterwarnings('ignore')

# One seed shared by every RNG this notebook draws from: Python's `random`,
# NumPy's legacy global generator, and TensorFlow.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

In [None]:
# Load quantitative and categorical metadata for both splits
train_q = pd.read_excel("data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx")
train_c = pd.read_excel("data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx")
test_q = pd.read_excel("data/TEST/TEST_QUANTITATIVE_METADATA.xlsx")
test_c = pd.read_excel("data/TEST/TEST_CATEGORICAL_METADATA.xlsx")

# Left-join the categorical columns onto the quantitative rows by
# 'participant_id', then use that ID as the index of each split
train_combined = pd.merge(train_q, train_c, on='participant_id', how='left').set_index("participant_id")
test_combined  = pd.merge(test_q,  test_c, on='participant_id', how='left').set_index("participant_id")

# Load labels and sort both frames by ID so rows line up positionally
labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")
train_combined = train_combined.sort_index()
labels = labels.sort_index()

# Index.equals returns False when the two indexes differ in length, so the
# assertion message below is what gets reported. An elementwise `==` between
# indexes of unequal length raises ValueError before the assert can fire.
assert train_combined.index.equals(labels.index), "Label IDs do not match train IDs"

# 2. Data Preprocessing

In [None]:
# Columns treated as irrelevant (per domain knowledge / data dictionary)
drop_cols = [
    "Basic_Demos_Study_Site",
    "MRI_Track_Scan_Location",
    "PreInt_Demos_Fam_Child_Ethnicity",
    "PreInt_Demos_Fam_Child_Race",
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Occ',
]
train_combined, test_combined = (
    frame.drop(columns=drop_cols) for frame in (train_combined, test_combined)
)

# Standardize: the scaler is fitted on the training split only and then
# applied to both splits; column names and the participant index are kept.
scaler = StandardScaler().fit(train_combined)
train_combined, test_combined = (
    pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)
    for frame in (train_combined, test_combined)
)

In [None]:
# Fill missing values with IterativeImputer, using LassoCV as the per-feature
# regressor. The imputer is fitted on the training split and reused for test.
lasso_estimator = LassoCV(random_state=SEED)
imputer = IterativeImputer(
    estimator=lasso_estimator,
    max_iter=5,
    random_state=SEED,
)

# Slice-assignment writes the imputed values back into the existing frames,
# keeping their column labels and index.
train_combined[:] = imputer.fit_transform(train_combined)
test_combined[:] = imputer.transform(test_combined)

print("Train shape:", train_combined.shape)
print("Test shape:", test_combined.shape)

In [None]:
# Base feature list for Sex prediction (as provided by domain/data dictionary).
# NOTE(review): none of the three lists defined here is referenced by the
# modeling cells visible in this notebook — confirm whether they are still needed.
features_sex = [
    'EHQ_EHQ_Total',
    'ColorVision_CV_Score',
    'APQ_P_APQ_P_CP',
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_PP',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Prosocial',
    'MRI_Track_Age_at_Scan',
    'Barratt_Barratt_P1_Edu',
    'Barratt_Barratt_P2_Edu',
]

# ADHD feature list = Sex features plus a Sex-probability column.
# NOTE(review): no 'sex_proba' column is created in the cells shown here.
features_adhd = [*features_sex, 'sex_proba']

# Candidate columns for explicit interaction terms (if desired)
interaction_features = [
    "APQ_P_APQ_P_INV",
    "APQ_P_APQ_P_PP",
    "SDQ_SDQ_Hyperactivity",
    "MRI_Track_Age_at_Scan",
    "SDQ_SDQ_Generating_Impact",
]

# Advanced feature engineering: the same four derived columns are appended,
# in the same order, to both splits. The inputs were already standardized and
# imputed by the preceding cells, so these are computed on scaled values.
for df in (train_combined, test_combined):
    age_at_scan = df['MRI_Track_Age_at_Scan']

    # Mean of the two APQ parental-involvement columns
    df['APQ_Parental_Involvement_Avg'] = df['APQ_P_APQ_P_INV'].add(df['APQ_P_APQ_P_PP']).div(2)

    # Parent 1 education minus parent 2 education
    df['Barratt_Edu_Diff'] = df['Barratt_Barratt_P1_Edu'].sub(df['Barratt_Barratt_P2_Edu'])

    # Age x hyperactivity interaction term
    df['Age_Hyperactivity_Interaction'] = age_at_scan.mul(df['SDQ_SDQ_Hyperactivity'])

    # Cubic age term
    df['MRI_Track_Age_at_Scan_cubed'] = age_at_scan.pow(3)


In [None]:
# Sanity check: report whether any NaN remains in the training matrix
train_has_nan = np.isnan(train_combined.values).any()
print("Any NaNs in train_combined?", train_has_nan)

# 3. Modeling

In [None]:
# Number of feature columns after preprocessing and feature engineering
train_combined.shape[1]

In [None]:
def build_model(input_dim, dropout_rate=0.3, learning_rate=0.001):
    """Build and compile the two-output network (ADHD outcome and sex).

    Architecture: a shared trunk of two Dense -> BatchNormalization -> Dropout
    stages (128 then 64 units), a sex branch (Dense 32 -> sigmoid), and an
    ADHD branch (Dense 32 -> sigmoid) fed by the concatenation of the trunk
    output, the sex branch's hidden layer and the sex probability.

    Args:
        input_dim: Number of input features per sample.
        dropout_rate: Dropout rate applied after each shared stage.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled Keras model whose outputs are, in order,
        ``[ADHD_Outcome, Sex_F]``. Both heads use binary cross-entropy.
    """
    adhd_name, sex_name = "ADHD_Outcome", "Sex_F"

    inp = layers.Input(shape=(input_dim,))

    # Shared layers
    x = inp
    for units in (128, 64):
        x = layers.Dense(units, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout_rate)(x)

    # Sex branch
    sex_intermediate = layers.Dense(32, activation='relu', name='sex_intermediate')(x)
    sex_out = layers.Dense(1, activation='sigmoid', name=sex_name)(sex_intermediate)

    # Combine features to mimic interactions (the network can learn them)
    combined = layers.concatenate([x, sex_intermediate, sex_out])

    # ADHD branch
    y = layers.Dense(32, activation='relu')(combined)
    adhd_out = layers.Dense(1, activation='sigmoid', name=adhd_name)(y)

    model = models.Model(inputs=inp, outputs=[adhd_out, sex_out])
    # Metrics are keyed by output layer name rather than by list position, so
    # they stay attached to the right head even if the output order changes.
    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics={adhd_name: ['accuracy'], sex_name: ['accuracy']},
    )
    return model

# Input width is the number of feature columns; build one model to inspect it
input_dim = len(train_combined.columns)
model = build_model(input_dim)
model.summary()

In [None]:
# Extract targets
y_adhd = labels['ADHD_Outcome'].values
y_sex = labels['Sex_F'].values

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_adhd = np.zeros(len(train_combined))
oof_sex = np.zeros(len(train_combined))

fold = 1

In [None]:
# Evaluation thresholds (tweak if needed). They are the same for every fold,
# so they are set once rather than on each iteration.
threshold_adhd = 0.4
threshold_sex = 0.3

# NOTE(review): the scaler and imputer were fitted on all training rows before
# this split, and early stopping monitors the same fold that is scored below,
# so the per-fold / OOF F1 values may be somewhat optimistic.

# enumerate() restarts the fold numbering at 1 every time this cell runs, so
# the labels no longer depend on a counter initialised in another cell.
for fold, (train_idx, val_idx) in enumerate(kf.split(train_combined), start=1):
    print(f"\n=== Fold {fold} ===")
    # Split data
    X_train = train_combined.iloc[train_idx].values
    X_val = train_combined.iloc[val_idx].values
    y_train_adhd, y_train_sex = y_adhd[train_idx], y_sex[train_idx]
    y_val_adhd, y_val_sex = y_adhd[val_idx], y_sex[val_idx]

    # Build new model for the fold
    fold_model = build_model(input_dim)
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Target order must match the model's output order: [ADHD, Sex]
    fold_model.fit(
        X_train,
        [y_train_adhd, y_train_sex],
        validation_data=(X_val, [y_val_adhd, y_val_sex]),
        epochs=50,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )

    # Predict on validation set and store the out-of-fold probabilities
    pred_adhd, pred_sex = fold_model.predict(X_val)
    oof_adhd[val_idx] = pred_adhd.ravel()
    oof_sex[val_idx] = pred_sex.ravel()

    f1_adhd = f1_score(y_val_adhd, (pred_adhd.ravel() > threshold_adhd).astype(int))
    f1_sex = f1_score(y_val_sex, (pred_sex.ravel() > threshold_sex).astype(int))
    print(f"Fold {fold} ADHD F1: {f1_adhd:.4f}, Sex F1: {f1_sex:.4f}")

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt


def find_best_f1_threshold(y_true, scores, thresholds, sample_weight=None, desc=None):
    """Scan `thresholds` and return the one that maximizes F1.

    Args:
        y_true: Binary ground-truth labels.
        scores: Predicted probabilities aligned with `y_true`.
        thresholds: Non-empty array of candidate cut-offs; a sample is
            predicted positive when ``score > threshold``.
        sample_weight: Optional per-sample weights forwarded to `f1_score`.
        desc: Optional label for the progress bar.

    Returns:
        ``(best_threshold, best_f1, f1_per_threshold)``. On ties the first
        (lowest) threshold reaching the maximum is returned.
    """
    f1_per_threshold = [
        f1_score(y_true, (scores > t).astype(int), sample_weight=sample_weight)
        for t in tqdm(thresholds, desc=desc)
    ]
    best_idx = int(np.argmax(f1_per_threshold))
    return thresholds[best_idx], f1_per_threshold[best_idx], f1_per_threshold


# Threshold Optimization for Final Weighted F1
# Samples that are positive for both ADHD and Sex_F get weight 2, others 1.
weights = ((y_adhd == 1) & (y_sex == 1)).astype(int) + 1

thresholds = np.linspace(0, 1, 100)

# Sex threshold: searched with the same (unweighted) F1 that the final score
# below uses for Sex. Searching with sample weights here would select a
# threshold for a different metric than the one reported.
# NOTE(review): if the target metric also weights the Sex F1, pass
# sample_weight=weights both here and in the final weighted_f1_sex below.
best_sex_threshold, best_sex_score, sex_scores = find_best_f1_threshold(
    y_sex, oof_sex, thresholds, desc="Sex Thresholds"
)
print(f"Best Sex Threshold: {best_sex_threshold:.2f} with F1: {best_sex_score:.3f}")

# Optimize threshold for ADHD prediction using weighted F1
best_adhd_threshold, best_adhd_score, adhd_scores = find_best_f1_threshold(
    y_adhd, oof_adhd, thresholds, sample_weight=weights, desc="ADHD Thresholds"
)
print(f"Best ADHD Threshold: {best_adhd_threshold:.2f} with F1: {best_adhd_score:.3f}")

# Plot the threshold optimization results: one row per target, with the
# F1-vs-threshold curve on the left and the OOF probability histogram on the right
fig, axs = plt.subplots(2, 2, figsize=(12, 10), constrained_layout=True)

panels = [
    ("Sex", sex_scores, best_sex_threshold, best_sex_score, oof_sex, 'blue', 'skyblue'),
    ("ADHD", adhd_scores, best_adhd_threshold, best_adhd_score, oof_adhd, 'orange', 'lightgreen'),
]
for row, (target, scores, best_t, best_score, oof, line_color, hist_color) in enumerate(panels):
    curve_ax, hist_ax = axs[row, 0], axs[row, 1]

    # F1 vs Threshold
    curve_ax.plot(thresholds, scores, label=f'F1 Score ({target})', color=line_color)
    curve_ax.scatter(best_t, best_score, color='red',
                     label=f'Best: {best_score:.3f}\n(Threshold: {best_t:.2f})')
    curve_ax.set_title(f'{target} F1 Scores vs Thresholds')
    curve_ax.set_xlabel('Threshold')
    curve_ax.set_ylabel('F1 Score')
    curve_ax.legend()

    # Distribution of OOF predictions
    hist_ax.hist(oof, bins=30, color=hist_color, edgecolor='black')
    hist_ax.set_title(f'Distribution of {target} OOF Predictions')
    hist_ax.set_xlabel('Probability')
    hist_ax.set_ylabel('Frequency')

plt.suptitle('Threshold Optimization and Distributions', fontsize=16)
plt.show()

# Compute final weighted F1 using the best thresholds
final_pred_adhd = (oof_adhd > best_adhd_threshold).astype(int)
final_pred_sex  = (oof_sex  > best_sex_threshold).astype(int)

# ADHD F1 is sample-weighted; Sex F1 is not. The final score is their mean.
weighted_f1_adhd = f1_score(y_adhd, final_pred_adhd, sample_weight=weights)
weighted_f1_sex = f1_score(y_sex, final_pred_sex)
final_weighted_f1 = (weighted_f1_adhd + weighted_f1_sex) / 2
print("Final weighted F1 score:", final_weighted_f1)

# 4. Final Model Training and Submission

In [None]:
# =====================================================
# 7. Final Model Training and Test Set Prediction
# =====================================================
# Fresh model trained on every training row. There is no validation split
# here, so early stopping watches the training loss instead.
final_model = build_model(input_dim)
final_early_stop = callbacks.EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)

# Targets in the model's output order: [ADHD_Outcome, Sex_F]
final_targets = [labels[target].values for target in ('ADHD_Outcome', 'Sex_F')]
final_model.fit(
    train_combined.values,
    final_targets,
    epochs=50,
    batch_size=32,
    callbacks=[final_early_stop],
    verbose=1,
)

# Predict on test set (probabilities for each head)
test_pred_adhd, test_pred_sex = final_model.predict(test_combined.values)


In [None]:
# `test_pred_adhd` / `test_pred_sex` are the test-set probabilities produced
# by the final-model cell directly above. Predicting again here would only
# repeat the same forward pass, so they are reused as-is.

# Convert probabilities to binary using the best thresholds
final_test_pred_adhd = (test_pred_adhd > best_adhd_threshold).astype(int)
final_test_pred_sex  = (test_pred_sex > best_sex_threshold).astype(int)

# Prepare and save the submission with binary outputs.
# ravel() flattens each prediction array to one value per participant.
submission = pd.DataFrame({
    'participant_id': test_combined.index,
    'ADHD_Outcome': final_test_pred_adhd.ravel(),
    'Sex_F': final_test_pred_sex.ravel()
})
submission.to_csv("submission.csv", index=False)
print("Submission saved to submission.csv")