In [None]:
# %% [code]
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin, clone
import os

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Random seed reused as `random_state` for the CV splitter and the LogisticRegression models below.
SEED = 42
# Number of splits passed to StratifiedKFold.
FOLDS = 5

# %% [code]
# Load the pre-processed feature tables and the training labels, all keyed by participant_id.
train_combined = pd.read_csv("train_processed.csv").set_index("participant_id")
test_combined = pd.read_csv("test_processed.csv").set_index("participant_id")
labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Sort both training tables by participant_id so rows line up positionally.
train_combined = train_combined.sort_index()
labels = labels.sort_index()
# Index.equals handles a length mismatch gracefully (returns False), so the
# assertion message is what the user sees instead of a ValueError raised by an
# element-wise `==` on indexes of different length.
assert train_combined.index.equals(labels.index), "Label IDs do not match train IDs"

In [None]:
# %% [code]
# Define a helper function for additional feature engineering. 
def add_new_features(df):
    """Return a copy of ``df`` with engineered SDQ / demographic features added.

    Required columns: ``SDQ_SDQ_Hyperactivity``, ``SDQ_SDQ_Externalizing``,
    ``SDQ_SDQ_Internalizing`` and ``SDQ_SDQ_Emotional_Problems`` (a missing one
    raises ``KeyError``). The columns ``MRI_Track_Age_at_Scan``,
    ``Basic_Demos_Enroll_Year`` and ``APQ_P_APQ_P_ID`` are optional; their
    derived features are only created when the source column is present.

    The input frame is not modified: all new columns are written to a copy,
    so calling this repeatedly on the same frame is safe.

    Caveat: ``Relative_Enroll_Year`` and ``Enroll_Year_Bin`` are computed from
    the minimum / value range of the frame passed in, so train and test frames
    processed separately get frame-specific offsets and bin edges.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing at least the required SDQ columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus the engineered ones.
    """
    epsilon = 1e-5  # small offset to avoid division by zero in the ratios
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # --- Features based on high correlation analysis ---
    # Interaction: product of SDQ hyperactivity and externalizing
    df['I_Hyper_External'] = df['SDQ_SDQ_Hyperactivity'] * df['SDQ_SDQ_Externalizing']

    # Difference: externalizing minus internalizing
    df['Diff_External_Internal'] = df['SDQ_SDQ_Externalizing'] - df['SDQ_SDQ_Internalizing']

    # Ratio: externalizing divided by internalizing (epsilon avoids division by zero)
    df['Ratio_External_Internal'] = df['SDQ_SDQ_Externalizing'] / (df['SDQ_SDQ_Internalizing'] + epsilon)

    # Composite score: average of hyperactivity, externalizing, and emotional problems
    df['Composite_SDQ'] = (df['SDQ_SDQ_Hyperactivity'] +
                           df['SDQ_SDQ_Externalizing'] +
                           df['SDQ_SDQ_Emotional_Problems']) / 3.0
    # Log transform to reduce skewness.
    # NOTE(review): log1p is undefined for values <= -1; assumes the composite is
    # non-negative (raw SDQ scores) — confirm if inputs may already be standardized.
    df['Log_Composite_SDQ'] = np.log1p(df['Composite_SDQ'])

    # --- Domain knowledge based features ---
    # Age-adjusted hyperactivity score (if age is available)
    if 'MRI_Track_Age_at_Scan' in df.columns:
        df['Age_Adjusted_Hyperactivity'] = df['SDQ_SDQ_Hyperactivity'] / (df['MRI_Track_Age_at_Scan'] + epsilon)

    # Enrollment year effects (if available)
    if 'Basic_Demos_Enroll_Year' in df.columns:
        df['Relative_Enroll_Year'] = df['Basic_Demos_Enroll_Year'] - df['Basic_Demos_Enroll_Year'].min()
        # Three equal-width bins over this frame's year range, labelled 1..3
        # (the result is a pandas Categorical column).
        df['Enroll_Year_Bin'] = pd.cut(df['Basic_Demos_Enroll_Year'], bins=3, labels=[1, 2, 3])

    # Interaction between parenting score and composite SDQ score
    if 'APQ_P_APQ_P_ID' in df.columns:
        df['Parent_Behavior_Interaction'] = df['APQ_P_APQ_P_ID'] * df['Composite_SDQ']

    return df

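# Apply feature engineering to both training and test sets.
# NOTE(review): these calls are currently disabled, yet features_adhd / features_sex
# below reference the engineered columns. This presumably relies on the processed
# CSVs already containing them — confirm, otherwise re-enable the two lines.
# train_combined = add_new_features(train_combined)
# test_combined = add_new_features(test_combined)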

In [None]:
# Standardize every column: statistics are learned from the training table only
# and then applied to both tables, keeping each frame's index and column labels.
scaler = StandardScaler().fit(train_combined)
train_combined, test_combined = (
    pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)
    for frame in (train_combined, test_combined)
)

In [None]:
# %% [code]
# Pull the two binary targets out of the label table.
y_adhd, y_sex = (labels[target] for target in ('ADHD_Outcome', 'Sex_F'))
# Joint label as a two-character string (ADHD digit followed by sex digit,
# e.g. "11" for female ADHD), kept around for later use if needed.
combinations = y_adhd.astype(str) + y_sex.astype(str)

In [None]:
# Columns fed to the ADHD model: questionnaire / demographic columns followed by
# the engineered columns.
features_adhd = [
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Prosocial',
    'Basic_Demos_Enroll_Year',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_ID',
] + [
    # Engineered columns
    'I_Hyper_External',
    'Diff_External_Internal',
    'Ratio_External_Internal',
    'Composite_SDQ',
    'Log_Composite_SDQ',
    'Enroll_Year_Bin',
    'Relative_Enroll_Year',
    'Parent_Behavior_Interaction',
]

# Columns that get an "I_<name>" interaction counterpart in the Sex feature list.
interactions = [
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Peer_Problems',
    'Basic_Demos_Enroll_Year',
    'APQ_P_APQ_P_ID',
]

# Columns fed to the Sex model: base columns, the ADHD probability, one
# interaction column per entry of `interactions`, and two engineered columns.
features_sex = [
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Prosocial',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Emotional_Problems',
    'ColorVision_CV_Score',
    'APQ_P_APQ_P_PP',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD',
    'SDQ_SDQ_Internalizing',
    'adhd_proba',
    *("I_" + name for name in interactions),
    'Parent_Behavior_Interaction',
    'Relative_Enroll_Year',
]

In [None]:
# %% [code]
# Define a custom threshold optimizer that wraps a base classifier.
from sklearn.base import BaseEstimator, ClassifierMixin, clone

class ThresholdOptimizer(BaseEstimator, ClassifierMixin):
    """Binary-classifier wrapper that tunes its decision threshold for F1.

    A clone of ``base_estimator`` is fitted; when ``optimize`` is true, 81
    evenly spaced thresholds in [0.1, 0.9] are evaluated on the *training*
    predictions and the one with the highest F1 is kept.

    Parameters
    ----------
    base_estimator : estimator
        Classifier exposing ``fit`` and ``predict_proba``; column 1 of
        ``predict_proba`` is treated as the positive-class probability.
    threshold : float, default=0.5
        Decision threshold. Overwritten by ``fit`` when ``optimize`` is true
        (kept on this attribute because callers read ``.threshold`` after fitting).
    optimize : bool, default=True
        Whether ``fit`` searches for the F1-maximizing threshold.

    Attributes
    ----------
    base_estimator_ : estimator
        The fitted clone of ``base_estimator``.
    classes_ : ndarray
        Class labels seen during ``fit``.
    """

    def __init__(self, base_estimator, threshold=0.5, optimize=True):
        self.base_estimator = base_estimator
        self.threshold = threshold
        self.optimize = optimize

    def fit(self, X, y):
        """Fit the wrapped classifier and, optionally, tune the threshold.

        Returns ``self``.
        """
        self.base_estimator_ = clone(self.base_estimator)
        self.base_estimator_.fit(X, y)
        # scikit-learn expects fitted classifiers to expose `classes_`
        # (Pipeline.classes_ delegates to the final step, and scoring /
        # cross-validation helpers read it). Fall back to the labels in `y`
        # if the wrapped estimator does not provide the attribute.
        self.classes_ = getattr(self.base_estimator_, "classes_", np.unique(y))
        if self.optimize:
            y_pred_proba = self.base_estimator_.predict_proba(X)[:, 1]
            best_thresh = 0.5
            best_score = 0
            # Sweep through a range of thresholds to maximize F1 score
            for t in np.linspace(0.1, 0.9, 81):
                score = f1_score(y, (y_pred_proba > t).astype(int))
                if score > best_score:
                    best_score = score
                    best_thresh = t
            self.threshold = best_thresh
            print(f"Optimized threshold: {self.threshold:.2f} (F1: {best_score:.4f})")
        return self

    def predict(self, X):
        """Return 0/1 predictions: 1 where the positive-class probability exceeds the threshold."""
        proba = self.base_estimator_.predict_proba(X)[:, 1]
        return (proba > self.threshold).astype(int)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities unchanged."""
        return self.base_estimator_.predict_proba(X)

In [None]:
# Seeded, shuffled stratified splitter used as the `cv` argument of the grid search.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# ADHD pipeline: standardize, then a class-balanced logistic regression wrapped
# in the F1 threshold optimizer.
pipeline_adhd = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', ThresholdOptimizer(
        base_estimator=LogisticRegression(
            solver='saga',
            fit_intercept=True,
            class_weight='balanced',
            random_state=SEED,
        ),
        optimize=True,
    )),
])

# Hyperparameters explored for the wrapped LogisticRegression:
# regularization strength and penalty type.
param_grid_adhd = dict(
    model__base_estimator__C=[0.01, 0.1, 1, 10, 100],
    model__base_estimator__penalty=['l1', 'l2'],
)

# Exhaustive search scored by F1, parallelized over all available cores.
grid_search_adhd = GridSearchCV(
    estimator=pipeline_adhd,
    param_grid=param_grid_adhd,
    cv=skf,
    scoring='f1',
    n_jobs=-1,
)

In [None]:
# Run the ADHD grid search on the training table, then report the selected
# hyperparameters, the mean CV F1, and the threshold of the refit model.
grid_search_adhd.fit(train_combined.loc[:, features_adhd], y_adhd)
print("Best ADHD model parameters:", grid_search_adhd.best_params_)
print("Best cross-validated F1 for ADHD:", grid_search_adhd.best_score_)
print("Optimized ADHD threshold:", grid_search_adhd.best_estimator_.named_steps.model.threshold)

In [None]:
# ADHD probabilities that feed the Sex model as a stacked feature.
#
# Training rows get OUT-OF-FOLD probabilities: each row is scored by a clone of
# the tuned pipeline that never saw that row's label. Scoring the training rows
# with the fully fitted model instead would leak the ADHD target into the
# feature and make it look more informative on train than it is on test.
adhd_train_X = train_combined[features_adhd]
train_adhd_proba = np.full(len(adhd_train_X), np.nan)
for fit_idx, holdout_idx in skf.split(adhd_train_X, y_adhd):
    fold_model = clone(grid_search_adhd.best_estimator_)
    fold_model.fit(adhd_train_X.iloc[fit_idx], y_adhd.iloc[fit_idx])
    train_adhd_proba[holdout_idx] = fold_model.predict_proba(adhd_train_X.iloc[holdout_idx])[:, 1]
# Every training row must land in exactly one hold-out fold.
assert not np.isnan(train_adhd_proba).any(), "Some training rows received no out-of-fold prediction"

# Test rows are scored by the pipeline refit on the full training set.
test_adhd_proba = grid_search_adhd.predict_proba(test_combined[features_adhd])[:, 1]

# For later use in the Sex model, add the ADHD probability as a new feature.
train_combined['adhd_proba'] = train_adhd_proba
test_combined['adhd_proba'] = test_adhd_proba

# Also create interaction features for Sex prediction based on ADHD probability.
for feat in interactions:
    train_combined[f"I_{feat}"] = train_combined[feat] * train_combined["adhd_proba"]
    test_combined[f"I_{feat}"] = test_combined[feat] * test_combined["adhd_proba"]

In [None]:
# %% [code]
# Sex pipeline: same layout as the ADHD one — standardize, then a class-balanced
# logistic regression wrapped in the F1 threshold optimizer.
pipeline_sex = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', ThresholdOptimizer(
        base_estimator=LogisticRegression(
            solver='saga',
            fit_intercept=True,
            class_weight='balanced',
            random_state=SEED,
        ),
        optimize=True,
    )),
])

# Hyperparameters explored for the wrapped LogisticRegression.
param_grid_sex = dict(
    model__base_estimator__C=[0.01, 0.1, 1, 10, 100],
    model__base_estimator__penalty=['l1', 'l2'],
)

# Exhaustive search scored by F1 on the shared stratified splitter.
grid_search_sex = GridSearchCV(
    estimator=pipeline_sex,
    param_grid=param_grid_sex,
    cv=skf,
    scoring='f1',
    n_jobs=-1,
)

In [None]:
# Run the Sex grid search on the training table (which by now carries the ADHD
# probability and interaction columns), then report the selected configuration.
grid_search_sex.fit(train_combined.loc[:, features_sex], y_sex)
print("Best Sex model parameters:", grid_search_sex.best_params_)
print("Best cross-validated F1 for Sex:", grid_search_sex.best_score_)
print("Optimized Sex threshold:", grid_search_sex.best_estimator_.named_steps.model.threshold)

In [None]:
# %% [code]
# Positive-class (Sex_F) probabilities for the test rows.
test_sex_proba = grid_search_sex.predict_proba(test_combined.loc[:, features_sex])[:, 1]

# Hard 0/1 labels for the submission; `predict` applies each model's optimized threshold.
final_adhd_preds = grid_search_adhd.predict(test_combined.loc[:, features_adhd])
final_sex_preds = grid_search_sex.predict(test_combined.loc[:, features_sex])

In [None]:
# %% [code]
# Create a submission DataFrame using test_combined index and the predictions.
submission = pd.DataFrame({
    "participant_id": test_combined.index,
    "ADHD_Outcome": final_adhd_preds,  # or use test_adhd_proba if probability submission is desired
    "Sex_F": final_sex_preds           # or use test_sex_proba if probability submission is desired
})

# Save the submission file. The path is held in one variable so the success
# message always names the file that was actually written (it previously
# reported 'submission.csv' while writing 'submission1.csv').
submission_path = "submission1.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission file '{submission_path}' created successfully!")