# 1. Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LassoCV

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
# Global seaborn theme applied to every figure drawn in this notebook.
sns.set_theme(style="whitegrid", palette="deep", font_scale=1.5)

# Random seed handed to the imputer below; the modeling section redefines it
# with the same value before seeding TensorFlow and NumPy.
SEED = 42

In [None]:
# Raw-data path, kept for reference: merges the quantitative and categorical
# metadata sheets on participant_id. The notebook currently starts from the
# pre-processed CSV exports instead.
# train_q = pd.read_excel("data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx")
# train_c = pd.read_excel("data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx")
# train_combined = pd.merge(train_q, train_c, on='participant_id', how='left').set_index("participant_id")


# test_q = pd.read_excel("data/TEST/TEST_QUANTITATIVE_METADATA.xlsx")
# test_c = pd.read_excel("data/TEST/TEST_CATEGORICAL_METADATA.xlsx")
# test_combined = pd.merge(test_q, test_c, on='participant_id', how='left').set_index("participant_id")

train_combined = pd.read_csv("train_processed.csv").set_index("participant_id")
test_combined = pd.read_csv("test_processed.csv").set_index("participant_id")

labels = pd.read_excel("data/TRAIN/TRAINING_SOLUTIONS.xlsx").set_index("participant_id")

# Sort every frame by participant_id so features and labels line up row by row.
train_combined = train_combined.sort_index()
test_combined = test_combined.sort_index()
labels = labels.sort_index()

# Index.equals also copes with frames of different length; an elementwise
# `==` would raise a ValueError there instead of failing the assertion.
assert train_combined.index.equals(labels.index), (
    "train_combined and labels must contain exactly the same participant_ids"
)

In [None]:
# Preview the first rows of the combined training features.
train_combined.head()

In [None]:
label_df = labels.copy()

# One count plot per target: (column, seaborn palette, title, tick labels for 0 / 1).
label_plot_specs = [
    ("ADHD_Outcome", "Set2", "Distribution of ADHD_Outcome", ['No ADHD (0)', 'ADHD (1)']),
    ("Sex_F", "pastel", "Distribution of Sex_F", ['Male (0)', 'Female (1)']),
]

for target_column, palette_name, plot_title, tick_labels in label_plot_specs:
    plt.figure(figsize=(6, 4))
    sns.countplot(x=label_df[target_column], palette=palette_name)
    plt.title(plot_title)
    plt.xticks([0, 1], tick_labels)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

# 2. EDA

In [None]:
# List every available feature column, then how many there are.
print(train_combined.columns)
print(len(train_combined.columns))

In [None]:
import math

# Columns treated as numeric: they go through the iterative imputer and the
# standard scaler in the preprocessing pipeline, and are plotted as histograms.
numerical_features = [
    'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP',
    'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan', 
]
# Number of numeric features, as a quick sanity check.
print(len(numerical_features))

def _plot_numeric_classes(df, col, bins=10, hist=True, kde=True):
    """Draw the distribution of one numeric column on the current axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Name of the column to plot.
    bins : int, default 10
        Number of histogram bins.
    hist : bool, default True
        Draw a histogram.
    kde : bool, default True
        Overlay a red kernel density estimate.
    """
    if hist:
        # A KDE curve is on a density scale. When both layers are requested the
        # histogram is normalised to density as well; on a count axis the curve
        # would be squashed against zero and effectively invisible.
        hist_stat = "density" if kde else "count"
        sns.histplot(df[col], bins=bins, kde=False, stat=hist_stat)
    if kde:
        sns.kdeplot(df[col], color="red")

def _distribution_numeric(df, numeric_cols, figsize=(12, 6), bins = 10):
    """Plot every column of ``numeric_cols`` in a grid that is four panels wide.

    The figure width is ``figsize[0]``; its height grows with the number of
    grid rows, each row getting a third of ``figsize[1]``. Each panel is drawn
    by ``_plot_numeric_classes`` and titled with the column name.
    """
    panels_per_row = 4
    row_count = math.ceil(len(numeric_cols) / panels_per_row)

    # Height scales with the number of rows so panels keep a constant size.
    figure_height = row_count * (figsize[1] / 3)
    plt.figure(figsize=(figsize[0], figure_height))
    plt.subplots_adjust(wspace=0.2, hspace=0.5)

    for panel_number, column_name in enumerate(numeric_cols, start=1):
        plt.subplot(row_count, panels_per_row, panel_number)
        _plot_numeric_classes(df, column_name, bins=bins)
        plt.title(column_name)

    # Re-flow the panels so titles do not overlap neighbouring axes.
    plt.tight_layout()
    plt.show()
    
# Draw the distribution grid for every numeric training feature.
_distribution_numeric(train_combined, numerical_features)

In [None]:
# Columns treated as categorical: they are imputed with the most frequent
# value and one-hot encoded in the preprocessing pipeline.
categorical_features = [
'Basic_Demos_Enroll_Year', 'PreInt_Demos_Fam_Child_Ethnicity',
'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location',
'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ',
'Laterality_Category'
]

# Number of categorical features, as a quick sanity check.
print(len(categorical_features))

def _plot_bar_classes(df, cols):
    """Draw a bar chart of the value counts of ``df[cols]`` via pandas plotting."""
    category_counts = df[cols].value_counts()
    category_counts.plot.bar()

def _distribution_cate(df, cate_cols, row = 1, col = 2, figsize = (20, 5)):
    """Plot value-count bar charts for categorical columns in a ``row`` x ``col`` grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns.
    cate_cols : sequence of str
        Columns to plot, in order. Only the first ``row * col`` fit in the grid;
        any remaining columns are skipped and reported.
    row, col : int
        Grid dimensions.
    figsize : tuple
        Size passed to ``plt.figure``.

    Plotting stops at the first column that cannot be drawn. The failure is
    printed rather than swallowed silently. ``print`` is used on purpose: this
    notebook globally ignores warnings.
    """
    plt.figure(figsize = figsize)
    plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=0.5)

    capacity = row * col
    if len(cate_cols) > capacity:
        print(
            f"_distribution_cate: grid holds {capacity} plot(s); "
            f"skipping the last {len(cate_cols) - capacity} column(s)"
        )

    for position, column_name in enumerate(cate_cols[:capacity], start=1):
        plt.subplot(row, col, position)
        try:
            _plot_bar_classes(df, column_name)
        except Exception as exc:  # keeps the notebook running, but says why it stopped
            print(f"_distribution_cate: stopped at {column_name!r}: {exc!r}")
            break
        plt.title(column_name)

# _distribution_cate(train_combined, categorical_features, row = 3, col = 3, figsize = (12, 10))

In [None]:
# Total number of missing cells in each split. A notebook cell only renders
# its last expression, so both counts are returned together in one Series.
pd.Series(
    {
        "train": train_combined.isnull().sum().sum(),
        "test": test_combined.isnull().sum().sum(),
    },
    name="missing_values",
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories that only appear in the test set are encoded as an
        # all-zero row instead of raising when the test set is transformed.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numeric branch: model-based imputation (each feature regressed on the others
# with LassoCV), followed by standardisation.
num_pipeline = Pipeline(
    steps=[
        ('imputer', IterativeImputer(
            estimator=LassoCV(random_state=SEED),
            max_iter=100,
            random_state=SEED,
        )),
        ('scaler', StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features),
    ],
    # Always return a dense array. The one-hot block is sparse, and with the
    # default threshold the stacked output can become a scipy sparse matrix,
    # which the DataFrame construction further down cannot consume.
    sparse_threshold=0,
)

cleaned_train = preprocessor.fit_transform(train_combined)
# cleaned_test = preprocessor.transform(test_combined)
print("Train shape: ", cleaned_train.shape)
# print("Test shape: ", cleaned_test.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Column names of the transformed matrix: the numeric features keep their
# names, followed by the one-hot columns generated by the fitted encoder.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
onehot_feature_names = onehot_encoder.get_feature_names_out(categorical_features)

cleaned_train_df = pd.DataFrame(
    cleaned_train,
    index=train_combined.index,
    columns=numerical_features + list(onehot_feature_names),
)


# 3. Modeling

Train a deep neural network on this dataset with the following setup:

+ Good weight initialization

+ Repeated stratified K-fold cross-validation with k = 10 and n_repeats = 3

+ Number of epochs = 100

+ Batch size = 64

+ L2 regularization with lambda = 0.01

+ Dropout rate = 0.3

+ Early stopping with patience = 40


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

import warnings
# NOTE(review): repeats the global warning filter from the setup cell.
warnings.filterwarnings("ignore")
# Same value as the SEED defined in the setup cell.
SEED = 42
tf.random.set_seed(SEED) # Set TF seed for reproducibility
np.random.seed(SEED) # Set numpy seed for other operations

# Feature matrix: a copy of the preprocessed training frame.
X = cleaned_train_df.copy()

# The two targets, selected in the same row order as X.
y = labels.loc[X.index, ["ADHD_Outcome", "Sex_F"]].copy()

# Combined string key so the folds can be stratified on both targets jointly.
stratify_y = y['ADHD_Outcome'].astype(str) + '_' + y['Sex_F'].astype(str)

# NumPy arrays consumed by the splitter and the Keras training loop.
X_np = X.values
y_np = y.values
stratify_y_np = stratify_y.values

print("Shape of X_np (features):", X_np.shape)
print("Shape of y_np (labels):", y_np.shape)
print("Shape of stratify_y_np (stratification key):", stratify_y_np.shape)

In [None]:
def calculate_fold_metric_corrected(y_true_fold, y_pred_proba_fold):
    """
    Score one validation fold with the competition metric at a 0.5 threshold.

    Column 0 of both arrays is read as ADHD_Outcome and column 1 as Sex_F.
    The ADHD term is a binary F1 in which rows that are truly ADHD-positive
    and truly female carry sample weight 2 (all other rows weight 1); the
    Sex_F term is a macro-averaged F1.

    Returns a tuple ``(average of both terms, ADHD weighted F1, Sex_F macro F1)``.
    """
    hard_predictions = (y_pred_proba_fold > 0.5).astype(int)

    adhd_true, sex_true = y_true_fold[:, 0], y_true_fold[:, 1]
    adhd_pred, sex_pred = hard_predictions[:, 0], hard_predictions[:, 1]

    # Female ADHD cases count double in the ADHD F1.
    is_female_adhd = (adhd_true == 1) & (sex_true == 1)
    row_weights = np.where(is_female_adhd, 2, 1)

    adhd_weighted_f1 = f1_score(adhd_true, adhd_pred, sample_weight=row_weights, zero_division=0)
    sex_f_macro_f1 = f1_score(sex_true, sex_pred, average='macro', zero_division=0)

    return (adhd_weighted_f1 + sex_f_macro_f1) / 2, adhd_weighted_f1, sex_f_macro_f1


def create_dnn_model(input_dim, l2_lambda=0.01, dropout_rate=0.3):
    """Build and compile the two-output feed-forward classifier.

    Architecture: Dense(128, relu) -> Dropout -> Dense(64, relu) -> Dropout ->
    Dense(2, sigmoid). The two sigmoid units are trained jointly with binary
    cross-entropy, one per target column of the label matrix.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    l2_lambda : float, default 0.01
        L2 penalty applied to the kernels of the hidden layers.
    dropout_rate : float, default 0.3
        Dropout rate applied after each hidden layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model with freshly initialised weights.
    """
    model = Sequential([
        Input(shape=(input_dim,)),

        # He-normal initialisation is scaled for ReLU units; the Keras default
        # (Glorot uniform) is derived for symmetric activations.
        Dense(128, activation='relu', kernel_initializer='he_normal',
              kernel_regularizer=l2(l2_lambda)),
        Dropout(dropout_rate),

        Dense(64, activation='relu', kernel_initializer='he_normal',
              kernel_regularizer=l2(l2_lambda)),
        Dropout(dropout_rate),

        # Glorot uniform (the Keras default) is kept for the sigmoid output layer.
        Dense(2, activation='sigmoid', kernel_initializer='glorot_uniform')
    ])

    model.compile(optimizer=Adam(),
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'f1_score'])

    return model

In [None]:
# --- K-fold Cross-Validation Setup ---

n_splits = 10      # Number of folds
n_repeats = 3      # Number of repetitions
epochs = 100       # Max epochs per fold
batch_size = 64    # Batch size
l2_lambda = 0.01   # L2 regularization strength
dropout_rate = 0.3 # Dropout rate
es_patience = 40   # Early stopping patience

rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=SEED)

# Lists to store metrics and OOF predictions from CV
fold_competition_scores = []
fold_adhd_weighted_f1s = []
fold_sex_f_macro_f1s = []
# Per fold: the epoch whose weights were kept (lowest val_loss), i.e. how many
# epochs the evaluated model was effectively trained for. The final model is
# later trained for the average of these values.
fold_epochs_trained = []

# Lists to collect OOF predictions and corresponding true labels for threshold optimization
oof_preds_list = []
oof_y_true_list = []

print(f"\n--- Starting Repeated Stratified K-Fold Cross-Validation ({n_repeats} repeats of {n_splits} folds) ---")
fold_count = 0

# rskf.split yields integer indices referencing the rows of X_np and y_np;
# stratification uses the combined ADHD/Sex key.
for train_index, val_index in rskf.split(X_np, stratify_y_np):
    fold_count += 1
    print(f"\n--- Fold {fold_count}/{n_splits * n_repeats} ---")

    # Split data for the current fold using integer indices
    X_train_fold, X_val_fold = X_np[train_index], X_np[val_index]
    y_train_fold, y_val_fold = y_np[train_index], y_np[val_index] # y_np has shape (N, 2)

    # Create a new model instance for each fold to ensure fresh weights
    model = create_dnn_model(input_dim=X_train_fold.shape[1], l2_lambda=l2_lambda, dropout_rate=dropout_rate)

    # Define Early Stopping callback
    # NOTE(review): the fold used for early stopping is also the fold that is
    # scored below, so the reported scores are slightly optimistic.
    early_stopping = EarlyStopping(monitor='val_loss', patience=es_patience, restore_best_weights=True)

    print(f"Training model for Fold {fold_count}...")
    # Train the model on the training portion of the fold
    history = model.fit(X_train_fold, y_train_fold,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(X_val_fold, y_val_fold), # Evaluate on the validation portion
                        callbacks=[early_stopping],
                        verbose=0) # Set verbose=1 to see training progress per epoch

    epochs_ran = len(history.history['loss'])
    # With restore_best_weights=True the kept weights come from the epoch with
    # the lowest val_loss, not from the last epoch run: after the best epoch
    # training continues for up to `es_patience` more epochs. Recording the
    # best epoch keeps the final model from being trained that much too long.
    best_epoch = int(np.argmin(history.history['val_loss'])) + 1
    print(f"Finished training for Fold {fold_count}. Epochs trained: {epochs_ran} (best epoch: {best_epoch})")
    fold_epochs_trained.append(best_epoch)

    # --- Evaluate on the Validation Fold using the Competition Metric (Default Threshold 0.5) ---
    print("Evaluating on validation fold (using default 0.5 threshold)...")
    # Predict probabilities on the validation fold data (no per-fold progress bar)
    val_preds_proba = model.predict(X_val_fold, verbose=0)

    # Calculate the custom competition metric with default 0.5 threshold
    fold_comp_score, fold_adhd_w_f1, fold_sex_f_f1 = calculate_fold_metric_corrected(
        y_val_fold,      # Pass true labels for the fold (contains ADHD and Sex_F)
        val_preds_proba  # Pass predicted probabilities for the fold
    )

    print(f"Fold {fold_count} Metrics (Default 0.5 Threshold):")
    print(f"  Weighted ADHD F1: {fold_adhd_w_f1:.4f}")
    print(f"  Sex_F Macro F1:   {fold_sex_f_f1:.4f}")
    print(f"  Competition Avg F1: {fold_comp_score:.4f}")

    # Store fold metrics
    fold_competition_scores.append(fold_comp_score)
    fold_adhd_weighted_f1s.append(fold_adhd_w_f1)
    fold_sex_f_macro_f1s.append(fold_sex_f_f1)

    # --- Collect OOF Predictions and True Labels ---
    # Store the predictions and true labels for this validation fold
    oof_preds_list.append(val_preds_proba)
    oof_y_true_list.append(y_val_fold)


print("\n--- Cross-Validation Complete ---")

In [None]:
# --- Aggregate OOF Predictions and True Labels ---
# Stack the per-fold validation predictions and labels into single arrays.
# Because the K-fold is repeated, every sample appears once per repeat. That is
# acceptable for threshold optimization; a single OOF prediction per sample
# (for ensembling or a final evaluation) would need the duplicates resolved.
all_oof_preds = np.concatenate(oof_preds_list, axis=0)
all_oof_y_true = np.concatenate(oof_y_true_list, axis=0)

print(f"\nAggregated OOF predictions shape: {all_oof_preds.shape}")
print(f"Aggregated OOF true labels shape: {all_oof_y_true.shape}")


# Mean and standard deviation of each per-fold metric at the default threshold.
avg_comp_score_default_thresh, std_comp_score_default_thresh = (
    np.mean(fold_competition_scores), np.std(fold_competition_scores)
)
avg_adhd_w_f1_default_thresh, std_adhd_w_f1_default_thresh = (
    np.mean(fold_adhd_weighted_f1s), np.std(fold_adhd_weighted_f1s)
)
avg_sex_f_f1_default_thresh, std_sex_f_f1_default_thresh = (
    np.mean(fold_sex_f_macro_f1s), np.std(fold_sex_f_macro_f1s)
)
avg_epochs = np.mean(fold_epochs_trained)

print("\n--- Cross-Validation Results Summary (Default 0.5 Threshold) ---")
print(f"Total Folds Run: {fold_count}")
print(f"Average Epochs Trained per Fold: {avg_epochs:.1f}")
print("\nAverage Metrics Across All Folds:")
print(f"  Weighted ADHD F1: {avg_adhd_w_f1_default_thresh:.4f} +/- {std_adhd_w_f1_default_thresh:.4f}")
print(f"  Sex_F Macro F1:   {avg_sex_f_f1_default_thresh:.4f} +/- {std_sex_f_f1_default_thresh:.4f}")
print(f"  Competition Avg F1: {avg_comp_score_default_thresh:.4f} +/- {std_comp_score_default_thresh:.4f}")

In [None]:
# --- Threshold Optimization on OOF Predictions ---

print("\n--- Starting Threshold Optimization on OOF Data ---")

# Competition metric evaluated with one decision threshold per target
def calculate_oof_metric_with_thresholds(y_true_oof, y_pred_proba_oof, adhd_threshold, sex_f_threshold):
    """
    Competition metric on OOF data for a given pair of decision thresholds.

    Column 0 of both arrays is read as ADHD_Outcome and column 1 as Sex_F.
    A probability strictly above its threshold becomes a positive prediction.
    Returns the mean of the weighted ADHD F1 (truly female ADHD rows weigh 2)
    and the macro F1 for Sex_F.
    """
    # Binarise each target with its own threshold
    adhd_pred = (y_pred_proba_oof[:, 0] > adhd_threshold).astype(int)
    sex_pred = (y_pred_proba_oof[:, 1] > sex_f_threshold).astype(int)

    adhd_true = y_true_oof[:, 0]
    sex_true = y_true_oof[:, 1]

    # The weights depend on the true labels only, never on the predictions
    row_weights = np.where((adhd_true == 1) & (sex_true == 1), 2, 1)
    adhd_term = f1_score(adhd_true, adhd_pred, sample_weight=row_weights, zero_division=0)

    sex_term = f1_score(sex_true, sex_pred, average='macro', zero_division=0)

    return (adhd_term + sex_term) / 2

# Grid search for the best thresholds
# Search range: from 0.05 to 0.95 with a step (adjust range and step as needed for thoroughness vs speed)
# The stop value 0.96 is exclusive, so the grid is meant to end at 0.95.
threshold_range = np.arange(0.05, 0.96, 0.01) # Example range, adjust if necessary

# Starting values, overwritten by the search in the next cell.
best_oof_score = -1 # Initialize with a score lower than any possible F1
best_adhd_thresh = 0.5 # Default starting point
best_sex_f_thresh = 0.5 # Default starting point

# The step is reported from the first two grid points.
print(f"Searching thresholds from {threshold_range[0]:.2f} to {threshold_range[-1]:.2f} with step {threshold_range[1]-threshold_range[0]:.2f}...")

In [None]:
# The competition score is the mean of an ADHD term that depends only on the
# ADHD threshold and a Sex_F term that depends only on the Sex_F threshold.
# Each threshold can therefore be optimised on its own while the other is held
# fixed: 2 * len(threshold_range) metric evaluations instead of
# len(threshold_range) ** 2 for the exhaustive pairwise grid.
fixed_reference_thresh = 0.5  # any constant works for the target not being searched

adhd_search_scores = [
    calculate_oof_metric_with_thresholds(all_oof_y_true, all_oof_preds, adhd_thresh, fixed_reference_thresh)
    for adhd_thresh in threshold_range
]
sex_f_search_scores = [
    calculate_oof_metric_with_thresholds(all_oof_y_true, all_oof_preds, fixed_reference_thresh, sex_f_thresh)
    for sex_f_thresh in threshold_range
]

# np.argmax returns the first maximum, i.e. the lowest threshold among ties.
# That is the same pair an exhaustive search updating on `score > best` keeps.
best_adhd_thresh = threshold_range[int(np.argmax(adhd_search_scores))]
best_sex_f_thresh = threshold_range[int(np.argmax(sex_f_search_scores))]
best_oof_score = calculate_oof_metric_with_thresholds(
    all_oof_y_true, all_oof_preds, best_adhd_thresh, best_sex_f_thresh
)

print("\n--- Threshold Optimization Complete ---")
print(f"Best OOF Competition Score Found: {best_oof_score:.4f}")
print(f"Optimal ADHD Threshold: {best_adhd_thresh:.4f}")
print(f"Optimal Sex_F Threshold: {best_sex_f_thresh:.4f}")
print(f"Improvement over Default 0.5 Threshold: {best_oof_score - avg_comp_score_default_thresh:.4f}")

In [None]:
# Apply the preprocessing fitted on the training data to the test features.
cleaned_test = preprocessor.transform(test_combined)

# Output column names: numeric features first, then the one-hot columns
# produced by the fitted encoder of the categorical branch.
cat_transformer = preprocessor.named_transformers_['cat']
num_cols_out = list(numerical_features)
cat_cols_out = cat_transformer.named_steps['onehot'].get_feature_names_out(categorical_features)
all_cols_out = [*num_cols_out, *cat_cols_out]

# Keep the original test participant_ids as the index.
cleaned_test_df = pd.DataFrame(cleaned_test, index=test_combined.index, columns=all_cols_out)

X_test_np = cleaned_test_df.values

print("Shape of preprocessed test data (cleaned_test_df):", cleaned_test_df.shape)
print("Shape of test data numpy array (X_test_np):", X_test_np.shape)

In [None]:
# --- Train Final Model on Entire Training Data ---

print("\n--- Training Final Model on Entire Training Data ---")

# Create the final model instance with the same architecture and hyperparameters
final_model = create_dnn_model(input_dim=X_np.shape[1], l2_lambda=l2_lambda, dropout_rate=dropout_rate)

# Train for the average number of epochs found during CV
# Use np.ceil to round up, ensure at least 1 epoch
final_epochs = max(1, int(np.ceil(avg_epochs)))

print(f"Training final model for {final_epochs} epochs on the entire training dataset ({X_np.shape[0]} samples)...")

# Train the model on the full training data (features X_np, labels y_np)
# Use verbose=1 to show the training progress
# No validation data or callbacks are passed, so the run lasts exactly
# `final_epochs` epochs with no early stopping.
final_model.fit(X_np, y_np,
                epochs=final_epochs,
                batch_size=batch_size, # Use the same batch size as in CV
                verbose=1)

# Predicted probabilities for the preprocessed test set.
test_predictions_proba = final_model.predict(X_test_np)

# test_predictions_proba has shape (Number of test samples, 2)
# Column 0 is the predicted probability for ADHD_Outcome=1
# Column 1 is the predicted probability for Sex_F=1
print("Shape of test predictions (probabilities):", test_predictions_proba.shape)

In [None]:
# Prerequisites from earlier cells:
#   - best_adhd_thresh / best_sex_f_thresh from the threshold optimization,
#   - test_predictions_proba from final_model.predict(X_test_np),
#   - test_combined indexed by participant_id.

# --- Create Submission File (using BINARY predictions with optimal thresholds) ---

print("\n--- Creating Submission File (Binary Predictions) ---")

# participant_ids come from the index of the original test data
test_participant_ids = test_combined.index

# --- Binarise the probabilities with the thresholds tuned on the OOF data ---
# Column 0 is ADHD_Outcome, column 1 is Sex_F; a probability strictly above
# its threshold becomes 1, anything else 0.
test_predictions_binary = np.column_stack([
    (test_predictions_proba[:, 0] > best_adhd_thresh).astype(int),
    (test_predictions_proba[:, 1] > best_sex_f_thresh).astype(int),
])

# --- Assemble the submission table ---
# The submission format expects 0/1 values in the exact column order
# participant_id, ADHD_Outcome, Sex_F.
submission_columns = ['participant_id', 'ADHD_Outcome', 'Sex_F']
submission_df = pd.DataFrame({
    'participant_id': test_participant_ids,
    'ADHD_Outcome': test_predictions_binary[:, 0],
    'Sex_F': test_predictions_binary[:, 1],
})
submission_df = submission_df[submission_columns]

# Quick visual check: the prediction columns should contain only 0s and 1s.
print("\nSubmission file head (Binary Predictions):")
print(submission_df.head())

# --- Write the CSV ---
# index=False keeps the DataFrame index out of the file; the default comma
# separator is the expected one.
submission_filename = "submission.csv"
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file successfully created with binary predictions and saved as '{submission_filename}'")

# Note: the submitted labels are the model's raw probabilities thresholded
# with the values that maximised the competition metric on the OOF data.