In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import f1_score, classification_report, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

# For Colab only - comment out if running locally
from google.colab import drive
drive.mount('/content/drive')

# Set paths - modify these as needed for your Colab environment
DATA_FOLDER = '/content/drive/MyDrive/Colab Notebooks/data/'  # Change this to your path
FIGURE_FOLDER = '/content/drive/MyDrive/Colab Notebooks/img/'
RESULT_FOLDER = '/content/drive/MyDrive/Colab Notebooks/results/'

# Create directories if they don't exist
import os
os.makedirs(FIGURE_FOLDER, exist_ok=True)
os.makedirs(RESULT_FOLDER, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Section 1--Data Loading and Preprocessing
print("Loading data...")

def check_for_nulls(df):
    """Print out null counts for a dataframe."""
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        print("Null counts:")
        print(null_counts[null_counts > 0])
    else:
        print("No nulls found.")

def load_data(mode='TRAIN', sample_size=None):
    """
    Load and merge all relevant data files.

    Parameters:
    mode (str): 'TRAIN' or 'TEST'
    sample_size (int): If provided, sample the data to speed up processing

    Returns:
    DataFrame with all features combined
    """
    # Load categorical metadata - handle different naming conventions between TRAIN and TEST
    if mode == 'TRAIN':
        cat_meta = pd.read_excel(f"{DATA_FOLDER}{mode}/{mode}_CATEGORICAL_METADATA.xlsx")
    else:  # TEST mode
        cat_meta = pd.read_excel(f"{DATA_FOLDER}{mode}/{mode}_CATEGORICAL.xlsx")

    # Load quantitative metadata
    quant_meta = pd.read_excel(f"{DATA_FOLDER}{mode}/{mode}_QUANTITATIVE_METADATA.xlsx")

    # Load functional connectome matrices
    fcm = pd.read_csv(f"{DATA_FOLDER}{mode}/{mode}_FUNCTIONAL_CONNECTOME_MATRICES.csv")

    # Optional sampling for faster development/testing
    if sample_size and mode == 'TRAIN':
        cat_meta = cat_meta.sample(sample_size, random_state=42)
        participant_ids = cat_meta['participant_id'].values
        quant_meta = quant_meta[quant_meta['participant_id'].isin(participant_ids)]
        fcm = fcm[fcm['participant_id'].isin(participant_ids)]

    # Merge all data sources
    data = cat_meta.merge(quant_meta, on='participant_id', how='left')
    data = data.merge(fcm, on='participant_id', how='left')

    return data

# Decide whether to use full dataset or sample
USE_FULL_DATASET = True  # Set to True for final submission

# Load training data
train = load_data(mode='TRAIN', sample_size=None if USE_FULL_DATASET else 300)
y = pd.read_excel(f"{DATA_FOLDER}TRAIN/TRAINING_SOLUTIONS.xlsx")

# If we sampled the train data, filter y accordingly
if not USE_FULL_DATASET:
    y = y[y['participant_id'].isin(train['participant_id'])]

# Load test data - always use full test dataset for submission
test = load_data(mode='TEST')
print(f"Train: {train.shape}, Test: {test.shape}")

# Load sample submission
sub = pd.read_excel(f"{DATA_FOLDER}SAMPLE_SUBMISSION.xlsx")

# Set participant_id as index for easier handling
train_id = train['participant_id'].copy()
test_id = test['participant_id'].copy()
train.set_index('participant_id', inplace=True)
test.set_index('participant_id', inplace=True)
y_with_id = y.copy()
y.set_index('participant_id', inplace=True)

# Define targets and get features that are common between train and test
targets = ['ADHD_Outcome', 'Sex_F']
common_features = list(set(train.columns) & set(test.columns))
train = train[common_features]
test = test[common_features]

# Check for nulls in the data
print("Checking for nulls in training data:")
check_for_nulls(train)




Loading data...
Train: (1213, 19928), Test: (304, 19928)
Checking for nulls in training data:
Null counts:
PreInt_Demos_Fam_Child_Ethnicity     11
MRI_Track_Age_at_Scan               360
dtype: int64


In [12]:


#Section 2--Feature Engineering and Preprocessing
print("\nPerforming feature engineering...")

# Identify categorical and numerical columns
categorical_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# For numerical columns with nulls, impute with median
num_imputer = SimpleImputer(strategy='median')
train_num = pd.DataFrame(
    num_imputer.fit_transform(train[numerical_cols]),
    columns=numerical_cols,
    index=train.index
)

# For categorical columns with nulls, impute with most frequent value
if categorical_cols:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    train_cat = pd.DataFrame(
        cat_imputer.fit_transform(train[categorical_cols]),
        columns=categorical_cols,
        index=train.index
    )
    # Combine imputed data back
    train_imputed = pd.concat([train_num, train_cat], axis=1)
else:
    train_imputed = train_num

# Apply the same imputation to test data
test_num = pd.DataFrame(
    num_imputer.transform(test[numerical_cols]),
    columns=numerical_cols,
    index=test.index
)
if categorical_cols:
    test_cat = pd.DataFrame(
        cat_imputer.transform(test[categorical_cols]),
        columns=categorical_cols,
        index=test.index
    )
    test_imputed = pd.concat([test_num, test_cat], axis=1)
else:
    test_imputed = test_num

# Identify features for log transformation (handle positivity constraint)
log_features = [f for f in numerical_cols if (train_imputed[f] > 0).all() and
                scipy.stats.skew(train_imputed[f]) > 0.5]

# Get SDQ and APQ columns - known to be important for ADHD prediction
sdq_cols = [col for col in train_imputed.columns if 'SDQ_' in col]
apq_cols = [col for col in train_imputed.columns if 'APQ_' in col]


Performing feature engineering...


In [14]:
from sklearn.base import clone


In [16]:

# Section 3--Model Selection and Cross-Validation =====
print("\nSetting up K-Fold Cross-Validation and Ensemble Model...")

# Define a custom F1 score that gives 2x weight to Female ADHD cases
def custom_f1_scorer(y_true, y_pred):
    # Extract predictions for ADHD and Sex
    y_true_adhd = y_true[:, 0]
    y_true_sex = y_true[:, 1]
    y_pred_adhd = y_pred[:, 0]
    y_pred_sex = y_pred[:, 1]

    # Calculate F1 for each target
    f1_adhd = f1_score(y_true_adhd, y_pred_adhd)
    f1_sex = f1_score(y_true_sex, y_pred_sex)

    # Identify female ADHD cases (ADHD=1, Sex=1)
    female_adhd_true = (y_true_adhd == 1) & (y_true_sex == 1)
    female_adhd_pred = (y_pred_adhd == 1) & (y_pred_sex == 1)

    # Calculate standard F1 score
    combined_f1 = (f1_adhd + f1_sex) / 2

    return combined_f1

# Setup cross-validation with stratification by both targets
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Combine both target variables for stratification
combined_target = y['ADHD_Outcome'].astype(str) + '_' + y['Sex_F'].astype(str)

# Define models to evaluate - with class imbalance handling
models = {
    'RidgeClassifier': RidgeClassifier(alpha=10, class_weight='balanced'),
    'LogisticRegression': LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        class_weight='balanced',
        random_state=42
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42
    )
}

# Initialize arrays to store fold-specific results
results = {}
fold_f1_scores = {}

# Function to evaluate models
def evaluate_models():
    best_adhd_score = 0
    best_sex_score = 0
    best_overall_score = 0
    best_adhd_model = None
    best_sex_model = None

    for model_name, model in models.items():
        print(f"Evaluating {model_name} with 5-fold CV...")
        fold_results = {'adhd_f1': [], 'sex_f1': []}

        for fold, (train_idx, test_idx) in enumerate(cv.split(train_imputed, combined_target)):
            X_train_fold, X_test_fold = train_imputed.iloc[train_idx], train_imputed.iloc[test_idx]
            y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

            # For ADHD model - use SMOTE to handle class imbalance
            adhd_pipeline = ImbPipeline([
                ('log_transform', ColumnTransformer([
                    ('log', FunctionTransformer(np.log1p),
                     [col for col in train_imputed.columns if col in log_features])
                ], remainder='passthrough')),
                ('scaler', StandardScaler()),
                ('pca', PCA(n_components=min(50, X_train_fold.shape[0]-1), random_state=42)),
                ('sampling', SMOTE(random_state=42)),
                ('classifier', clone(model))
            ])

            # Train ADHD model
            adhd_pipeline.fit(X_train_fold, y_train_fold['ADHD_Outcome'])
            y_pred_adhd = adhd_pipeline.predict(X_test_fold)
            adhd_f1 = f1_score(y_test_fold['ADHD_Outcome'], y_pred_adhd)

            # For Sex model - use SMOTE to handle class imbalance
            sex_pipeline = ImbPipeline([
                ('log_transform', ColumnTransformer([
                    ('log', FunctionTransformer(np.log1p),
                     [col for col in train_imputed.columns if col in log_features])
                ], remainder='passthrough')),
                ('scaler', StandardScaler()),
                ('pca', PCA(n_components=min(50, X_train_fold.shape[0]-1), random_state=42)),
                ('sampling', SMOTE(random_state=42)),
                ('classifier', clone(model))
            ])

            # Train Sex model
            sex_pipeline.fit(X_train_fold, y_train_fold['Sex_F'])
            y_pred_sex = sex_pipeline.predict(X_test_fold)
            sex_f1 = f1_score(y_test_fold['Sex_F'], y_pred_sex)

            # Double the weight of female ADHD cases in the evaluation
            female_adhd_mask = (y_test_fold['ADHD_Outcome'] == 1) & (y_test_fold['Sex_F'] == 1)
            female_adhd_correct = ((y_pred_adhd == 1) & (y_pred_sex == 1))[female_adhd_mask].sum()

            # Store results
            fold_results['adhd_f1'].append(adhd_f1)
            fold_results['sex_f1'].append(sex_f1)

            print(f"  Fold {fold+1} - ADHD F1: {adhd_f1:.4f}, Sex F1: {sex_f1:.4f}")

        # Calculate average scores
        adhd_mean = np.mean(fold_results['adhd_f1'])
        adhd_std = np.std(fold_results['adhd_f1'])
        sex_mean = np.mean(fold_results['sex_f1'])
        sex_std = np.std(fold_results['sex_f1'])
        overall = (adhd_mean + sex_mean) / 2

        print(f"Average scores for {model_name}:")
        print(f"  ADHD F1: {adhd_mean:.4f} (±{adhd_std:.4f})")
        print(f"  Sex F1: {sex_mean:.4f} (±{sex_std:.4f})")
        print(f"  Overall: {overall:.4f}")

        # Track best models
        if adhd_mean > best_adhd_score:
            best_adhd_score = adhd_mean
            best_adhd_model = model_name

        if sex_mean > best_sex_score:
            best_sex_score = sex_mean
            best_sex_model = model_name

        if overall > best_overall_score:
            best_overall_score = overall

        # Store results for later use
        results[model_name] = {
            'adhd_mean': adhd_mean,
            'adhd_std': adhd_std,
            'sex_mean': sex_mean,
            'sex_std': sex_std,
            'overall': overall
        }

    return best_adhd_model, best_sex_model

# Run model evaluation
best_adhd_model, best_sex_model = evaluate_models()
print(f"Best model for ADHD prediction: {best_adhd_model}")
print(f"Best model for Sex prediction: {best_sex_model}")



Setting up K-Fold Cross-Validation and Ensemble Model...
Evaluating RidgeClassifier with 5-fold CV...
  Fold 1 - ADHD F1: 0.6544, Sex F1: 0.4796
  Fold 2 - ADHD F1: 0.6909, Sex F1: 0.4862
  Fold 3 - ADHD F1: 0.6897, Sex F1: 0.5312
  Fold 4 - ADHD F1: 0.7362, Sex F1: 0.5026
  Fold 5 - ADHD F1: 0.7742, Sex F1: 0.4751
Average scores for RidgeClassifier:
  ADHD F1: 0.7091 (±0.0416)
  Sex F1: 0.4949 (±0.0204)
  Overall: 0.6020
Evaluating LogisticRegression with 5-fold CV...
  Fold 1 - ADHD F1: 0.6646, Sex F1: 0.4910
  Fold 2 - ADHD F1: 0.7107, Sex F1: 0.4528
  Fold 3 - ADHD F1: 0.6254, Sex F1: 0.4868
  Fold 4 - ADHD F1: 0.6965, Sex F1: 0.4541
  Fold 5 - ADHD F1: 0.7508, Sex F1: 0.4671
Average scores for LogisticRegression:
  ADHD F1: 0.6896 (±0.0424)
  Sex F1: 0.4703 (±0.0160)
  Overall: 0.5800
Evaluating RandomForest with 5-fold CV...
  Fold 1 - ADHD F1: 0.6749, Sex F1: 0.5263
  Fold 2 - ADHD F1: 0.3463, Sex F1: 0.5118
  Fold 3 - ADHD F1: 0.6133, Sex F1: 0.5358
  Fold 4 - ADHD F1: 0.4184,

In [17]:
# Section 4-- Create and Train Ensemble Models
print("\nTraining final ensemble models...")

# Create a special pipeline for ADHD prediction
adhd_pipeline = ImbPipeline([
    ('log_transform', ColumnTransformer([
        ('log', FunctionTransformer(np.log1p),
         [col for col in train_imputed.columns if col in log_features])
    ], remainder='passthrough')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=min(100, train_imputed.shape[0]-1), random_state=42)),
    ('sampling', SMOTE(random_state=42)),
    ('classifier', models[best_adhd_model])
])

# Create a special pipeline for Sex prediction
sex_pipeline = ImbPipeline([
    ('log_transform', ColumnTransformer([
        ('log', FunctionTransformer(np.log1p),
         [col for col in train_imputed.columns if col in log_features])
    ], remainder='passthrough')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=min(100, train_imputed.shape[0]-1), random_state=42)),
    ('sampling', SMOTE(random_state=42)),
    ('classifier', models[best_sex_model])
])

# Train the models on the full training data
print("Training ADHD model...")
adhd_pipeline.fit(train_imputed, y['ADHD_Outcome'])
print("Training Sex model...")
sex_pipeline.fit(train_imputed, y['Sex_F'])



Training final ensemble models...
Training ADHD model...
Training Sex model...


In [18]:
# Section 5--Generate Final Predictions and Submission
print("\nGenerating final predictions...")

# Predict on test data - set decision threshold to optimize F1 score
adhd_pred = adhd_pipeline.predict(test_imputed)
sex_pred = sex_pipeline.predict(test_imputed)

# Create submission DataFrame
submission = pd.DataFrame({
    'participant_id': test_id,
    'ADHD_Outcome': adhd_pred,
    'Sex_F': sex_pred
})

# Save submission file
submission_path = f'{RESULT_FOLDER}ensemble_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

print("\nAnalysis complete!")


Generating final predictions...
Submission saved to /content/drive/MyDrive/Colab Notebooks/results/ensemble_submission.csv

Analysis complete!
