1. Load Data
2. Handling Missing Values
3. Handling Outliers for Numerical Vars (to-do)
4. One Hot Encoding for Categorical Vars
5. PCA for MRI columns
6. Feature Selection Using Lasso
7. Modeling
8. ...

In [60]:
import numpy as np 
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from gc import collect
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load Data

In [61]:
# Notebook from: https://www.kaggle.com/code/anugarania/wids-2025-eda-step-by-step-workshop-atx-nyc
# Function to load all data
def get_feats(mode='train'):
    
    # Load quantitative metadata
    feats = pd.read_excel(f"../Dataset/widsdatathon2025/{mode}/{mode}_QUANTITATIVE_METADATA.xlsx")
    
    # Load categorical metadata with the correct filename depending on mode
    if mode == 'TRAIN':
        cate = pd.read_excel(f"../Dataset/widsdatathon2025/{mode}/{mode}_CATEGORICAL_METADATA.xlsx")
    else:
        cate = pd.read_excel(f"../Dataset/widsdatathon2025/{mode}/{mode}_CATEGORICAL.xlsx")
    
    # Merge categorical data
    feats = feats.merge(cate, on='participant_id', how='left')
    
    # Load functional connectome matrices
    func = pd.read_csv(f"../Dataset/widsdatathon2025/{mode}/{mode}_FUNCTIONAL_CONNECTOME_MATRICES.csv")
    feats = feats.merge(func, on='participant_id', how='left')
    
    # If training data, merge with solution file
    if mode == 'TRAIN':
        solution = pd.read_excel("../Dataset/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx")
        feats = feats.merge(solution, on='participant_id', how='left')
    
    return feats

In [62]:
# Load data
train = get_feats(mode='TRAIN')
test = get_feats(mode='TEST')

# Set index
train.set_index('participant_id', inplace=True)
test.set_index('participant_id', inplace=True)

# Define targets and features
targets = ['ADHD_Outcome', 'Sex_F']

sub = pd.read_excel('../Dataset/widsdatathon2025/SAMPLE_SUBMISSION.xlsx')
y = pd.read_excel('../Dataset/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx')

In [63]:
print("\nDataset Overview:")
print(f"Training data: {train.shape[0]} participants, {train.shape[1]} features")
print(f"Test data: {test.shape[0]} participants, {test.shape[1]} features")


Dataset Overview:
Training data: 1213 participants, 19929 features
Test data: 304 participants, 19927 features


# Data Pre-processing

In [64]:
## Handling Missing Values
def check_missing(dataset):
    missing = dataset.isnull().sum()
    missing_percent = 100 * missing / len(dataset)
    missing_df = pd.DataFrame({
        'Missing Values': missing,
        'Percentage': missing_percent
        })
    # Display features with missing values
    missing_features = missing_df[missing_df['Missing Values'] > 0].sort_values('Percentage', ascending=False)
    print(f"\nFeatures with missing values:")
    return missing_features

In [65]:
# Check for missing values in the training data
check_missing(train)


Features with missing values:


Unnamed: 0,Missing Values,Percentage
MRI_Track_Age_at_Scan,360,29.678483
PreInt_Demos_Fam_Child_Ethnicity,11,0.906843


In [66]:
def fill_missing(dataset, columns, impute_method, fill_value=None):
    imputer = SimpleImputer(strategy=impute_method, fill_value=fill_value)
    imputer.fit(dataset[columns])
    dataset[columns] = imputer.transform(dataset[columns])

In [67]:
fill_missing(train, ['MRI_Track_Age_at_Scan'], 'mean') # training set, numeric var
fill_missing(train, ['PreInt_Demos_Fam_Child_Ethnicity'], 'constant', 3) # 3=unknown

In [68]:
# Check for missing values in the test data
check_missing(test)


Features with missing values:


Unnamed: 0,Missing Values,Percentage
Barratt_Barratt_P2_Occ,42,13.815789
Barratt_Barratt_P2_Edu,36,11.842105
SDQ_SDQ_Difficulties_Total,30,9.868421
SDQ_SDQ_Prosocial,30,9.868421
SDQ_SDQ_Peer_Problems,30,9.868421
SDQ_SDQ_Internalizing,30,9.868421
SDQ_SDQ_Hyperactivity,30,9.868421
SDQ_SDQ_Generating_Impact,30,9.868421
SDQ_SDQ_Emotional_Problems,30,9.868421
SDQ_SDQ_Externalizing,30,9.868421


In [69]:
# Categorical columns - use a new category "unknown", if not existing,  to fill in missing data
categorical_columns = [
    'Barratt_Barratt_P1_Edu', 
    'Barratt_Barratt_P2_Edu',
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Occ'
    # 'PreInt_Demos_Fam_Child_Race'，
]

fill_missing(test, categorical_columns, 'constant', 99) # create a new category 99==unknwn
fill_missing(test, ['PreInt_Demos_Fam_Child_Race'], 'constant', 10) # 10=unknown
fill_missing(test, ['PreInt_Demos_Fam_Child_Ethnicity'], 'constant', 3) # 3=unknown


# Numerical columns - we'll use the average value to fill in missing data
numerical_columns = [
    'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Prosocial',
    'SDQ_SDQ_Peer_Problems',
    'SDQ_SDQ_Internalizing',
    'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Conduct_Problems',
    'APQ_P_APQ_P_PP',
    'APQ_P_APQ_P_PM',
    'APQ_P_APQ_P_OPD',
    'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_CP',
    'ColorVision_CV_Score',
    'EHQ_EHQ_Total'
]


fill_missing(test, numerical_columns, 'mean')

In [70]:
check_missing(train)
check_missing(test)


Features with missing values:

Features with missing values:


Unnamed: 0,Missing Values,Percentage


In [71]:
# Get dummies for categorical variables
cate_vars = [
    'Barratt_Barratt_P1_Edu', 
    'Barratt_Barratt_P2_Edu',
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Occ',
    'PreInt_Demos_Fam_Child_Race',
    'PreInt_Demos_Fam_Child_Ethnicity',
    'Basic_Demos_Study_Site',
    'MRI_Track_Scan_Location'
]

train[cate_vars].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1213 entries, UmrK0vMLopoR to k8HhHnnu2wmt
Data columns (total 8 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Barratt_Barratt_P1_Edu            1213 non-null   int64  
 1   Barratt_Barratt_P2_Edu            1213 non-null   int64  
 2   Barratt_Barratt_P1_Occ            1213 non-null   int64  
 3   Barratt_Barratt_P2_Occ            1213 non-null   int64  
 4   PreInt_Demos_Fam_Child_Race       1213 non-null   int64  
 5   PreInt_Demos_Fam_Child_Ethnicity  1213 non-null   float64
 6   Basic_Demos_Study_Site            1213 non-null   int64  
 7   MRI_Track_Scan_Location           1213 non-null   int64  
dtypes: float64(1), int64(7)
memory usage: 85.3+ KB


In [72]:
# Convert categorical variables to dummies
train_encoded = pd.get_dummies(train, columns=cate_vars, dtype='int64', drop_first=True)
test_encoded = pd.get_dummies(test, columns=cate_vars, dtype='int64', drop_first=True)

In [73]:
print('dummies in trainset:')
print([var for var in train_encoded.columns if not var in train.columns])

print('dummies in testset:')
print([var for var in test_encoded.columns if not var in test.columns])

dummies in trainset:
['Barratt_Barratt_P1_Edu_3', 'Barratt_Barratt_P1_Edu_6', 'Barratt_Barratt_P1_Edu_9', 'Barratt_Barratt_P1_Edu_12', 'Barratt_Barratt_P1_Edu_15', 'Barratt_Barratt_P1_Edu_18', 'Barratt_Barratt_P1_Edu_21', 'Barratt_Barratt_P2_Edu_3', 'Barratt_Barratt_P2_Edu_6', 'Barratt_Barratt_P2_Edu_9', 'Barratt_Barratt_P2_Edu_12', 'Barratt_Barratt_P2_Edu_15', 'Barratt_Barratt_P2_Edu_18', 'Barratt_Barratt_P2_Edu_21', 'Barratt_Barratt_P1_Occ_5', 'Barratt_Barratt_P1_Occ_10', 'Barratt_Barratt_P1_Occ_15', 'Barratt_Barratt_P1_Occ_20', 'Barratt_Barratt_P1_Occ_25', 'Barratt_Barratt_P1_Occ_30', 'Barratt_Barratt_P1_Occ_35', 'Barratt_Barratt_P1_Occ_40', 'Barratt_Barratt_P1_Occ_45', 'Barratt_Barratt_P2_Occ_5', 'Barratt_Barratt_P2_Occ_10', 'Barratt_Barratt_P2_Occ_15', 'Barratt_Barratt_P2_Occ_20', 'Barratt_Barratt_P2_Occ_25', 'Barratt_Barratt_P2_Occ_30', 'Barratt_Barratt_P2_Occ_35', 'Barratt_Barratt_P2_Occ_40', 'Barratt_Barratt_P2_Occ_45', 'PreInt_Demos_Fam_Child_Race_1', 'PreInt_Demos_Fam_Child_R

In [74]:
# check if test and train have same columns
[var for var in train_encoded.columns if not var in test_encoded.columns]

['ADHD_Outcome',
 'Sex_F',
 'Barratt_Barratt_P1_Edu_3',
 'Barratt_Barratt_P1_Edu_6',
 'Barratt_Barratt_P1_Edu_9',
 'Barratt_Barratt_P1_Edu_12',
 'Barratt_Barratt_P1_Edu_15',
 'Barratt_Barratt_P1_Edu_18',
 'Barratt_Barratt_P1_Edu_21',
 'Barratt_Barratt_P2_Edu_3',
 'Barratt_Barratt_P2_Edu_6',
 'Barratt_Barratt_P2_Edu_9',
 'Barratt_Barratt_P2_Edu_12',
 'Barratt_Barratt_P2_Edu_15',
 'Barratt_Barratt_P2_Edu_18',
 'Barratt_Barratt_P2_Edu_21',
 'Barratt_Barratt_P1_Occ_5',
 'Barratt_Barratt_P1_Occ_10',
 'Barratt_Barratt_P1_Occ_15',
 'Barratt_Barratt_P1_Occ_20',
 'Barratt_Barratt_P1_Occ_25',
 'Barratt_Barratt_P1_Occ_30',
 'Barratt_Barratt_P1_Occ_35',
 'Barratt_Barratt_P1_Occ_40',
 'Barratt_Barratt_P1_Occ_45',
 'Barratt_Barratt_P2_Occ_5',
 'Barratt_Barratt_P2_Occ_10',
 'Barratt_Barratt_P2_Occ_15',
 'Barratt_Barratt_P2_Occ_20',
 'Barratt_Barratt_P2_Occ_25',
 'Barratt_Barratt_P2_Occ_30',
 'Barratt_Barratt_P2_Occ_35',
 'Barratt_Barratt_P2_Occ_40',
 'Barratt_Barratt_P2_Occ_45',
 'PreInt_Demos_Fam_Ch

In [75]:
train_encoded['Barratt_Barratt_P1_Edu_3'].value_counts()

Barratt_Barratt_P1_Edu_3
0    1208
1       5
Name: count, dtype: int64

In [76]:
test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304 entries, Cfwaf5FX7jWK to dQJXfyRazknD
Columns: 19966 entries, EHQ_EHQ_Total to MRI_Track_Scan_Location_4
dtypes: float64(19918), int64(48)
memory usage: 46.3+ MB


In [77]:
train_encoded.to_parquet('../Dataset/widsdatathon2025/preprocessed_train.parquet')
test_encoded.to_parquet('../Dataset/widsdatathon2025/preprocessed_test.parquet')

# Feature Selection with Lasso (w/out PCA on MRI data)

In [59]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

def run_lasso_multioutput(X: pd.DataFrame, y: pd.DataFrame, alpha_values=None, cv=5):
    """
    Runs Lasso regression for feature selection on high-dimensional multi-output data.

    Parameters:
        X (pd.DataFrame): Feature matrix (19000+ features).
        y (pd.DataFrame): Multi-output target variable (2 columns).
        alpha_values (list, optional): List of alpha values for LassoCV. Default is None (auto-selection).
        cv (int): Number of cross-validation folds (default: 5).

    Returns:
        selected_features (list): List of selected feature names (union across outputs).
        model (MultiOutputRegressor): Trained MultiOutput Lasso model.
    """

    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Default alpha values if not provided
    if alpha_values is None:
        alpha_values = np.logspace(-4, 1, 50)  # Log scale for better tuning

    # MultiOutput Lasso with Cross-Validation
    base_lasso = LassoCV(alphas=alpha_values, cv=cv, max_iter=5000, n_jobs=-1)
    lasso_multi = MultiOutputRegressor(base_lasso, n_jobs=-1)  # Runs Lasso for each target column

    # Fit the model
    lasso_multi.fit(X_scaled, y)

    # Extract selected features (union across target variables)
    selected_features = list(X.columns[np.any([estimator.coef_ != 0 for estimator in lasso_multi.estimators_], axis=0)])

    print(f"Selected {len(selected_features)} features out of {X.shape[1]} across both targets.")
    
    return selected_features, lasso_multi


In [85]:
# Separate features and multi-output target variable

X = train_encoded.drop(columns=['ADHD_Outcome', 'Sex_F'])  
y = train_encoded[['ADHD_Outcome', 'Sex_F']]

# Run Lasso Feature Selection for multi-output
selected_features, trained_lasso_multi = run_lasso_multioutput(X, y)

# Display top selected features
print("Top selected features:", selected_features)


TypeError: 'list' object is not callable

In [86]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def run_lasso_tuned(X: pd.DataFrame, y: pd.DataFrame, alpha_values=None, cv=5):
    """
    Runs Lasso regression with alpha tuning using cross-validation.

    Parameters:
        X (pd.DataFrame): Feature matrix (19000+ features).
        y (pd.DataFrame): Multi-output target variable.
        alpha_values (list, optional): List of alpha values for LassoCV. Default is None (auto-selection).
        cv (int): Number of cross-validation folds.

    Returns:
        best_alpha_values (list): Best alpha values for each target variable.
        selected_features (list): List of selected feature names (union across outputs).
        model (MultiOutputRegressor): Trained Lasso model.
    """

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Define alpha range if not provided
    if alpha_values is None:
        alpha_values = np.logspace(-4, 1, 50)  # Log scale for better coverage

    # MultiOutput Lasso with Cross-Validation for Alpha Selection
    base_lasso = LassoCV(alphas=alpha_values, cv=cv, max_iter=5000, n_jobs=-1)
    lasso_multi = MultiOutputRegressor(base_lasso, n_jobs=-1)

    # Train the model
    lasso_multi.fit(X_scaled, y)

    # Extract best alpha values per target
    best_alpha_values = [estimator.alpha_ for estimator in lasso_multi.estimators_]

    # Extract selected features (union across outputs)
    selected_features = list(X.columns[np.any([estimator.coef_ != 0 for estimator in lasso_multi.estimators_], axis=0)])

    print(f"Best alpha values per target: {best_alpha_values}")
    print(f"Selected {len(selected_features)} features out of {X.shape[1]} across all targets.")

    return best_alpha_values, selected_features, lasso_multi


In [87]:
# Separate features and multi-output target variable
X = train.drop(columns=['ADHD_Outcome', 'Sex_F'])  # Replace with actual column names
y = train[['ADHD_Outcome', 'Sex_F']]

# Run Lasso with alpha tuning
best_alphas, selected_features, trained_lasso = run_lasso_tuned(X, y)

# Display top selected features
print("Top selected features:", selected_features)

TypeError: 'list' object is not callable

# Dimension Reduction for MRI Data

In [81]:
mri = pd.read_csv(f"../Dataset/widsdatathon2025/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [82]:
mri = mri.set_index('participant_id')

In [83]:
# ! pip install kneed

In [84]:
from kneed import KneeLocator

def apply_optimal_pca(df: pd.DataFrame, variance_threshold=0.95):
    """
    Performs PCA transformation with the optimal number of components and returns a new DataFrame.

    Parameters:
        df (pd.DataFrame): The input DataFrame with numerical features.
        variance_threshold (float): The variance threshold for PCA selection (default: 95%).

    Returns:
        pca_df (pd.DataFrame): DataFrame with transformed PCA features.
        pca_model (PCA): The trained PCA model.
        optimal_components (int): The optimal number of components.
    """

    # Standardize the data (important for PCA)
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Fit PCA to determine variance
    pca = PCA().fit(df_scaled)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Use KneeLocator to find the elbow point for optimal component selection
    kneedle = KneeLocator(range(1, len(cumulative_variance) + 1), cumulative_variance, curve="concave", direction="increasing")
    optimal_components = kneedle.knee

    if optimal_components is None:
        # Fallback: Use the variance threshold method if elbow detection fails
        optimal_components = np.argmax(cumulative_variance >= variance_threshold) + 1
        print(f"KneeLocator failed, using variance threshold: {optimal_components} components.")

    print(f"Optimal number of PCA components: {optimal_components}")

    # Apply PCA with the selected number of components
    pca_final = PCA(n_components=optimal_components)
    df_pca_transformed = pca_final.fit_transform(df_scaled)

    # Convert transformed data into a DataFrame with original row indices
    pca_df = pd.DataFrame(df_pca_transformed, index=df.index, columns=[f"PC{i+1}" for i in range(optimal_components)])

    return pca_df, pca_final, optimal_components


In [85]:

# Apply PCA and get the reduced DataFrame
pca_transformed_mri, optimal_pcs, pca_model = apply_optimal_pca(mri)

# Display new PCA DataFrame
print(pca_transformed_mri.info())


Optimal number of PCA components: 497
<class 'pandas.core.frame.DataFrame'>
Index: 1213 entries, 70z8Q2xdTXM3 to 9gpepMI9sj5q
Columns: 497 entries, PC1 to PC497
dtypes: float64(497)
memory usage: 4.6+ MB
None


In [86]:
cate = pd.read_excel(f"../Dataset/widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx")
feats = pd.read_excel(f"../Dataset/widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx")

In [88]:
pca_df = cate.merge(feats, on='participant_id', how='left').merge(pca_transformed_mri, on='participant_id', how='left')

In [92]:
solution = pd.read_excel("../Dataset/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx")
pca_df = pca_df.merge(solution, on='participant_id', how='left')

In [98]:
fill_missing(pca_df, ['MRI_Track_Age_at_Scan'], 'mean') # training set, numeric var
fill_missing(pca_df, ['PreInt_Demos_Fam_Child_Ethnicity'], 'constant', 3) # 3=unknown

In [100]:
check_missing(pca_df)


Features with missing values:


Unnamed: 0,Missing Values,Percentage


In [102]:
pca_df.set_index('participant_id', inplace=True)
pca_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1213 entries, UmrK0vMLopoR to k8HhHnnu2wmt
Columns: 526 entries, Basic_Demos_Enroll_Year to Sex_F
dtypes: float64(500), int64(26)
memory usage: 4.9+ MB


In [103]:
pca_df.to_parquet('../Dataset/widsdatathon2025/preprocessed_train_pca.parquet')