In [12]:
# Data Cleaning and Concatenation
# -------------------------------
import pandas as pd

# Load the provided Excel files
categorical_metadata_path = 'widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx'
quantitative_metadata_path = 'widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx'
functional_path = 'widsdatathon2025/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv'
solutions_path = 'widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx'

categorical_metadata = pd.read_excel(categorical_metadata_path)
quantitative_metadata = pd.read_excel(quantitative_metadata_path)
function_data = pd.read_csv(functional_path)
solutions = pd.read_excel(solutions_path)

# Check the first few rows of each to inspect their structure
categorical_metadata.head(), quantitative_metadata.head(), function_data.head(), solutions.head()

# Merge the dataframes by 'participant_id'
merged_data = pd.merge(categorical_metadata, quantitative_metadata, on='participant_id', how='outer')
merged_data = pd.merge(merged_data, function_data, on='participant_id', how='outer')
merged_data = pd.merge(merged_data, solutions, on='participant_id', how='outer')

merged_data.head()
merged_data.to_csv('merged_data.csv', index=False)
merged_data.shape




(1213, 19930)

I was SUPER lazy and just deleted any columns with any NAs, a later problem if you ask me

In [15]:
# Getting rid of NaN values
# -------------------------

# Load in the megred data
merged_data = pd.read_csv('merged_data.csv')

# Check for NaN values in the merged data
merged_data.isnull().sum()

#List the columns with NaN values
columns_with_nan = merged_data.columns[merged_data.isna().any()].tolist()
columns_with_nan

# Drop columns with NaN values
merged_data = merged_data.dropna(axis=1)

# Check the dimensions of the data
merged_data.shape

# Convert the merged data to a CSV file
merged_data.to_csv('merged_data_cleaned.csv', index=False)



In [16]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the merged CSV file (assuming it's already merged)
merged_file_path = 'merged_data_cleaned.csv'  # Path to your merged file
merged_data = pd.read_csv(merged_file_path)

# Select features and target columns
target_columns = ['Sex_F', 'ADHD_Outcome']
feature_columns = [col for col in merged_data.columns if col not in target_columns and col != 'participant_id']

# Features and targets
X = merged_data[feature_columns]
y = merged_data[target_columns]

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the data
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((970, 19925), (243, 19925), (970, 2), (243, 2))

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

# --- Custom Dataset for Multi-Task Learning ---
class MultiTaskDataset(Dataset):
    def __init__(self, df, target_cols):
        self.y = df[target_cols].values.astype(np.float32)
        self.X = df.drop(columns=target_cols + ['participant_id']).values.astype(np.float32)  
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]  # y is a vector of targets

# --- Custom Weighted Loss Function ---
class WeightedBCELoss(nn.Module):
    def __init__(self):
        super(WeightedBCELoss, self).__init__()

    def forward(self, preds, target, sex, adhd):
        # Assign a weight of 2 for female ADHD cases (Sex_F=1, ADHD_Outcome=1)
        weights = torch.ones_like(target[:, 0])  # Default weight is 1, shape (batch_size,)
        weights[(sex == 1) & (adhd == 1)] = 2  # Apply weight of 2 for female ADHD cases
        
        # Compute the loss for each task
        loss_sex = nn.BCELoss(reduction='none')(preds[:, 0], target[:, 0])  # Loss for Sex_F
        loss_adhd = nn.BCELoss(reduction='none')(preds[:, 1], target[:, 1])  # Loss for ADHD_Outcome
        
        # Apply the weights to each task's loss
        weighted_loss_sex = loss_sex * weights  # Apply weight for Sex_F loss
        weighted_loss_adhd = loss_adhd * weights  # Apply weight for ADHD_Outcome loss

        return (weighted_loss_sex.mean() + weighted_loss_adhd.mean()) / 2

# --- Multi-Task Neural Network ---
class MultiTaskNN(nn.Module):
    def __init__(self, input_dim, dropout_rate=0.3):
        super(MultiTaskNN, self).__init__()
        # Shared layers with dropout
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(dropout_rate),  # Dropout added here
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(dropout_rate)  # Dropout added here
        )
        # Separate heads for each task
        self.sex_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
        self.adhd_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        shared_rep = self.shared(x)
        sex_pred = self.sex_head(shared_rep)
        adhd_pred = self.adhd_head(shared_rep)
        return torch.cat([sex_pred, adhd_pred], dim=1)  # output size [batch_size, 2]

# --- F1 Score Calculation with Weights ---
def weighted_f1_score(preds, target, sex, adhd):
    weights = np.ones_like(target[:, 0])  # Default weight is 1 for both tasks
    weights[(sex == 1) & (adhd == 1)] = 2  # Apply weight of 2 for female ADHD cases
    
    # Calculate F1 score for both tasks
    f1_sex = f1_score(target[:, 0], (preds[:, 0] > 0.5).astype(int), sample_weight=weights)
    f1_adhd = f1_score(target[:, 1], (preds[:, 1] > 0.5).astype(int), sample_weight=weights)
    
    return (f1_sex + f1_adhd) / 2

# --- Training Loop ---
def train_model(df, target_cols, num_epochs=100, batch_size=64, learning_rate=0.0001, weight_decay=0.01, dropout_rate=0.2):
    # Set device (this should be defined before using it)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Split the data into training and validation sets (80-20 split)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    
    # Create datasets and dataloaders
    train_dataset = MultiTaskDataset(train_df, target_cols)
    val_dataset = MultiTaskDataset(val_df, target_cols)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize the model
    input_dim = train_df.drop(columns=target_cols + ['participant_id']).shape[1]
    model = MultiTaskNN(input_dim, dropout_rate).to(device)
    
    # Initialize custom weighted loss
    criterion = WeightedBCELoss()

    # Define optimizer with weight decay (L2 regularization)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    best_val_f1 = 0  # For tracking best F1 score
    model_weights_per_epoch = []  # Store model weights at each epoch
    val_f1_scores = []  # Store F1 scores for validation

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            sex = y_batch[:, 0]  # Extract Sex_F values
            adhd = y_batch[:, 1]  # Extract ADHD_Outcome values
            
            optimizer.zero_grad()
            preds = model(X_batch)
            
            # Calculate weighted loss
            loss = criterion(preds, y_batch, sex, adhd)
            
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        avg_train_loss = np.mean(train_losses)
        
        # --- Validation ---
        model.eval()
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                outputs = model(X_batch)
                all_preds.append(outputs.cpu().numpy())
                all_targets.append(y_batch.cpu().numpy())
        
        all_preds = np.concatenate(all_preds, axis=0)
        all_targets = np.concatenate(all_targets, axis=0)
        
        # Calculate weighted F1 score for both tasks
        f1_score_avg = weighted_f1_score(all_preds, all_targets, all_targets[:, 0], all_targets[:, 1])
        
        print(f"Epoch {epoch+1}/{num_epochs} - Val Weighted F1 Score: {f1_score_avg:.4f}")
        
        # Store model weights and validation F1 score
        model_weights_per_epoch.append(model.state_dict())  # Save weights
        val_f1_scores.append(f1_score_avg)  # Save validation score

    # After training, find the epoch with the best validation F1 score
    best_epoch = np.argmax(val_f1_scores)  # Get the index of the best F1 score
    print(f"Best epoch: {best_epoch + 1} with F1 Score: {val_f1_scores[best_epoch]:.4f}")
    
    # Reload the best model weights
    model.load_state_dict(model_weights_per_epoch[best_epoch])  # Load best model weights
    return model






In [None]:
# Import necessary libraries
import pandas as pd

# Load the cleaned and merged dataset
merged_data_cleaned = pd.read_csv('merged_data_cleaned.csv') 

# Define the target columns
target_cols = ['Sex_F', 'ADHD_Outcome']  # Specify target columns


# Train the model using the train_model function
trained_model = train_model(merged_data_cleaned, target_cols, num_epochs=100, batch_size=64, learning_rate=0.00005, weight_decay=0.03, dropout_rate=0.3)




Epoch 1/100 - Val Weighted F1 Score: 0.4150
Epoch 2/100 - Val Weighted F1 Score: 0.6516
Epoch 3/100 - Val Weighted F1 Score: 0.6507
Epoch 4/100 - Val Weighted F1 Score: 0.6975
Epoch 5/100 - Val Weighted F1 Score: 0.7101
Epoch 6/100 - Val Weighted F1 Score: 0.7152
Epoch 7/100 - Val Weighted F1 Score: 0.7189
Epoch 8/100 - Val Weighted F1 Score: 0.7140
Epoch 9/100 - Val Weighted F1 Score: 0.7227
Epoch 10/100 - Val Weighted F1 Score: 0.7075
Epoch 11/100 - Val Weighted F1 Score: 0.4150
Epoch 12/100 - Val Weighted F1 Score: 0.6275
Epoch 13/100 - Val Weighted F1 Score: 0.7265
Epoch 14/100 - Val Weighted F1 Score: 0.7150
Epoch 15/100 - Val Weighted F1 Score: 0.4240
Epoch 16/100 - Val Weighted F1 Score: 0.6233
Epoch 17/100 - Val Weighted F1 Score: 0.7247
Epoch 18/100 - Val Weighted F1 Score: 0.1792
Epoch 19/100 - Val Weighted F1 Score: 0.4333
Epoch 20/100 - Val Weighted F1 Score: 0.2641
Epoch 21/100 - Val Weighted F1 Score: 0.3013
Epoch 22/100 - Val Weighted F1 Score: 0.2400
Epoch 23/100 - Val 

Realised that I forgot to scale or standardise the data in any way at all the first run, so here is attempt 2. It is better! Marginally

In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Load the fMRI data (replace with your correct path)
fmri_data_path = 'widsdatathon2025/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv'  # Replace with actual file path
fmri_data = pd.read_csv(fmri_data_path)

# Step 2: Separate the participant ID column and the fMRI data columns
participant_id_fmri = fmri_data.iloc[:, 0]  # Assuming the first column is participant ID
fmri_data_columns = fmri_data.iloc[:, 1:]  # All columns except the first (fMRI data)

# Step 3: Apply StandardScaler to the fMRI data
scaler = StandardScaler()
scaled_fmri_data = scaler.fit_transform(fmri_data_columns)

# Step 4: Convert the scaled data back to a DataFrame (optional, for easier handling)
scaled_fmri_data_df = pd.DataFrame(scaled_fmri_data, columns=fmri_data_columns.columns)

# Step 5: Concatenate the participant ID back with the scaled fMRI data
final_fmri_data = pd.concat([participant_id_fmri, scaled_fmri_data_df], axis=1)

# Step 6: Load the quantitative metadata (replace with the correct file path)
quant_data_path = 'widsdatathon2025/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx'  # Replace with actual file path
quant_data = pd.read_excel(quant_data_path)

# Step 7: Select only the necessary columns (keep all quantitative data columns)
selected_columns_quant = [
    'participant_id', 'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems',
    'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact',
    'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan'
]
quant_data_selected = quant_data[selected_columns_quant]

# Step 8: Drop columns with NA values (if any)
quant_data_cleaned = quant_data_selected.dropna(axis=1, how='any')  # Drop columns with any NA values

# Step 9: Apply Standard Scaling to the continuous data (excluding participant_id)
continuous_cols_quant = quant_data_cleaned.columns[1:]  # Exclude participant_id from scaling
scaler_quant = StandardScaler()
quant_data_cleaned[continuous_cols_quant] = scaler_quant.fit_transform(quant_data_cleaned[continuous_cols_quant])

# Step 10: Load the categorical metadata (replace with the correct file path)
categorical_data_path = 'widsdatathon2025/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx'  # Replace with actual file path
categorical_data = pd.read_excel(categorical_data_path)

# Step 11: Select the necessary columns for one-hot encoding
selected_columns_categorical = [
    'participant_id', 'Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Ethnicity',
    'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ'
]
categorical_data_selected = categorical_data[selected_columns_categorical]

# Step 12: Drop columns with NA values (if any)
categorical_data_cleaned = categorical_data_selected.dropna(axis=1, how='any')  # Drop columns with any NA values

# Step 13: One-hot encode categorical columns (excluding 'participant_id')
categorical_columns = categorical_data_cleaned.columns[1:]  # Exclude 'participant_id'
categorical_data_encoded = pd.get_dummies(categorical_data_cleaned, columns=categorical_columns)

# Step 14: Concatenate the cleaned fMRI data, the cleaned quantitative data, and the encoded categorical data
full_data = pd.merge(final_fmri_data, quant_data_cleaned, on='participant_id', how='inner')
full_data = pd.merge(full_data, categorical_data_encoded, on='participant_id', how='inner')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quant_data_cleaned[continuous_cols_quant] = scaler_quant.fit_transform(quant_data_cleaned[continuous_cols_quant])


In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Ensure the target data is loaded
target_file_path = 'widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx'  # Replace with the actual path to the target data
target_data = pd.read_excel(target_file_path)

# Step 2: Merge the feature data (already in full_data) with the target data based on participant_id
data = pd.merge(full_data, target_data, on='participant_id', how='inner')

# Step 3: Specify the target columns
target_cols = ['ADHD_Outcome', 'Sex_F']

# Step 4: Run the model training
trained_model = train_model(data, target_cols, num_epochs=100, batch_size=64, learning_rate=0.0001, weight_decay=0.01, dropout_rate=0.2)


Epoch 1/100 - Val Weighted F1 Score: 0.7441
Epoch 2/100 - Val Weighted F1 Score: 0.7276
Epoch 3/100 - Val Weighted F1 Score: 0.7445
Epoch 4/100 - Val Weighted F1 Score: 0.7474
Epoch 5/100 - Val Weighted F1 Score: 0.7548
Epoch 6/100 - Val Weighted F1 Score: 0.7606
Epoch 7/100 - Val Weighted F1 Score: 0.7609
Epoch 8/100 - Val Weighted F1 Score: 0.7668
Epoch 9/100 - Val Weighted F1 Score: 0.7650
Epoch 10/100 - Val Weighted F1 Score: 0.7579
Epoch 11/100 - Val Weighted F1 Score: 0.7680
Epoch 12/100 - Val Weighted F1 Score: 0.7667
Epoch 13/100 - Val Weighted F1 Score: 0.7667
Epoch 14/100 - Val Weighted F1 Score: 0.7666
Epoch 15/100 - Val Weighted F1 Score: 0.7692
Epoch 16/100 - Val Weighted F1 Score: 0.7744
Epoch 17/100 - Val Weighted F1 Score: 0.7688
Epoch 18/100 - Val Weighted F1 Score: 0.7633
Epoch 19/100 - Val Weighted F1 Score: 0.7771
Epoch 20/100 - Val Weighted F1 Score: 0.7820
Epoch 21/100 - Val Weighted F1 Score: 0.7807
Epoch 22/100 - Val Weighted F1 Score: 0.7455
Epoch 23/100 - Val 

Just for my sanity's sake, I tested it against a logistic regression (took like 5 mins with my lab partner, ChatGPT)
It's not better, thank god - xoxo Niamh

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# Step 1: Merge target variables with feature data (assuming full_data is already prepared and includes both features and participant_id)
target_file_path = 'widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx'  # Replace with the actual path to the target data
target_data = pd.read_excel(target_file_path)

# Merge the target data with the full dataset
data_with_targets = pd.merge(full_data, target_data, on='participant_id', how='inner')

# Step 2: Specify the target columns
target_cols = ['ADHD_Outcome', 'Sex_F']

# Step 3: Split the data into features (X) and targets (y)
X = data_with_targets.drop(columns=['participant_id'] + target_cols)  # Features
y = data_with_targets[target_cols]  # Targets

# Step 4: Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Standardize the feature data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Train Logistic Regression models for both target variables
model_adhd = LogisticRegression(max_iter=1000)
model_sex = LogisticRegression(max_iter=1000)

# Fit the models for ADHD_Outcome and Sex_F
model_adhd.fit(X_train_scaled, y_train['ADHD_Outcome'])
model_sex.fit(X_train_scaled, y_train['Sex_F'])

# Step 7: Make predictions on the test set
y_pred_adhd = model_adhd.predict(X_test_scaled)
y_pred_sex = model_sex.predict(X_test_scaled)

# Step 8: Calculate the F1 score for both targets with weighted averaging
# Apply weight of 2 for Female ADHD cases (ADHD_Outcome=1, Sex_F=1), else weight of 1

# Calculate weights for ADHD_Outcome
weights_adhd = (y_test['Sex_F'] == 1) & (y_test['ADHD_Outcome'] == 1)  # Weight of 2 for female ADHD cases
f1_adhd = f1_score(y_test['ADHD_Outcome'], y_pred_adhd, average='weighted', sample_weight=weights_adhd)

# Calculate weights for Sex_F
weights_sex = (y_test['Sex_F'] == 1) & (y_test['ADHD_Outcome'] == 1)  # Weight of 2 for female ADHD cases
f1_sex = f1_score(y_test['Sex_F'], y_pred_sex, average='weighted', sample_weight=weights_sex)

# Step 9: Average the F1 scores for both tasks
final_f1_score = (f1_adhd + f1_sex) / 2

# Step 10: Output the F1 scores
print(f"F1 Score for ADHD_Outcome: {f1_adhd:.4f}")
print(f"F1 Score for Sex_F: {f1_sex:.4f}")
print(f"Final Averaged F1 Score: {final_f1_score:.4f}")


F1 Score for ADHD_Outcome: 0.9725
F1 Score for Sex_F: 0.4865
Final Averaged F1 Score: 0.7295
