In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.data import Data

In [3]:

# Functional connectome matrices for the train and test splits.
# NOTE: these are absolute paths on the original author's machine.
train_connectome = pd.read_csv(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv'
)
test_connectome = pd.read_csv(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv'
)

# Training solutions workbook (provides the ADHD_Outcome and Sex_F columns
# that are used further down in this notebook).
solutions = pd.read_excel(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx'
)



In [4]:
# Survey tables for both splits, read from the Preprocessing folder.
# (The file names suggest imputed categorical + quantitative features -- verify.)
train_cat_quant = pd.read_csv(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Preprocessing/train_cat_quant_imputed.csv'
)
test_cat_quant = pd.read_csv(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Preprocessing/test_cat_quant_imputed.csv'
)

In [5]:
# Show only the first row to inspect the column layout of the training connectome.
train_connectome.head(n=1)

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221


In [8]:
# The NonGraphAE class has to be defined in this session before the trained
# model can be loaded: torch.load unpickles the whole module object, and
# pickle resolves the model's class by name.

class NonGraphAE(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back, with a ReLU after the first layer of each half and no
    activation on the latent or reconstruction outputs.

    Args:
        input_dim: Number of input features per sample (default 19900;
            presumably the flattened connectome columns -- confirm).
        hidden_dim: Width of the hidden layer in encoder and decoder.
        latent_dim: Size of the latent embedding.
        dropout: Stored on ``self.dropout`` only; no dropout layer is applied
            anywhere in this class.
    """

    def __init__(self, input_dim=19900, hidden_dim=512, latent_dim=128, dropout=0.2):
        # Zero-argument super(): the previous call named ``NonGraphDAE``, which
        # is not defined here and raised NameError on every instantiation.
        super().__init__()
        self.dropout = dropout

        # Encoder layers
        self.enc1 = nn.Linear(input_dim, hidden_dim)
        self.enc2 = nn.Linear(hidden_dim, latent_dim)

        # Decoder layers
        self.dec1 = nn.Linear(latent_dim, hidden_dim)
        self.dec2 = nn.Linear(hidden_dim, input_dim)

        # Activation shared by encoder and decoder
        self.relu = nn.ReLU()

    def encode(self, x):
        """Return the latent embedding of ``x`` (no activation on the output)."""
        x = self.relu(self.enc1(x))
        x = self.enc2(x)
        return x

    def decode(self, z):
        """Reconstruct the input space from latent codes ``z``."""
        x = self.relu(self.dec1(z))
        x = self.dec2(x)
        return x

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        z = self.encode(x)
        recon_x = self.decode(z)
        return recon_x, z

In [9]:
# Separate the participant IDs from the feature matrix; every column after
# the first one is treated as a connectome feature.
ids = train_connectome['participant_id']
connectome_features = train_connectome.iloc[:, 1:].values

# Model input as a float32 tensor
X = torch.tensor(connectome_features, dtype=torch.float32)

# Load the trained autoencoder. weights_only=False unpickles the complete
# module object, so the model's class must already be defined in this
# session, and the checkpoint must come from a trusted source (unpickling
# can execute arbitrary code).
autoencoder = torch.load(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/nongraph_autoencoder.pth',
    weights_only=False,
)

# Inference only: switch to eval mode
autoencoder.eval()

# Forward pass without gradient tracking; keep just the latent embeddings
with torch.no_grad():
    _, z = autoencoder(X)  # first output (the reconstruction) is discarded
    encoded_features = z.numpy()

# Assemble the embeddings into a frame with participant_id as the first column
latent_columns = [f"latent_{i}" for i in range(encoded_features.shape[1])]
encoded_df = pd.DataFrame(encoded_features, columns=latent_columns)
encoded_df.insert(0, "participant_id", ids)


In [10]:
# Quick look at the first five rows of latent embeddings.
encoded_df.head(n=5)

Unnamed: 0,participant_id,latent_0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,...,latent_118,latent_119,latent_120,latent_121,latent_122,latent_123,latent_124,latent_125,latent_126,latent_127
0,70z8Q2xdTXM3,-0.062868,0.349555,0.269102,-1.03725,-0.371457,2.687042,-0.938559,0.343819,1.529028,...,0.898878,-0.505424,-2.800898,2.112762,0.072746,0.112833,1.101317,-0.274609,1.780691,-1.200128
1,WHWymJu6zNZi,0.873152,-1.348689,-0.075249,1.310259,0.85024,1.172884,0.66062,-0.151064,-0.515864,...,1.000158,-1.448996,-4.479689,1.553875,-0.383005,-0.554659,-0.591001,0.558051,-0.607866,-0.977393
2,4PAQp1M6EyAo,-0.687261,-1.126725,0.717308,-0.064086,0.543015,1.060938,1.972849,-2.691068,-1.914503,...,-0.083528,-1.745576,-4.895132,1.453139,0.981051,-2.081374,-1.202411,1.856776,-0.835133,-0.634262
3,obEacy4Of68I,-1.045028,-1.901052,-0.776158,1.970024,-0.871627,2.87939,1.910207,-0.794189,0.619428,...,-0.938341,-1.881618,-1.757071,-0.234614,-1.012772,-1.371495,1.433907,1.109286,-1.380279,0.399602
4,s7WzzDcmDOhF,-1.022789,-0.238109,1.401311,0.484008,0.408498,3.093665,2.066896,-2.035163,-1.798792,...,0.893334,-2.721919,-3.318735,0.657356,-1.185613,-1.80262,-0.060335,0.008029,-1.619095,0.382254


In [11]:
# Attach the training solutions to each participant's embedding. A left join
# keeps every row of encoded_df even when no solution row matches.
encoded_df_merged = encoded_df.merge(
    right=solutions,
    how='left',
    on='participant_id',
)

In [12]:
# Persist the merged frame to the working directory. The row index is written
# too (pandas default), so it appears as an unnamed first column in the CSV.
encoded_df_merged.to_csv('merged_encoded_cat_quant.csv', index=True)

In [39]:
# Combine ADHD diagnosis and sex into a single four-level label.
# Rows that match none of the four combinations are left missing.
#
# default=None (not np.nan): the choices are strings, and a float default has
# no common dtype with them. NumPy 2.x raises TypeError for that mix, and
# older NumPy silently fills the literal string 'nan', which isna() does not
# detect. None yields an object array whose unmatched entries are true
# missing values in pandas.
solutions['outcome'] = np.select(
    [
        (solutions['ADHD_Outcome'] == 1) & (solutions['Sex_F'] == 1),  # ADHD and female
        (solutions['ADHD_Outcome'] == 0) & (solutions['Sex_F'] == 1),  # No ADHD and female
        (solutions['ADHD_Outcome'] == 1) & (solutions['Sex_F'] == 0),  # ADHD and male
        (solutions['ADHD_Outcome'] == 0) & (solutions['Sex_F'] == 0),  # No ADHD and male
    ],
    ['adhd_f', 'noadhd_f', 'adhd_m', 'noadhd_m'],
    default=None,
)