In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.data import Data

In [2]:

# Read in connectomes
test_connectome = pd.read_csv('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
train_connectome = pd.read_csv('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')

# Read in solutions 
solutions = pd.read_excel('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx')



In [None]:
# Read in survey data
train_cat_quant = pd.read_csv('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Preprocessing/train_cat_quant_imputed.csv')
test_cat_quant = pd.read_csv('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Preprocessing/test_cat_quant_imputed.csv')

In [4]:
train_connectome.head(1)

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221


### Non Graph Autoencoder

In [21]:
# Need to define the NonGraphDAE again to load trained model 

# Define NonGraphDAE class
class NonGraphAE(nn.Module):
    def __init__(self, input_dim=19900, hidden_dim=256, latent_dim=512, dropout=0.2):
        super(NonGraphAE, self).__init__()
        self.dropout = dropout

        # Encoder layers
        self.enc1 = nn.Linear(input_dim, hidden_dim)
        self.enc2 = nn.Linear(hidden_dim, latent_dim)
        
        # Decoder layers
        self.dec1 = nn.Linear(latent_dim, hidden_dim) 
        self.dec2 = nn.Linear(hidden_dim, input_dim)

        # Activation functions
        self.relu = nn.ReLU()
    
    def encode(self, x):
        x = self.relu(self.enc1(x))
        x = self.enc2(x)
        return x
    
    def decode(self, z):
        x = self.relu(self.dec1(z))
        x = self.dec2(x)
        return x
    
    def forward(self, x):
        z = self.encode(x)
        recon_x = self.decode(z)
        return recon_x, z

In [22]:
# Connectome data 
ids = train_connectome['participant_id']
connectome_features = train_connectome.iloc[:, 1:].values

# Convert to tensor
X = torch.tensor(connectome_features, dtype=torch.float32)

# Load autoencoder
autoencoder = torch.load('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/nongraph_autoencoder.pth', weights_only=False)

# Set to evaluation mode
autoencoder.eval()

# Get latent reps
with torch.no_grad():
    _, z = autoencoder(X)  # z is the latent embeddings
    encoded_features = z.numpy()

# Save encoded features with IDs
encoded_df = pd.DataFrame(encoded_features, columns=[f"latent_{i}" for i in range(encoded_features.shape[1])])
encoded_df["participant_id"] = ids
encoded_df = encoded_df[['participant_id'] + [col for col in encoded_df.columns if col != 'participant_id']]


In [23]:
encoded_df.head()

Unnamed: 0,participant_id,latent_0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,...,latent_502,latent_503,latent_504,latent_505,latent_506,latent_507,latent_508,latent_509,latent_510,latent_511
0,70z8Q2xdTXM3,0.410976,-0.501845,-0.364146,-0.491903,0.393063,0.252061,-1.805321,-0.230074,0.169664,...,-0.184198,0.609233,1.180916,-0.879422,-0.972005,-1.547123,-0.123295,-0.290636,0.592719,-1.227824
1,WHWymJu6zNZi,0.222915,-0.410652,-0.118329,-0.074499,0.187898,-0.457596,-0.914698,0.064583,-0.498138,...,-0.553056,-0.078315,0.460481,0.364299,0.431604,-0.076219,0.672366,0.059622,-0.239692,-0.398499
2,4PAQp1M6EyAo,0.746281,0.570709,-0.108126,0.026869,-0.682994,-0.268827,-1.313621,-0.651664,-0.1163,...,0.044046,0.331892,-0.121569,0.188966,1.127963,-0.26061,1.400629,1.148817,0.307426,-0.802827
3,obEacy4Of68I,-0.103012,-0.437822,0.132941,-0.06293,-0.225961,-0.258818,0.495031,-0.261689,1.457125,...,-0.296552,0.191983,0.263804,-0.448022,0.006136,-1.076978,0.745718,-0.082977,1.639096,-0.550482
4,s7WzzDcmDOhF,0.727864,-0.362274,-0.416239,0.214545,0.544973,0.524552,-2.167723,-0.185471,0.059395,...,0.206027,0.800041,-0.495047,1.444625,-1.412053,-0.399675,0.357171,0.081166,-1.454923,0.532057


In [20]:
encoded_df_merged = encoded_df.merge(solutions, on='participant_id', how='left')
encoded_df_cat_quant_merged = encoded_df_merged.merge(train_cat_quant, on='participant_id', how='left')

In [None]:
encoded_df_cat_quant_merged.to_csv('merged_ae_encoded_cat_quant.csv')

In [39]:
solutions['outcome'] = np.select(
    [
        (solutions['ADHD_Outcome'] == 1) & (solutions['Sex_F'] == 1),  # ADHD and female
        (solutions['ADHD_Outcome'] == 0) & (solutions['Sex_F'] == 1),  # No ADHD and female
        (solutions['ADHD_Outcome'] == 1) & (solutions['Sex_F'] == 0),  # ADHD and male
        (solutions['ADHD_Outcome'] == 0) & (solutions['Sex_F'] == 0),  # No ADHD and male
    ],
    ['adhd_f', 'noadhd_f', 'adhd_m', 'noadhd_m'], 
    default=np.nan  
)

### Denoising Non Graph Autoencoder

In [15]:
class NonGraphDAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, dropout=0.2):
        
        super(NonGraphDAE, self).__init__()
        self.dropout = dropout

        # Encoder layer: learns node embeddings, compressed node into latent_dim
        self.enc1 = nn.Linear(input_dim, hidden_dim)
        self.enc2 = nn.Linear(hidden_dim, latent_dim)
        
        # Decoder layer: Fully connected layers to predict edge weights between node pairs
        self.dec1 = nn.Linear(latent_dim, hidden_dim) 
        self.dec2 = nn.Linear(hidden_dim, input_dim)

        # Activation functions
        self.relu = nn.ReLU() # non-linearity

    def add_noise(self, x, noise_factor=0.05):
        
        noisy_x = x.clone() # Clone to not modify original
        mask = torch.rand(x.size(), device = x.device) > self.dropout # Randomly drop edge weights
        noise = torch.randn_like(x) * noise_factor
        noisy_x[mask] += noise[mask]

        return noisy_x
    
    # Encoder
    def encode(self, x):

        x = self.relu(self.enc1(x))# Encode the graph to latent node embeddings using GCN layers
        x = self.enc2(x)
        return x
    
    # Decoder
    def decode(self, z):

        x = self.relu(self.dec1(z))
        x = self.dec2(x)
        return x
    
    # Combines all steps: noise, encode, and decode
    # Returns reconstructed edge weights nad z a tensor object of latent node embeddings
    def forward(self, x):

        # Add noise 
        noisy_x = self.add_noise(x) if self.training else x
        # Encode noisy graph to latent embeddings
        z = self.encode(noisy_x)
        # Decode to reconstruct clean edge weights
        recon_x = self.decode(z)

        return recon_x, z

In [16]:
# Connectome data 
ids = train_connectome['participant_id']
connectome_features = train_connectome.iloc[:, 1:].values

# Convert to tensor
X = torch.tensor(connectome_features, dtype=torch.float32)

# Load autoencoder
autoencoder = torch.load('/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/nongraph_dae.pth', map_location=torch.device('cpu'), weights_only=False)

# Set to evaluation mode
autoencoder.eval()

# Get latent reps
with torch.no_grad():
    _, z = autoencoder(X)  # z is the latent embeddings
    encoded_features = z.numpy()

# Save encoded features with IDs
encoded_df = pd.DataFrame(encoded_features, columns=[f"latent_{i}" for i in range(encoded_features.shape[1])])
encoded_df["participant_id"] = ids
encoded_df = encoded_df[['participant_id'] + [col for col in encoded_df.columns if col != 'participant_id']]

AttributeError: 'collections.OrderedDict' object has no attribute 'eval'