In [None]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
from tensorflow.keras import layers, models
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool, GATConv, NNConv
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
import networkx as nx
from torch_geometric.loader import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset
from torch.nn import Sequential, Linear, ReLU
from torchvision import datasets, transforms

In [None]:
# Single place to change when the data lives somewhere else on disk.
DATA_DIR = '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025'

# Read in connectomes (one row per participant; `participant_id` plus the
# flattened connectome values).
test_connectome = pd.read_csv(f'{DATA_DIR}/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
train_connectome = pd.read_csv(f'{DATA_DIR}/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
# Read in solutions (training targets, joined later on `participant_id`).
solutions = pd.read_excel(f'{DATA_DIR}/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx')

In [None]:
# Report which GPUs TensorFlow can see (informational only).
print(tf.config.list_physical_devices('GPU'))

# Pick the PyTorch device: Apple's MPS backend when available, otherwise CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

## Graph Construction

In [None]:
def create_graph_construct(df, num_regions=200):
    """Turn each row of a flattened connectome table into a PyG ``Data`` graph.

    Every graph has ``num_regions`` nodes with one-hot (identity) node
    features, one edge per unordered region pair ``(i, j)`` with ``i < j``,
    and the connectivity value of that pair as a single edge attribute.

    The feature columns are assumed to list the strict upper triangle of the
    connectivity matrix row by row, i.e. (0,1), (0,2), ..., (0,n-1), (1,2), ...
    This is the order produced by ``np.triu_indices(num_regions, k=1)``.

    NOTE(review): edges are emitted in the ``i -> j`` (``i < j``) direction
    only, so message passing is one-directional. Confirm this is intended.

    Args:
        df: DataFrame with a ``participant_id`` column plus at least
            ``num_regions * (num_regions - 1) / 2`` numeric feature columns.
            Extra trailing feature columns are ignored.
        num_regions: Number of brain regions (graph nodes).

    Returns:
        List of ``Data`` objects, one per row of ``df`` in row order, each
        carrying the row's ``participant_id`` as an attribute.

    Raises:
        KeyError: If ``df`` has no ``participant_id`` column.
        ValueError: If ``df`` has fewer feature columns than region pairs.
    """
    num_edges = num_regions * (num_regions - 1) // 2

    participant_ids = df['participant_id'].tolist()
    # Select the feature columns by name, not by position, so the result does
    # not depend on `participant_id` being the first column.
    weights = df.drop(columns=['participant_id']).to_numpy(dtype=np.float64)
    if weights.shape[1] < num_edges:
        raise ValueError(
            f"Expected at least {num_edges} connectome columns for "
            f"{num_regions} regions, got {weights.shape[1]}"
        )

    # Graph topology and node features are identical for every participant,
    # so build them once and hand each graph its own copy.
    i_idx, j_idx = np.triu_indices(num_regions, k=1)
    edge_index_template = torch.tensor(np.stack([i_idx, j_idx]), dtype=torch.long)
    x_template = torch.eye(num_regions, dtype=torch.float)

    graph_list = []
    for participant_id, row_weights in zip(participant_ids, weights):
        # The flattened row is already in upper-triangle order, so it maps
        # one-to-one onto the edges; no dense adjacency matrix is needed.
        edge_attr = torch.tensor(row_weights[:num_edges], dtype=torch.float).unsqueeze(1)

        graph_data = Data(
            x=x_template.clone(),
            edge_index=edge_index_template.clone(),
            edge_attr=edge_attr,
            num_nodes=num_regions,
        )
        graph_data.participant_id = participant_id
        graph_list.append(graph_data)

    return graph_list

In [None]:
# Build one graph per training participant.
train_graph_list = create_graph_construct(df=train_connectome, num_regions=200)

## GNN Autoencoder (NNConv)

In [None]:
# with NNConv (edge-conditioned convolution)
class ConnectomeNNGNN(nn.Module):
    """Edge-attribute autoencoder built from two NNConv layers.

    The encoder runs two NNConv layers whose filters are generated from the
    scalar edge attribute by small MLPs. The decoder predicts one value per
    edge from the concatenated embeddings of the edge's two endpoint nodes.
    """

    def __init__(self, num_nodes, hidden_dim):
        """
        Args:
            num_nodes: Width of the input node features (``conv1`` input channels).
            hidden_dim: Width of the node embeddings produced by both conv layers.
        """
        super().__init__()

        # Edge network for conv1: edge_attr -> [num_edges, num_nodes * hidden_dim]
        edge_net_in = Sequential(
            Linear(1, 128),
            ReLU(),
            Linear(128, num_nodes * hidden_dim),
        )
        # Edge network for conv2: edge_attr -> [num_edges, hidden_dim * hidden_dim]
        edge_net_hidden = Sequential(
            Linear(1, 128),
            ReLU(),
            Linear(128, hidden_dim * hidden_dim),
        )

        # Encoder
        self.conv1 = NNConv(num_nodes, hidden_dim, edge_net_in)
        self.conv2 = NNConv(hidden_dim, hidden_dim, edge_net_hidden)

        # Decoder: a pair of endpoint embeddings -> one scalar per edge
        self.decoder = Sequential(
            Linear(2 * hidden_dim, 128),
            ReLU(),
            Linear(128, 1),
        )

    def forward(self, x, edge_index, edge_attr, batch):
        """Encode the nodes, then reconstruct one value per edge.

        ``batch`` is accepted but not used by this method.

        Returns:
            Tensor with one row per edge and a single column.
        """
        # Encoder
        hidden = F.relu(self.conv1(x, edge_index, edge_attr))
        node_emb = self.conv2(hidden, edge_index, edge_attr)

        # Decoder: concatenate the embeddings of each edge's two endpoints
        src, dst = edge_index[0], edge_index[1]
        pair_features = torch.cat((node_emb[src], node_emb[dst]), dim=-1)
        return self.decoder(pair_features)

    def loss(self, reconstructed_edge_attr, true_edge_attr):
        """Mean squared error between reconstructed and true edge attributes."""
        return F.mse_loss(reconstructed_edge_attr, true_edge_attr)

In [None]:
# Mini-batches of 32 graphs, reshuffled on every pass over the data.
loader = DataLoader(
    dataset=train_graph_list,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Hyperparameters for the NNConv autoencoder
num_nodes = 200
hidden_dim = 256

# Use CUDA when present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = ConnectomeNNGNN(num_nodes=num_nodes, hidden_dim=hidden_dim)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: minimise edge-attribute reconstruction error.
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for graphs in loader:
        graphs = graphs.to(device)
        optimizer.zero_grad()

        # Forward pass, then compare reconstructed edges with the true ones.
        predicted = model(graphs.x, graphs.edge_index, graphs.edge_attr, graphs.batch)
        batch_loss = model.loss(predicted, graphs.edge_attr)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        # Weight each batch by its graph count so the epoch figure is a
        # per-graph average even when the last batch is smaller.
        total_loss += batch_loss.item() * graphs.num_graphs

    avg_loss = total_loss / len(loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

model.eval()


In [None]:
# Persist the trained autoencoder weights (state dict only).
torch.save(
    obj=model.state_dict(),
    f="/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/grapg_gnn_ae.pth",
)

## GAT Autoencoder
Using `train_graph_list`.

`heads`: number of attention heads.

In [None]:
# Preprocess: every column except the participant ID, as a float32 tensor.
features = train_connectome.drop('participant_id', axis=1).to_numpy()
x = torch.tensor(features, dtype=torch.float32)
print("x shape:", x.size())

In [None]:
# with GAT (graph attention)
class ConnectomeGNN(nn.Module):
    """Graph-level encoder: two GATConv layers, mean pooling, linear head.

    Produces one 64-dimensional vector per graph in the batch.
    """

    def __init__(self, num_nodes, hidden_dim, heads=1):
        """
        Args:
            num_nodes: Width of the input node features.
            hidden_dim: Per-head output width of each attention layer.
            heads: Number of attention heads in the first layer; their
                outputs are concatenated before the second layer.
        """
        super().__init__()
        self.gat1 = GATConv(
            in_channels=num_nodes,
            out_channels=hidden_dim,
            heads=heads,
            concat=True,
        )
        self.gat2 = GATConv(
            in_channels=hidden_dim * heads,
            out_channels=hidden_dim,
            heads=1,
            concat=True,
        )

        self.fc = nn.Linear(in_features=hidden_dim, out_features=64)

    def forward(self, x, edge_index, edge_attr, batch):
        """Embed each graph of the batch.

        ``edge_attr`` is accepted but not used; attention is computed from
        node features and ``edge_index`` only.
        """
        attended = F.elu(self.gat1(x, edge_index))
        node_emb = self.gat2(attended, edge_index)
        # Average node embeddings within each graph of the batch.
        graph_emb = global_mean_pool(node_emb, batch)
        return self.fc(graph_emb)

In [None]:
# GAT encoder hyperparameters
num_nodes = 200
hidden_dim = 64

model = ConnectomeGNN(num_nodes, hidden_dim)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# NOTE(review): no training step for this ConnectomeGNN instance is visible
# between its construction and this cell, so these embeddings appear to come
# from randomly initialised weights. Confirm that is intended.
model.eval()

# The training `loader` shuffles its graphs, which would pair embeddings with
# the wrong participant IDs below. Iterate in dataset order instead so that
# row k of the embeddings belongs to train_graph_list[k].
eval_loader = DataLoader(train_graph_list, batch_size=32, shuffle=False)

embeddings = []
with torch.no_grad():
    for batch in eval_loader:
        batch = batch.to(device)
        emb = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        embeddings.append(emb.cpu())

# Combine into one tensor: one row per graph, in dataset order.
all_embeddings = torch.cat(embeddings, dim=0)

# Participant IDs in the same (dataset) order as the embeddings.
train_graph_ids = [g.participant_id for g in train_graph_list]
assert all_embeddings.shape[0] == len(train_graph_ids), \
    "number of embeddings does not match number of graphs"

gnn_embeddings = pd.DataFrame(all_embeddings.numpy())
gnn_embeddings['participant_id'] = train_graph_ids

# Attach the training targets to each participant's embedding.
gnn_embeddings_merged = pd.merge(gnn_embeddings, solutions, on='participant_id')

## VAE

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder with one hidden layer per side.

    Encoder: ``input_dim -> hidden_dim -> (mu, logvar)`` each of ``latent_dim``.
    Decoder: ``latent_dim -> hidden_dim -> input_dim`` followed by a sigmoid,
    so reconstructions lie in (0, 1); inputs must be scaled to [0, 1] for the
    binary-cross-entropy reconstruction loss to be valid.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        """
        Args:
            input_dim: Length of each flattened input vector.
            hidden_dim: Width of the hidden layer in encoder and decoder.
            latent_dim: Size of the latent code.
        """
        super().__init__()
        # Remembered so forward() can flatten inputs without being told the size.
        self.input_dim = input_dim

        # Encoder: the two heads read the hidden activation, not the raw input.
        self.enc1 = nn.Linear(input_dim, hidden_dim)
        self.enc21 = nn.Linear(hidden_dim, latent_dim)  # mean head
        self.enc22 = nn.Linear(hidden_dim, latent_dim)  # log-variance head

        # Decoder
        self.dec1 = nn.Linear(latent_dim, hidden_dim)
        self.dec2 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        h = F.relu(self.enc1(x))
        mu = self.enc21(h)
        logvar = self.enc22(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)``.

        Writing the sample this way keeps it differentiable with respect to
        ``mu`` and ``logvar``.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent codes back to input space; outputs lie in (0, 1)."""
        h = F.relu(self.dec1(z))
        return torch.sigmoid(self.dec2(h))

    def forward(self, x, input_dim=None):
        """Encode, sample and decode.

        Args:
            x: Input batch; reshaped to ``(-1, input_dim)`` before encoding.
            input_dim: Flattened input length. Defaults to the value given to
                the constructor.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        if input_dim is None:
            input_dim = self.input_dim
        mu, logvar = self.encode(x.reshape(-1, input_dim))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    @staticmethod
    def loss_function(recon_x, x, mu, logvar, input_dim=None):
        """Negative ELBO: summed reconstruction BCE plus KL divergence.

        Args:
            recon_x: Decoder output, values in (0, 1).
            x: Target batch with values in [0, 1]; reshaped to ``(-1, input_dim)``.
            mu: Posterior means.
            logvar: Posterior log-variances.
            input_dim: Flattened input length. Defaults to the last dimension
                of ``recon_x``.

        Returns:
            Scalar tensor, summed over the batch.
        """
        if input_dim is None:
            input_dim = recon_x.shape[-1]
        # Sum (not mean) so the reconstruction term is on the same scale as
        # the summed KL term.
        BCE = F.binary_cross_entropy(recon_x, x.reshape(-1, input_dim), reduction='sum')
        # KL(q(z|x) || N(0, I)) = -0.5 * sum(1 + logvar - mu^2 - exp(logvar)) >= 0
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return BCE + KLD
    
    

In [None]:
# `optim` is already imported in the notebook's first cell.

# Initialize model, optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# VAE inputs: one flattened connectome vector per participant.
vae_features = torch.tensor(
    train_connectome.drop(columns=['participant_id']).to_numpy(),
    dtype=torch.float32,
)
# The decoder ends in a sigmoid and the loss is binary cross-entropy, so the
# targets must lie in [0, 1]. Min-max scale over the whole matrix.
feat_min, feat_max = vae_features.min(), vae_features.max()
vae_inputs = (vae_features - feat_min) / (feat_max - feat_min).clamp_min(1e-12)

# Model sizes. The hidden/latent widths and batch size are starting values to tune.
input_dim = vae_inputs.shape[1]
vae_hidden_dim = 512
vae_latent_dim = 64

# Plain tensor batches, so use the torch DataLoader rather than the
# torch_geometric one imported as `DataLoader`.
train_loader = torch.utils.data.DataLoader(
    TensorDataset(vae_inputs), batch_size=32, shuffle=True
)

model = VAE(input_dim, vae_hidden_dim, vae_latent_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
def train(epoch):
    """Run one epoch of VAE training and print per-sample loss figures.

    Args:
        epoch: Epoch number, used only in the progress messages.
    """
    model.train()
    train_loss = 0
    # TensorDataset with a single tensor yields 1-tuples; there are no labels.
    for batch_idx, (data,) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data, input_dim)
        loss = VAE.loss_function(recon_batch, data, mu, logvar, input_dim)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        if batch_idx % 100 == 0:
            print(f'Epoch {epoch}, Batch {batch_idx}, Loss {loss.item() / len(data):.6f}')
    print(f'Epoch {epoch}, Avg Loss {train_loss / len(train_loader.dataset):.6f}')

# Run for 10 epochs
for epoch in range(1, 11):
    train(epoch)