In [None]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
from tensorflow.keras import layers, models
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool, GATConv, NNConv
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
import networkx as nx
from torch_geometric.loader import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset
from torch.nn import Sequential, Linear, ReLU




In [2]:
# Load the functional connectome tables for the test and training splits.
# Each CSV is read with pandas defaults; paths are machine-specific.
test_connectome = pd.read_csv(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv'
)
train_connectome = pd.read_csv(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv'
)

In [3]:
# Load the training labels workbook (machine-specific path).
solutions = pd.read_excel(
    '/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx'
)

In [4]:
# Preview the first five label rows.
solutions.head(n=5)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


In [None]:
# Check which GPUs TensorFlow can see
print(tf.config.list_physical_devices('GPU'))

# Choose the PyTorch device once: prefer CUDA, then Apple MPS, else CPU.
# Probing both backends keeps this cell consistent with the later training
# cells, which look for CUDA.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'



[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Denoising Autoencoder

Training the DAE using the training data only

Construct a graph for each patient

Layer options

- Graph convolution network GCN layers

- Graph attention network

- GraphSAGE

### Graph Construct

In [29]:
def create_graph_construct(df, num_regions=200, undirected=False):
    """Build one PyTorch Geometric ``Data`` graph per row of a connectome table.

    Each row of ``df`` holds a ``participant_id`` plus the flattened upper
    triangle (diagonal excluded, row-major order) of a
    ``num_regions x num_regions`` connectivity matrix.

    Args:
        df: DataFrame with a ``participant_id`` column; every other column is
            treated as a connectivity value, in upper-triangle order.
        num_regions: Number of nodes (brain regions) per graph.
        undirected: When False (default) each region pair yields one edge
            ``i -> j`` with ``i < j``, exactly as before. When True the
            reverse edge ``j -> i`` is appended with the same weight, so
            message passing can flow in both directions.

    Returns:
        list of ``Data`` objects, one per row, each with identity node
        features ``x``, ``edge_index``, ``edge_attr`` (float32 weights),
        ``num_nodes`` and a ``participant_id`` attribute.

    Raises:
        ValueError: if a non-empty ``df`` has fewer value columns than
            ``num_regions * (num_regions - 1) / 2``.
    """
    # np.triu_indices enumerates (i, j) pairs with i < j in row-major order,
    # which is the order the flattened columns are consumed in.
    row_idx, col_idx = np.triu_indices(num_regions, k=1)
    num_pairs = row_idx.size

    # Drop the ID column by name rather than by position.
    weights = df.drop(columns=['participant_id']).to_numpy(dtype=float)
    if weights.shape[0] and weights.shape[1] < num_pairs:
        raise ValueError(
            f"Expected at least {num_pairs} connectivity columns for "
            f"{num_regions} regions, got {weights.shape[1]}"
        )
    weights = weights[:, :num_pairs]  # surplus trailing columns are ignored

    pair_index = np.stack([row_idx, col_idx])  # shape (2, num_pairs)
    if undirected:
        # Mirror every edge so both (i, j) and (j, i) are present.
        pair_index = np.concatenate([pair_index, pair_index[::-1]], axis=1)
        weights = np.concatenate([weights, weights], axis=1)

    # The edge list is identical for every participant, so build it once.
    base_edge_index = torch.tensor(pair_index, dtype=torch.long)

    graph_list = []
    for participant_id, participant_weights in zip(df['participant_id'], weights):
        graph_data = Data(
            x=torch.eye(num_regions, dtype=torch.float),
            # Clone so graphs never share (and cannot cross-mutate) one tensor.
            edge_index=base_edge_index.clone(),
            edge_attr=torch.tensor(participant_weights, dtype=torch.float),
            num_nodes=num_regions,
        )
        graph_data.participant_id = participant_id
        graph_list.append(graph_data)

    return graph_list


In [30]:
# Build one graph per training participant (200 regions is the function default).
train_graph_list = create_graph_construct(train_connectome, num_regions=200)

In [None]:
# Keep the first training graph around as a sample and print its summary.
sample_graph = train_graph_list[0]
print(sample_graph)

In [None]:
# An undirected graph lists every edge in both directions: (A, B) and (B, A).
def is_undirected(graph):
    """Return True when every edge (u, v) in ``graph.edge_index`` has a matching (v, u).

    ``graph.edge_index`` is read via ``.numpy()`` and transposed so that each
    row is one (source, target) pair. A graph with no edges returns True.
    """
    pairs = {tuple(edge) for edge in graph.edge_index.numpy().T}
    return all((target, source) in pairs for source, target in pairs)

# Symmetry check on the first graph only
print(is_undirected(train_graph_list[0]))

# Symmetry check across every training graph (stops at the first failure)
all_undirected = all(map(is_undirected, train_graph_list))
print(all_undirected)

In [None]:
# Convert to NetworkX (no weights)
nx_graph = to_networkx(sample_graph, to_undirected=True)

# Draw the graph; a single plt.show() renders it (the repeated call was redundant)
plt.figure(figsize=(20, 20))
nx.draw(nx_graph, with_labels=True, node_size=800, font_size=12)
plt.title("Sample Graph Connectivity")
plt.show()



In [None]:
# Pull the sample graph's edges and weights out as NumPy arrays
edge_weights = sample_graph.edge_attr.numpy()
edges = sample_graph.edge_index.numpy().T

# Rebuild the graph in NetworkX, storing each weight under the 'weight' key
G = nx.Graph()
G.add_weighted_edges_from(
    (source, target, weight) for (source, target), weight in zip(edges, edge_weights)
)

# Fixed seed keeps the spring layout reproducible between runs
pos = nx.spring_layout(G, seed=42)
edge_labels = nx.get_edge_attributes(G, 'weight')

# Draw nodes and edges first, then overlay the weight labels
plt.figure(figsize=(20, 20))
nx.draw(G, pos, with_labels=True, node_size=300, font_size=10)
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
plt.title("Graph with Edge Weights")
plt.show()


In [None]:
# Per-graph summaries: edge count plus mean / max / min edge weight
num_edges = [g.edge_index.shape[1] for g in train_graph_list]
mean_weights = [g.edge_attr.mean().item() for g in train_graph_list]
max_weights = [g.edge_attr.max().item() for g in train_graph_list]
min_weights = [g.edge_attr.min().item() for g in train_graph_list]

# Aggregate the per-graph summaries over the whole training set
print("Avg number of edges:", np.mean(num_edges))
print("Mean weight:", np.mean(mean_weights))
print("Max weight:", np.max(max_weights))
print("Min weight:", np.min(min_weights))

### DAE construct

Architecture
- Encoder
- Decoder
- Noise mechanism 
- Loss Function

Components
- GCN layer
- Input Feature - x = torch.eye(200) 
- Output target - Node features (x)
    - DAE reconstructs enhanced node features
    - GNN compatible
    - captures both local structure and noise-robust patterns
    - compact

Noise mechanisms:
- masking edge weights
- adding Gaussian noise to edge weights
- Perturb X

## Graph GCN Denoising Autoencoder

In [36]:
class GraphDAE(nn.Module):
    """Graph denoising autoencoder over weighted connectome graphs.

    The encoder is three stacked ``GCNConv`` layers that map node features to
    ``latent_dim``-sized node embeddings, using the (noisy) edge weights during
    message passing. The decoder is a two-layer MLP that takes the concatenated
    embeddings of an edge's two endpoints and predicts that edge's weight,
    squashed to (0, 1) by a sigmoid.

    Args:
        input_dim: Size of each node's input feature vector.
        hidden_dim: Width of the hidden GCN layers and of the decoder MLP.
        latent_dim: Size of each node embedding.
        dropout: Probability that an edge weight is zeroed by ``add_noise``.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, dropout=0.2):
        super().__init__()
        self.dropout = dropout

        # Encoder: GCN layers that compress node features into latent_dim
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv_intermediate = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, latent_dim)

        # Decoder: MLP over [z_i, z_j] that predicts one weight per edge
        self.fullc1 = nn.Linear(latent_dim * 2, hidden_dim)
        self.fullc2 = nn.Linear(hidden_dim, 1)

        # Activation functions
        self.relu = nn.ReLU()        # non-linearity
        self.sigmoid = nn.Sigmoid()  # reconstructed weights live in (0, 1)

    def add_noise(self, edge_index, edge_attr, noise_factor=0.05):
        """Corrupt edge weights by random masking plus Gaussian noise.

        Each weight is independently dropped (set to 0) with probability
        ``self.dropout``; every surviving weight receives additive Gaussian
        noise scaled by ``noise_factor``. The input tensor is not modified.

        Returns:
            ``(edge_index, noisy_edge_attr)`` with ``edge_index`` passed
            through unchanged.
        """
        # Draw the mask on edge_attr's device so no cross-device indexing occurs.
        keep_mask = torch.rand(edge_attr.size(), device=edge_attr.device) > self.dropout
        noise = torch.randn_like(edge_attr) * noise_factor
        noisy_edge_attr = torch.where(
            keep_mask, edge_attr + noise, torch.zeros_like(edge_attr)
        )
        return edge_index, noisy_edge_attr

    def encode(self, x, edge_index, edge_attr):
        """Encode node features into latent node embeddings.

        ``edge_attr`` is handed to every GCN layer as ``edge_weight`` so the
        embeddings depend on the connectivity strengths, not only on which
        edges exist.

        NOTE(review): weights are presumably expected to be non-negative for
        GCNConv's degree normalisation; ``forward`` clamps to [0, 1] - confirm
        before calling this directly with raw correlations.
        """
        h = self.relu(self.conv1(x, edge_index, edge_weight=edge_attr))
        h = self.relu(self.conv_intermediate(h, edge_index, edge_weight=edge_attr))
        return self.conv2(h, edge_index, edge_weight=edge_attr)

    def decode(self, z, edge_index):
        """Predict one weight in (0, 1) per edge from its endpoint embeddings.

        Returns a 1-D tensor of length ``edge_index.shape[1]``.
        """
        # Concatenate z[i] and z[j] for every edge (i, j)
        edge_features = torch.cat([z[edge_index[0]], z[edge_index[1]]], dim=-1)
        hidden = self.relu(self.fullc1(edge_features))
        # squeeze(-1) keeps the result 1-D even when there is a single edge
        return self.sigmoid(self.fullc2(hidden).squeeze(-1))

    def forward(self, data):
        """Normalise, corrupt (training only), encode and decode one batch.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``edge_attr``.

        Returns:
            ``(recon_edge_attr, z)``: reconstructed edge weights in (0, 1) and
            the latent node embeddings. The reconstruction target is the
            min-max normalised ``edge_attr`` computed below.
        """
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr

        # Min-max normalise to [0, 1] over all edges in the batch
        edge_attr = (edge_attr - edge_attr.min()) / (edge_attr.max() - edge_attr.min() + 1e-6)

        if self.training:
            # Corrupt the encoder's input only while training
            enc_edge_index, enc_edge_attr = self.add_noise(edge_index, edge_attr)
            # Gaussian noise can push weights outside [0, 1]; bring them back
            enc_edge_attr = enc_edge_attr.clamp(0.0, 1.0)
        else:
            # In eval mode embeddings come from the clean graph
            enc_edge_index, enc_edge_attr = edge_index, edge_attr

        # Encode the (possibly noisy) graph into latent node embeddings
        z = self.encode(x, enc_edge_index, enc_edge_attr)
        # Decode over the original edge list to reconstruct clean weights
        recon_edge_attr = self.decode(z, edge_index)

        return recon_edge_attr, z


In [None]:
# Train the graph denoising autoencoder

# Parameters
input_dim = 200   # node feature size (identity features, one per region)
hidden_dim = 128  # width of the hidden layers
latent_dim = 64   # latent embedding size per node
batch_size = 32
epochs = 50

model = GraphDAE(input_dim, hidden_dim, latent_dim).to(device)  # move model to the selected device
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
criterion = nn.MSELoss()  # reconstruction metric

# Reuse the graphs built earlier instead of rebuilding them from the DataFrame.
graph_list = train_graph_list
# Batch the list of Data objects: one graph at a time is slow, all at once is too much.
loader = DataLoader(graph_list, batch_size=batch_size, shuffle=True)

# Training loop over mini-batches
model.train()
for epoch in range(epochs):
    total_loss = 0.0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Forward
        recon_edge_attr, z = model(batch)

        # The model reconstructs min-max normalised weights (sigmoid output),
        # so normalise the target the same way GraphDAE.forward does.
        raw_attr = batch.edge_attr
        target = (raw_attr - raw_attr.min()) / (raw_attr.max() - raw_attr.min() + 1e-6)
        loss = criterion(recon_edge_attr, target)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Per-epoch bookkeeping belongs inside the epoch loop
    avg_loss = total_loss / len(loader)
    scheduler.step(avg_loss)  # ReduceLROnPlateau needs the monitored metric each epoch

    print(f"Epoch {epoch+1}, Loss: {avg_loss}")

    # Stop early once the loss reaches the target threshold
    if avg_loss < 0.005:
        print("Stopping early")
        break



In [41]:
# Persist only the trained GraphDAE weights (state_dict), not the whole module.
torch.save(
    model.state_dict(),
    "/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/graph_dae.pth",
)

# Non Graph Denoising Autoencoder


In [None]:
class NonGraphDAE(nn.Module):
    """Fully connected denoising autoencoder over flat feature vectors.

    The input is corrupted (training mode only) by zeroing a random fraction
    of entries and adding Gaussian noise to the rest; the network is then
    asked to reconstruct the clean input.

    Args:
        input_dim: Length of each input vector.
        hidden_dim: Width of the single hidden layer in encoder and decoder.
        latent_dim: Size of the latent code.
        dropout: Probability that an input entry is zeroed by ``add_noise``.
        noise_factor: Standard deviation of the Gaussian noise added to the
            entries that are kept.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, dropout=0.2, noise_factor=0.05):
        super().__init__()
        self.dropout = dropout
        self.noise_factor = noise_factor

        # Encoder: input -> hidden -> latent
        self.enc1 = nn.Linear(input_dim, hidden_dim)
        self.enc2 = nn.Linear(hidden_dim, latent_dim)

        # Decoder: latent -> hidden -> input
        self.dec1 = nn.Linear(latent_dim, hidden_dim)
        self.dec2 = nn.Linear(hidden_dim, input_dim)

        # Activation function
        self.relu = nn.ReLU()  # non-linearity

    def add_noise(self, x, noise_factor=None):
        """Return a corrupted copy of ``x``; ``x`` itself is not modified.

        Each entry is independently zeroed with probability ``self.dropout``;
        surviving entries receive additive Gaussian noise. ``noise_factor``
        overrides the instance-level value when given.
        """
        if noise_factor is None:
            # getattr keeps instances created before this attribute existed working
            noise_factor = getattr(self, 'noise_factor', 0.05)

        keep_mask = torch.rand(x.size(), device=x.device) > self.dropout
        noise = torch.randn_like(x) * noise_factor
        return torch.where(keep_mask, x + noise, torch.zeros_like(x))

    def encode(self, x):
        """Map inputs to latent codes (linear -> ReLU -> linear)."""
        return self.enc2(self.relu(self.enc1(x)))

    def decode(self, z):
        """Map latent codes back to input space (linear -> ReLU -> linear)."""
        return self.dec2(self.relu(self.dec1(z)))

    def forward(self, x):
        """Corrupt (training only), encode and decode.

        Returns:
            ``(recon_x, z)``: the reconstruction of ``x`` and its latent code.
        """
        # Noise is a training-time corruption; eval-mode codes use the clean input
        encoder_input = self.add_noise(x) if self.training else x
        z = self.encode(encoder_input)
        recon_x = self.decode(z)

        return recon_x, z

In [None]:
# Flatten the training connectomes into a float32 tensor: every column except the ID
features = train_connectome.drop(columns=['participant_id']).to_numpy()
x = torch.tensor(features, dtype=torch.float32)
print("x shape:", x.shape)

In [None]:
# Model parameters
input_dim = x.shape[1]  # number of connectivity features per participant
hidden_dim = 64
latent_dim = 128
dropout = 0.3
# NOTE(review): noise_factor is defined here but never handed to the model in
# this cell, so add_noise falls back to its own default - confirm intent.
noise_factor = 0.1
batch_size = 32

# Initialize model
model = NonGraphDAE(input_dim, hidden_dim, latent_dim, dropout)

# Move to the best available accelerator: CUDA, then Apple MPS, else CPU.
# Probing only CUDA would silently train on CPU on an MPS-only machine.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
x = x.to(device)
model = model.to(device)

# DataLoader
dataset = TensorDataset(x)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()

# Training loop
num_epochs = 150
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch_x, in dataloader:
        batch_x = batch_x.to(device)
        optimizer.zero_grad()
        recon_x, z = model(batch_x)
        # The target is the clean batch; the model corrupts its own input
        loss = criterion(recon_x, batch_x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.6f}')


In [None]:
# Persist only the trained NonGraphDAE weights (state_dict).
torch.save(
    model.state_dict(),
    "/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/nongraph_dae.pth",
)

# Non-Graph Autoencoder

In [None]:
class NonGraphAE(nn.Module):
    """Plain fully connected autoencoder over flat feature vectors.

    Encoder and decoder are each two linear layers with a ReLU in between.
    No noise is applied to the input.

    Args:
        input_dim: Length of each input vector.
        hidden_dim: Width of the hidden layer in encoder and decoder.
        latent_dim: Size of the latent code.
        dropout: Stored on the instance; not read by any method of this class.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, dropout=0.2):
        super().__init__()
        self.dropout = dropout

        # Encoder: input -> hidden -> latent
        self.enc1 = nn.Linear(input_dim, hidden_dim)
        self.enc2 = nn.Linear(hidden_dim, latent_dim)

        # Decoder: latent -> hidden -> input
        self.dec1 = nn.Linear(latent_dim, hidden_dim)
        self.dec2 = nn.Linear(hidden_dim, input_dim)

        # Shared non-linearity
        self.relu = nn.ReLU()

    def encode(self, x):
        """Map inputs to latent codes (linear -> ReLU -> linear)."""
        hidden = self.relu(self.enc1(x))
        return self.enc2(hidden)

    def decode(self, z):
        """Map latent codes back to input space (linear -> ReLU -> linear)."""
        hidden = self.relu(self.dec1(z))
        return self.dec2(hidden)

    def forward(self, x):
        """Encode then decode; returns ``(recon_x, z)``."""
        latent = self.encode(x)
        return self.decode(latent), latent

In [None]:
# Model parameters
input_dim = x.shape[1]  # number of connectivity features per participant
hidden_dim = 68
latent_dim = 128
dropout = 0.3
noise_factor = 0.1  # NOTE(review): not used anywhere in this cell - confirm it is still needed
batch_size = 32

# Initialize model
model = NonGraphAE(input_dim, hidden_dim, latent_dim, dropout)

# Move to the best available accelerator: CUDA, then Apple MPS, else CPU.
# Probing only CUDA would silently train on CPU on an MPS-only machine.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
x = x.to(device)
model = model.to(device)

# DataLoader
dataset = TensorDataset(x)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()

# Training loop
num_epochs = 150
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch_x, in dataloader:
        batch_x = batch_x.to(device)
        optimizer.zero_grad()
        recon_x, z = model(batch_x)
        # Plain reconstruction objective: output vs. the same input
        loss = criterion(recon_x, batch_x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.6f}')

In [None]:
# Persist the entire NonGraphAE module object.
# NOTE(review): the other save cells store model.state_dict(); this one
# pickles the whole model - confirm which form the loading code expects.
torch.save(
    model,
    "/Users/rubyc/Desktop/Datathon/WIDS_Datathon2025_Team/Archive/Models/nongraph_autoencoder.pth",
)