In [1]:
# Combining Attention values across heads - Avg
# Combining Attention values across layers - Matrix Multiply

# Setup

In [2]:
# !pip install dgl torch_geometric torch

# Install required python libraries
import os

# Install PyTorch Geometric and other libraries
# if 'IS_GRADESCOPE_ENV' not in os.environ:
#     print("Installing PyTorch Geometric")
#     !pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
#     !pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cu121.html
#     !pip install -q torch-geometric
#     print("Installing other libraries")
#     !pip install networkx
#     !pip install lovely-tensors

In [3]:
import os
import sys
import time
import math
import random
import itertools
from datetime import datetime
from typing import Mapping, Tuple, Sequence, List

import pandas as pd
import networkx as nx
import numpy as np
import scipy as sp

from tqdm.notebook import tqdm

import torch
import torch.nn.functional as F
from torch.nn import Embedding, Linear, ReLU, BatchNorm1d, LayerNorm, Module, ModuleList, Sequential
from torch.nn import TransformerEncoder, TransformerEncoderLayer, MultiheadAttention
from torch.optim import Adam

import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import Planetoid

import torch_geometric.transforms as T
from torch_geometric.utils import remove_self_loops, dense_to_sparse, to_dense_batch, to_dense_adj

from torch_geometric.nn import GCNConv, GATConv, GATv2Conv

# from torch_scatter import scatter, scatter_mean, scatter_max, scatter_sum

import lovely_tensors as lt
lt.monkey_patch()

import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore", category=RuntimeWarning)
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

print("All imports succeeded.")
print("Python version {}".format(sys.version))
print("PyTorch version {}".format(torch.__version__))
print("PyG version {}".format(torch_geometric.__version__))



All imports succeeded.
Python version 3.11.7 (tags/v3.11.7:fa7a6f2, Dec  4 2023, 19:24:49) [MSC v.1937 64 bit (AMD64)]
PyTorch version 2.6.0+cu118
PyG version 2.6.1


In [4]:
# Set random seed for deterministic results

def seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed(0)
print("All seeds set.")

All seeds set.


In [5]:
print("Cuda available: {}".format(torch.cuda.is_available()))

Cuda available: True


# Datasets

In [6]:
# use igraph because much faster than networkx
import igraph as ig
from tqdm import tqdm

def add_position_info(dataset, pe_dim=16):
    
    processed_dataset = []
    pe_transform = T.AddLaplacianEigenvectorPE(k=pe_dim, attr_name='pos_enc')

    for data in tqdm(dataset, total=len(dataset), desc='Processing disjoint graphs'):
        num_nodes = data.num_nodes
        edge_index = data.edge_index

        # Convert to igraph
        edges = edge_index.t().tolist()
        g = ig.Graph(n=num_nodes, edges=edges, directed=False)

        # compute shortest path distances
        sp_matrix = torch.tensor(g.distances(algorithm="johnson"), dtype=torch.float16)
        sp_matrix[torch.isinf(sp_matrix)] = 0
        data.dense_sp_matrix = sp_matrix

        dense_adj = to_dense_adj(edge_index, max_num_nodes=num_nodes)[0]
        dense_adj = dense_adj + torch.eye(num_nodes, dtype=dense_adj.dtype)
        dense_adj[dense_adj == 2] = 1  # remove double self-loops
        data.dense_adj = dense_adj

        # add Laplacian eigenvectors as positional encoding
        data = pe_transform(data)

        processed_dataset.append(data)

    return processed_dataset

In [7]:
from torch_geometric.datasets import PPI
from torch_geometric.loader import DataLoader

train = add_position_info(PPI(root='/tmp/PPI', split='train'))
val = add_position_info(PPI(root='/tmp/PPI', split='val'))
test = add_position_info(PPI(root='/tmp/PPI', split='test'))

train = DataLoader(train, batch_size=1, shuffle=True)
val = DataLoader(val, batch_size=1)
test = DataLoader(test, batch_size=1)

Processing disjoint graphs: 100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
Processing disjoint graphs: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
Processing disjoint graphs: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


In [8]:
data = next(iter(train))

In [9]:
ex_spd = data.dense_sp_matrix
ex_spd.shape

torch.Size([3021, 3021])

In [10]:
tensor_memory_bytes = ex_spd.element_size() * ex_spd.numel()
tensor_memory_MB = tensor_memory_bytes / (1024 ** 2)  # Convert to MB
print(f"Tensor memory usage: {tensor_memory_MB:.2f} MB")

Tensor memory usage: 17.41 MB


In [11]:
data

DataBatch(x=[3021, 50], edge_index=[2, 91338], y=[3021, 121], dense_sp_matrix=[3021, 3021], dense_adj=[3021, 3021], pos_enc=[3021, 16], batch=[3021], ptr=[2])

## Table 1: Dataset Statistics

In [12]:
# ### Table 1 ###
# ### Dataset Statistics ###
# import dgl
# Homophily_Levels = []
# 
# for data in train:
#   edge_index_tensor = torch.tensor(data.edge_index.cpu().numpy(), dtype=torch.long)
#   g = dgl.graph((edge_index_tensor[0], edge_index_tensor[1]), num_nodes=data.x.shape[0])
#   g.ndata['y'] = torch.tensor(data.y.cpu().numpy(), dtype=torch.long)
#   Homophily_Levels.append({'Node Homophily':dgl.node_homophily(g, g.ndata['y'])*100,
#                                 'Edge Homophily':dgl.edge_homophily(g, g.ndata['y'])*100,
#                                 'Adjusted Homophily':dgl.adjusted_homophily(g, g.ndata['y'])*100,
#                                 'Number of Nodes': int(g.num_nodes()),
#                                 'Number of Edges': int(g.num_edges())
#                                 })
# df = pd.DataFrame(Homophily_Levels).round(1)
# df

# Models

In [28]:
# PyG example code: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/gcn2_cora.py

class GNNModel(Module):

    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            hidden_dim: int = 256,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)
        self.layers = ModuleList()

        for layer in range(num_layers):
            self.layers.append(
                GCNConv(hidden_dim, hidden_dim)
            )
        self.dropout = dropout

    def forward(self, x, edge_index):

        x = self.lin_in(x)

        for layer in self.layers:
            # conv -> activation ->  dropout -> residual
            x_in = x
            x = layer(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, self.dropout, training=self.training)
            x = x_in + x

        x = self.lin_out(x)
        # return x.log_softmax(dim=-1)
        return x


class SparseGraphTransformerModel(Module):
    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            hidden_dim: int = 256,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)

        self.layers = ModuleList()
        for layer in range(num_layers):
            self.layers.append(
                MultiheadAttention(
                    embed_dim = hidden_dim,
                    num_heads = num_heads,
                    dropout = dropout
                )
            )
        self.dropout = dropout

    def forward(self, x, dense_adj):

        x = self.lin_in(x)

        self.attn_weights_list = []

        for layer in self.layers:
            x_in = x
            x, attn_weights = layer(
                x, x, x,
                attn_mask = ~dense_adj.bool(),
                average_attn_weights = False
            )
            x = F.relu(x)
            x = F.dropout(x, self.dropout, training=self.training)
            x = x_in + x

            self.attn_weights_list.append(attn_weights)

        x = self.lin_out(x)

        # return x.log_softmax(dim=-1)
        return x

class DenseGraphTransformerModel(Module):

    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            pos_enc_dim: int = 16,
            hidden_dim: int = 256,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_pos_enc = Linear(pos_enc_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)

        self.layers = ModuleList()
        for layer in range(num_layers):
            self.layers.append(
                MultiheadAttention(
                    embed_dim = hidden_dim,
                    num_heads = num_heads,
                    dropout = dropout
                )
            )

        self.attn_bias_scale = torch.nn.Parameter(torch.tensor([10.0]))  # controls how much we initially bias our model to nearby nodes
        self.dropout = dropout

    def forward(self, x, pos_enc, dense_sp_matrix):

        # x = self.lin_in(x) + self.lin_pos_enc(pos_enc)
        x = self.lin_in(x)  # no node positional encoding

        # attention bias
        # [i, j] -> inverse of shortest path distance b/w node i and j
        # diagonals -> self connection, set to 0
        # disconnected nodes -> -1
        attn_bias = self.attn_bias_scale * torch.nan_to_num(
            (1 / (torch.nan_to_num(dense_sp_matrix, nan=-1, posinf=-1, neginf=-1))),
            nan=0, posinf=0, neginf=0)
        #attn_bias = torch.ones_like(attn_bias)

        # TransformerEncoder
        # x = self.encoder(x, mask = attn_bias)

        self.attn_weights_list = []

        for layer in self.layers:
            # MHSA layer
            # float mask adds learnable additive attention bias
            x_in = x
            x, attn_weights = layer(
                x, x, x,
                attn_mask = attn_bias,
                average_attn_weights = False
            )
            x = F.relu(x)
            x = F.dropout(x, self.dropout, training=self.training)
            x = x_in + x

            self.attn_weights_list.append(attn_weights)

        x = self.lin_out(x)
        # return x.log_softmax(dim=-1)
        return x



class DenseGraphTransformerModel_V2(Module):
    def __init__(
            self,
            in_dim: int = data.x.shape[-1],
            pos_enc_dim: int = 16,
            hidden_dim: int = 256,
            num_heads: int = 1,
            num_layers: int = 1,
            out_dim: int = len(data.y.unique()),
            dropout: float = 0.5,
        ):
        super().__init__()

        self.lin_in = Linear(in_dim, hidden_dim)
        self.lin_pos_enc = Linear(pos_enc_dim, hidden_dim)
        self.lin_out = Linear(hidden_dim, out_dim)

        self.layers = ModuleList()
        for layer in range(num_layers):
            self.layers.append(
                MultiheadAttention(
                    embed_dim = hidden_dim,
                    num_heads = num_heads,
                    dropout = dropout
                )
            )

        self.attn_bias_scale = torch.nn.Parameter(torch.tensor([10.0]))  # controls how much we initially bias our model to nearby nodes
        self.dropout = dropout

    def forward(self, x, pos_enc, dense_sp_matrix):

        x = self.lin_in(x) + self.lin_pos_enc(pos_enc)

        self.attn_weights_list = []

        for layer in self.layers:

            # MHSA layer
            # float mask adds learnable additive attention bias
            x_in = x
            x, attn_weights = layer(
                x, x, x,
                average_attn_weights = False
            )
            x = F.relu(x)
            x = F.dropout(x, self.dropout, training=self.training)
            x = x_in + x

            self.attn_weights_list.append(attn_weights)

        x = self.lin_out(x)

        # return x.log_softmax(dim=-1)
        return x

# Trainers

In [14]:
from sklearn.metrics import f1_score

In [15]:
def Train_GCN(NUM_LAYERS, NUM_HEADS, train_loader, val_loader, test_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    IN_DIM = train_loader.dataset[0].x.shape[1]
    OUT_DIM = train_loader.dataset[0].y.shape[1]  # 121 for PPI

    model = GNNModel(num_layers=NUM_LAYERS, num_heads=NUM_HEADS, in_dim=IN_DIM, out_dim=OUT_DIM).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.5)

    def train():
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch.x, batch.edge_index)
            loss = loss_fn(out, batch.y.float())  # y is multi-label
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(train_loader)

    def test(loader):
        model.eval()
        y_true, y_pred = [], []
        
        for batch in loader:
            batch = batch.to(device)
            logits = model(batch.x, batch.edge_index)          # or …dense_adj / …spd
            probs  = torch.sigmoid(logits)                     # convert logits → probabilities
            preds  = (probs > 0.5).cpu().numpy().astype(int)   # threshold at 0.5
            y_true.append(batch.y.cpu().numpy())
            y_pred.append(preds)

        y_true = np.vstack(y_true)
        y_pred = np.vstack(y_pred)
        micro_f1 = f1_score(y_true, y_pred, average="micro")
        return micro_f1

    best_val_f1 = test_f1 = 0
    times = []

    for epoch in range(1, EPOCHS):
        start = time.time()
        loss = train()
        scheduler.step()
        train_f1 = test(train_loader)
        val_f1 = test(val_loader)
        tmp_test_f1 = test(test_loader)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            test_f1 = tmp_test_f1

        times.append(time.time() - start)
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}, Train: {train_f1:.4f}, Val: {val_f1:.4f}, Test: {test_f1:.4f}')

    return {
        'train_f1': train_f1,
        'val_f1': val_f1,
        'test_f1': test_f1
    }, None


def Train_SparseGraphTransformerModel(NUM_LAYERS, NUM_HEADS, train_loader, val_loader, test_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Get dimensions from dataset
    IN_DIM = train_loader.dataset[0].x.shape[1]
    OUT_DIM = train_loader.dataset[0].y.shape[1]

    model = SparseGraphTransformerModel(
        num_layers=NUM_LAYERS,
        num_heads=NUM_HEADS,
        in_dim=IN_DIM,
        out_dim=OUT_DIM
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.5)

    def train():
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch.x, batch.dense_adj)
            loss = loss_fn(out, batch.y.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(train_loader)

    def test(loader):
        model.eval()
        y_true, y_pred = [], []

        for batch in loader:
            batch = batch.to(device)
            logits = model(batch.x, batch.dense_adj)          # or …dense_adj / …spd
            probs  = torch.sigmoid(logits)                     # convert logits → probabilities
            preds  = (probs > 0.5).cpu().numpy().astype(int)   # threshold at 0.5
            y_true.append(batch.y.cpu().numpy())
            y_pred.append(preds)

        y_true = np.vstack(y_true)
        y_pred = np.vstack(y_pred)
        micro_f1 = f1_score(y_true, y_pred, average="micro")
        return micro_f1

    best_val_f1 = test_f1 = 0
    times = []

    for epoch in range(1, EPOCHS):
        start = time.time()
        loss = train()
        scheduler.step()
        train_f1 = test(train_loader)
        val_f1 = test(val_loader)
        tmp_test_f1 = test(test_loader)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            test_f1 = tmp_test_f1

        times.append(time.time() - start)

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}, Train: {train_f1:.4f}, Val: {val_f1:.4f}, Test: {test_f1:.4f}')

    return {
        'train_f1': train_f1,
        'val_f1': val_f1,
        'test_f1': test_f1
    }, model.attn_weights_list  # assuming your model exposes attention weights this way

def Train_DenseGraphTransformerModel(NUM_LAYERS, NUM_HEADS, train_loader, val_loader, test_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    IN_DIM = train_loader.dataset[0].x.shape[1]
    OUT_DIM = train_loader.dataset[0].y.shape[1]

    model = DenseGraphTransformerModel(
        num_layers=NUM_LAYERS,
        num_heads=NUM_HEADS,
        in_dim=IN_DIM,
        out_dim=OUT_DIM
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.5)
    
    def train():
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch.x, batch.pos_enc, batch.dense_sp_matrix)  # batch.spd is dense_sp_matrix
            loss = loss_fn(out, batch.y.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(train_loader)

    def test(loader):
        model.eval()
        y_true, y_pred = [], []

        for batch in loader:
            batch = batch.to(device)
            logits = model(batch.x, batch.pos_enc, batch.dense_sp_matrix)          # or …dense_adj / …spd
            probs  = torch.sigmoid(logits)                     # convert logits → probabilities
            preds  = (probs > 0.5).cpu().numpy().astype(int)   # threshold at 0.5
            y_true.append(batch.y.cpu().numpy())
            y_pred.append(preds)

        y_true = np.vstack(y_true)
        y_pred = np.vstack(y_pred)
        micro_f1 = f1_score(y_true, y_pred, average="micro")
        return micro_f1

    best_val_f1 = test_f1 = 0
    times = []

    for epoch in range(1, EPOCHS):
        start = time.time()
        loss = train()
        scheduler.step()
        train_f1 = test(train_loader)
        val_f1 = test(val_loader)
        tmp_test_f1 = test(test_loader)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            test_f1 = tmp_test_f1

        times.append(time.time() - start)

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}, Train: {train_f1:.4f}, Val: {val_f1:.4f}, Test: {test_f1:.4f}')

    return {
        'train_f1': train_f1,
        'val_f1': val_f1,
        'test_f1': test_f1
    }, model.attn_weights_list  # if model tracks attention weights
    
    # Notes
    # - Dense Transformer needs to be trained for a bit longer to reach low loss value
    # - Node positional encodings are not particularly useful
    # - Edge distance encodings are very useful
    # - Since Cora is highly homophilic, it is important to bias the attention towards nearby nodes

def Train_DenseGraphTransformerModel_V2(NUM_LAYERS, NUM_HEADS, train_loader, val_loader, test_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    IN_DIM = train_loader.dataset[0].x.shape[1]
    OUT_DIM = train_loader.dataset[0].y.shape[1]

    model = DenseGraphTransformerModel_V2(
        num_layers=NUM_LAYERS,
        num_heads=NUM_HEADS,
        in_dim=IN_DIM,
        out_dim=OUT_DIM
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=.5)
    
    def train():
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch.x, batch.pos_enc, batch.dense_sp_matrix)  # batch.spd is dense_sp_matrix
            loss = loss_fn(out, batch.y.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(train_loader)

    def test(loader):
        model.eval()
        y_true, y_pred = [], []

        for batch in loader:
            batch = batch.to(device)
            logits = model(batch.x, batch.pos_enc, batch.dense_sp_matrix)          # or …dense_adj / …spd
            probs  = torch.sigmoid(logits)                     # convert logits → probabilities
            preds  = (probs > 0.5).cpu().numpy().astype(int)   # threshold at 0.5
            y_true.append(batch.y.cpu().numpy())
            y_pred.append(preds)

        y_true = np.vstack(y_true)
        y_pred = np.vstack(y_pred)
        micro_f1 = f1_score(y_true, y_pred, average="micro")
        return micro_f1

    best_val_f1 = test_f1 = 0
    times = []

    for epoch in range(1, EPOCHS):
        start = time.time()
        loss = train()
        scheduler.step()
        train_f1 = test(train_loader)
        val_f1 = test(val_loader)
        tmp_test_f1 = test(test_loader)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            test_f1 = tmp_test_f1

        times.append(time.time() - start)

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}, Train: {train_f1:.4f}, Val: {val_f1:.4f}, Test: {test_f1:.4f}')

    return {
        'train_f1': train_f1,
        'val_f1': val_f1,
        'test_f1': test_f1
    }, model.attn_weights_list  # if model tracks attention weights

# Training

## Training: 1 Layer, 1 Head

In [22]:
EPOCHS = 300
LR = 2e-3

In [17]:
# NUM_LAYERS = 1
# NUM_HEADS = 1
# NUM_RUNS = 1
# 
# run_stats = {}
# 
# for run in tqdm(range(NUM_RUNS), total=NUM_RUNS, desc='Runs'):
#     accuracy_statistics = {}
#     attn_weights = {}
# 
#     # Train all models with the same loaders
#     # accuracy_statistics['GCN'], attn_weights['GCN'] = Train_GCN(NUM_LAYERS, NUM_HEADS, train, val, test)
#     accuracy_statistics['SparseGraphTransformerModel'], attn_weights['SparseGraphTransformerModel'] = Train_SparseGraphTransformerModel(NUM_LAYERS, NUM_HEADS, train, val, test)
#     accuracy_statistics['DenseGraphTransformerModel'], attn_weights['DenseGraphTransformerModel'] = Train_DenseGraphTransformerModel(NUM_LAYERS, NUM_HEADS, train, val, test)
#     # accuracy_statistics['DenseGraphTransformerModel_V2'], attn_weights['DenseGraphTransformerModel_V2'] = Train_DenseGraphTransformerModel_V2(NUM_LAYERS, NUM_HEADS, train, val, test)
# 
#     # Convert attention weights to CPU tensors
#     for key in attn_weights.keys():
#         if attn_weights[key]:
#             attn_weights[key] = torch.stack(attn_weights[key]).cpu()
# 
#     run_stats[run] = {'f1': accuracy_statistics, 'attentions': attn_weights}
#     
# run_stats

## Training: 1 Layer, 2 Heads

In [29]:
NUM_LAYERS = 4
NUM_HEADS = 4
NUM_RUNS = 1

run_stats = {}

for run in tqdm(range(NUM_RUNS), total=NUM_RUNS, desc='Runs'):
    accuracy_statistics = {}
    attn_weights = {}

    # Train all models with the same loaders
    accuracy_statistics['GCN'], attn_weights['GCN'] = Train_GCN(NUM_LAYERS, NUM_HEADS, train, val, test)
    accuracy_statistics['SparseGraphTransformerModel'], attn_weights['SparseGraphTransformerModel'] = Train_SparseGraphTransformerModel(NUM_LAYERS, NUM_HEADS, train, val, test)
    accuracy_statistics['DenseGraphTransformerModel'], attn_weights['DenseGraphTransformerModel'] = Train_DenseGraphTransformerModel(NUM_LAYERS, NUM_HEADS, train, val, test)
    accuracy_statistics['DenseGraphTransformerModel_V2'], attn_weights['DenseGraphTransformerModel_V2'] = Train_DenseGraphTransformerModel_V2(NUM_LAYERS, NUM_HEADS, train, val, test)

    # Convert attention weights to CPU tensors
    for key in attn_weights.keys():
        if attn_weights[key]:
            attn_weights[key] = torch.stack(attn_weights[key]).cpu()

    run_stats[run] = {'f1': accuracy_statistics, 'attentions': attn_weights}
    
run_stats

Runs:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 10, Loss: 0.4609, Train: 0.5768, Val: 0.5533, Test: 0.5878
Epoch 20, Loss: 0.4218, Train: 0.6679, Val: 0.6399, Test: 0.6596
Epoch 30, Loss: 0.3941, Train: 0.7238, Val: 0.6890, Test: 0.7111
Epoch 40, Loss: 0.3799, Train: 0.7367, Val: 0.6999, Test: 0.7227
Epoch 50, Loss: 0.3714, Train: 0.7450, Val: 0.7067, Test: 0.7356
Epoch 60, Loss: 0.3649, Train: 0.7556, Val: 0.7184, Test: 0.7462
Epoch 70, Loss: 0.3582, Train: 0.7662, Val: 0.7271, Test: 0.7521
Epoch 80, Loss: 0.3546, Train: 0.7697, Val: 0.7301, Test: 0.7580
Epoch 90, Loss: 0.3528, Train: 0.7737, Val: 0.7338, Test: 0.7595
Epoch 100, Loss: 0.3503, Train: 0.7762, Val: 0.7366, Test: 0.7623
Epoch 110, Loss: 0.3491, Train: 0.7784, Val: 0.7388, Test: 0.7649
Epoch 120, Loss: 0.3466, Train: 0.7789, Val: 0.7389, Test: 0.7653
Epoch 130, Loss: 0.3458, Train: 0.7806, Val: 0.7404, Test: 0.7672
Epoch 140, Loss: 0.3455, Train: 0.7812, Val: 0.7412, Test: 0.7681
Epoch 150, Loss: 0.3443, Train: 0.7811, Val: 0.7410, Test: 0.7695
Epoch 160, Loss: 0.

Runs: 100%|██████████| 1/1 [39:45<00:00, 2385.13s/it]


{0: {'f1': {'GCN': {'train_f1': 0.7854119414722556,
    'val_f1': 0.7454335609373857,
    'test_f1': 0.771457050590865},
   'SparseGraphTransformerModel': {'train_f1': 0.8465170344978604,
    'val_f1': 0.8161225530135022,
    'test_f1': 0.8362625043336547},
   'DenseGraphTransformerModel': {'train_f1': 0.8431454813962813,
    'val_f1': 0.8066145906362654,
    'test_f1': 0.8284330456124078},
   'DenseGraphTransformerModel_V2': {'train_f1': 0.5501579516752433,
    'val_f1': 0.5271508220627498,
    'test_f1': 0.5417549572166261}},
  'attentions': {'GCN': None,
   'SparseGraphTransformerModel': tensor[4, 4, 2300, 2300] n=84640000 (0.3Gb) x∈[0., 1.000] μ=0.000 σ=0.012 grad ToCopyBackward0,
   'DenseGraphTransformerModel': tensor[4, 4, 2300, 2300] n=84640000 (0.3Gb) x∈[2.019e-32, 0.999] μ=0.000 σ=0.007 grad ToCopyBackward0,
   'DenseGraphTransformerModel_V2': tensor[4, 4, 2300, 2300] n=84640000 (0.3Gb) x∈[0., 1.000] μ=0.000 σ=0.002 grad ToCopyBackward0}}}

## Training: 2 Layers, 1 Head

## Training: 2 Layers, 2 Heads

# Analysis


## Table 2: Accuracy Statistics

In [19]:
### Table 2 ###
### Accuracy Statistics ###
pd.set_option('display.max_columns', None)

all_stats_df = {}
for data_key in all_stats:
  run_stats = all_stats[data_key]
  table1 = pd.concat({key : pd.DataFrame(run_stats[key]['accuracy']) for key in run_stats}, axis=0)
  table1_train = pd.concat({'mean': table1.mean(level=1, axis=0).loc['train_acc'], 'std':table1.std(level=1).loc['train_acc']}, axis=1)
  table1_test = pd.concat({'mean': table1.mean(level=1, axis=0).loc['test_acc'], 'std':table1.std(level=1).loc['test_acc']}, axis=1)
  # table1 = pd.concat({'Train': table1_train, 'Test': table1_test}, axis=1)
  table1 = table1_test
  all_stats_df[data_key] = table1
pd.concat(all_stats_df, axis=1).round(2)

NameError: name 'all_stats' is not defined