In [1]:
# Install PyTorch Geometric (if not installed)
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.linalg import eigh
import random

from torch_geometric.datasets import TUDataset, GNNBenchmarkDataset, ZINC
from torch_geometric.utils import to_dense_adj

In [3]:
def load_tudataset(name):
    dataset = TUDataset(root=f'data/TUDataset/{name}', name=name)
    adj_list = [to_dense_adj(g.edge_index)[0].numpy() for g in dataset] # extract list of adj matrices
    labels = [int(g.y) for g in dataset]
    return adj_list, labels

In [4]:
class DoSDataset(Dataset):
    def __init__(self, adj_list, labels, num_bins=100, sigma=0.1):
        self.labels = np.array(labels)
        self.num_bins = num_bins
        self.sigma = sigma
        self.dos_vectors = [self.compute_dos(A) for A in adj_list]
    def compute_dos(self, A):
        deg = A.sum(axis=1)
        D_inv_sqrt = np.diag(1.0 / np.sqrt(deg + 1e-10))
        L = np.eye(A.shape[0]) - (D_inv_sqrt @ A @ D_inv_sqrt)
        eigvals = eigh(L, eigvals_only=True) 
        bins = np.linspace(0, 2, self.num_bins) # the eigenvalues of the normalized Laplacian always lie in [0, 2]
        dos = np.zeros(self.num_bins)
        for ev in eigvals:
            dos += np.exp(-(bins - ev)**2 / (2 * self.sigma**2))
        dos /= dos.sum()
        return torch.tensor(dos, dtype=torch.float32)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.dos_vectors[idx], torch.tensor(self.labels[idx], dtype=torch.long)

In [5]:
class MLP(nn.Module):
    def __init__(self, in_ch, h_ch, out_ch, num_layers=2):
        super().__init__()
        layers = []
        layers.append(nn.Linear(in_ch, h_ch))
        layers.append(nn.ReLU())

        for _ in range(num_layers - 1): 
            layers.append(nn.Linear(h_ch, h_ch))
            layers.append(nn.ReLU())

        layers.append(nn.Linear(h_ch, out_ch))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

In [6]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X, y in loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(X)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * X.size(0)
    return total_loss / len(loader.dataset)

def evaluate(model, loader, device):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            out = model(X)
            pred = out.argmax(dim=1).cpu().numpy()
            preds.append(pred)
            trues.append(y.cpu().numpy())
    preds = np.concatenate(preds)
    trues = np.concatenate(trues)
    return accuracy_score(trues, preds)


In [7]:
def cross_val_run_gpu(dataset, num_classes, seeds=[1,2,3,4,5], num_folds=10, 
                      hidden_dim=64, epochs=50, batch_size=32, lr=1e-3, device='cuda', num_layers=2):
    all_seed_results = []
    X_indices = np.arange(len(dataset))
    y_labels = dataset.labels
    
    for seed in seeds:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        
        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
        fold_results = []
        
        for train_idx, test_idx in skf.split(X_indices, y_labels):
            train_subset = Subset(dataset, train_idx)
            test_subset = Subset(dataset, test_idx)
            
            train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)
            
            model = MLP(dataset.num_bins, hidden_dim, num_classes, num_layers=num_layers).to(device)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=lr)
            
            for epoch in range(epochs):
                train_one_epoch(model, train_loader, criterion, optimizer, device)
            
            acc = evaluate(model, test_loader, device)
            fold_results.append(acc)
        
        all_seed_results.append(np.mean(fold_results))
    
    mean_acc = np.mean(all_seed_results)
    std_acc = np.std(all_seed_results)
    return mean_acc, std_acc


In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

'''datasets_to_run = ['MUTAG', 'BZR', 'FRANKENSTEIN', 'NCI1', 'NCI109', 'PTC_MR', 
                   'ENZYMES', 'DD', 'PROTEINS', 'COLLAB', 'IMDB-BINARY', 'IMDB-MULTI', 
                   'TRIANGLES', 'CSL']'''
datasets_to_run = ['MUTAG', 'BZR', 'FRANKENSTEIN', 'NCI1', 'NCI109', 'PTC_MR', 
                   'ENZYMES', 'DD', 'PROTEINS', 'COLLAB']

results = {}
for name in datasets_to_run:
    print(f"\nProcessing {name}...")
    adj_list, labels = load_tudataset(name)
    dataset = DoSDataset(adj_list, labels, num_bins=100, sigma=0.01)
    num_classes = len(np.unique(labels))
    
    mean_acc, std_acc = cross_val_run_gpu(dataset, num_classes, device=device, num_layers=4)
    results[name] = (mean_acc, std_acc)
    print(f"{name}: {mean_acc:.4f} ± {std_acc:.4f}")



Processing MUTAG...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...
Done!


MUTAG: 0.8373 ± 0.0142

Processing BZR...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/BZR.zip
Processing...
Done!


BZR: 0.8282 ± 0.0122

Processing FRANKENSTEIN...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/FRANKENSTEIN.zip
Processing...
Done!


FRANKENSTEIN: 0.6729 ± 0.0061

Processing NCI1...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/NCI1.zip
Processing...
Done!


NCI1: 0.7047 ± 0.0025

Processing NCI109...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/NCI109.zip
Processing...
Done!


NCI109: 0.6988 ± 0.0028

Processing PTC_MR...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/PTC_MR.zip
Processing...
Done!


PTC_MR: 0.5876 ± 0.0123

Processing ENZYMES...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Processing...
Done!


ENZYMES: 0.2257 ± 0.0124

Processing DD...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/DD.zip
Processing...
Done!


DD: 0.6526 ± 0.0090

Processing PROTEINS...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip
Processing...
Done!


PROTEINS: 0.6996 ± 0.0082

Processing COLLAB...


Downloading https://www.chrsmrrs.com/graphkerneldatasets/COLLAB.zip
Processing...
Done!


COLLAB: 0.7392 ± 0.0035
