This is the main notebook I use for testing code and creating graphs

In [385]:
# check pytorch version
import torch
print(torch.__version__)

2.4.1+cu121


In [386]:
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid, TUDataset
from torch_geometric.nn import GCN
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.metrics import accuracy_score
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.utils import to_networkx
from torch_geometric.nn import GINConv, global_add_pool, Sequential
from torch_geometric.nn import GATConv
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import OneHotDegree
from torch.optim import Adam

In [387]:
#Import the datasets
cora = Planetoid(root='/tmp/Cora', name='Cora')
imdb = TUDataset(root=f'/tmp/IMDB-BINARY', name='IMDB-BINARY')
enzymes = TUDataset(root=f'/tmp/ENZYMES', name='ENZYMES')

In [388]:
#Define a dict that we will use to store performance for each of our models on each dataset
scores = {"Cora": {}, "Imdb": {}, "Enzymes": {}}

# GCN Performance

#### Node classification

In [389]:
#Define a basic GCN for node classification
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(cora.num_node_features, 16)
        self.conv2 = GCNConv(16, cora.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [390]:
# Get the model, data, and optimizer setup in torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = cora[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [391]:
# train model for 150 epochs

model.train()
for epoch in range(151):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')

Epoch 000, Loss: 1.9532
Epoch 020, Loss: 0.1038
Epoch 040, Loss: 0.0140
Epoch 060, Loss: 0.0141
Epoch 080, Loss: 0.0167
Epoch 100, Loss: 0.0156
Epoch 120, Loss: 0.0139
Epoch 140, Loss: 0.0127


In [392]:
#evaluate the model
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
scores["Cora"]["GCN"] = acc
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8080


#### Graph Classification

In [393]:
#Define a basic GCN for graph classification
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.fc = torch.nn.Linear(16, num_classes)

    def forward(self, x, edge_index, batch):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

In [394]:
# Helper fuction to make node features for the IMDB dataset
# We just represent each node with its degree
def create_degree_features(data):
    num_nodes = data.num_nodes
    degrees = torch.bincount(data.edge_index[0], minlength=num_nodes)
    
    feature_vector = degrees.view(-1, 1).float()
    
    return feature_vector

In [395]:
# Helper function to load data, train the model, and evaluate it for graph classification
def train_and_evaluate(dataset, epochs = 150):

    # Loading Data
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GCN(dataset.num_node_features or 1, dataset.num_classes).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()

            # if no node features we add in the degrees
            if data.x is None:
                data.x = create_degree_features(data).to(device)

            out = model(data.x, data.edge_index, data.batch)
            loss = F.nll_loss(out, data.y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        if epoch % 20 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    for data in test_loader:
        data = data.to(device)

        # if no node features we add in the degrees
        if data.x is None:
            data.x = create_degree_features(data).to(device)

        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
        total += data.num_graphs

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')
    
    return accuracy

In [396]:
# Test model on Imdb dataset

In [397]:
scores["Imdb"]["GCN"] = train_and_evaluate(imdb)

Epoch 0, Loss: 0.6858
Epoch 20, Loss: 0.6924
Epoch 40, Loss: 0.6930
Epoch 60, Loss: 0.6929
Epoch 80, Loss: 0.7000
Epoch 100, Loss: 0.6933
Epoch 120, Loss: 0.6932
Epoch 140, Loss: 0.6933
Accuracy: 0.4800


In [398]:
# Test model on Enzymes dataset

In [399]:
scores["Enzymes"]["GCN"] = train_and_evaluate(enzymes)

Epoch 0, Loss: 1.7994
Epoch 20, Loss: 1.8324
Epoch 40, Loss: 1.6149
Epoch 60, Loss: 1.8076
Epoch 80, Loss: 1.7449
Epoch 100, Loss: 1.6610
Epoch 120, Loss: 1.7488
Epoch 140, Loss: 1.5474
Accuracy: 0.2917


In [400]:
# Check the scores so far

In [401]:
scores

{'Cora': {'GCN': 0.808},
 'Imdb': {'GCN': 0.48},
 'Enzymes': {'GCN': 0.2916666666666667}}

# GIN Performance

#### Node classification

In [402]:
# Helper function to mind max degree in dataset
def find_max_degree(dataset):
    max_deg = -1
    for data in dataset:
        max_deg = max(max_deg, data.edge_index[0].bincount().max().item())
    return max_deg

In [403]:
# Get max of this dataset
cora_max = find_max_degree(cora)

In [404]:
# make a transformed dataset with the one hot encoded degree features
transformed_cora = Planetoid(root='/tmp/Cora', name='Cora', transform=OneHotDegree(cora_max))

In [405]:
#Define a GIN for node classification
class GIN(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.mlp1 = torch.nn.Sequential(
            torch.nn.Linear(transformed_cora.num_node_features, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )
        self.mlp2 = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, transformed_cora.num_classes)
        )
        
        self.conv1 = GINConv(self.mlp1, eps=0.0, train_eps=True)
        self.conv2 = GINConv(self.mlp2, eps=0.0, train_eps=True)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [406]:
# Get the model, data, and optimizer setup in torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GIN().to(device)
data = transformed_cora[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [407]:
# train model for 150 epochs

model.train()
for epoch in range(151):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')

Epoch 000, Loss: 1.9446
Epoch 020, Loss: 0.0206
Epoch 040, Loss: 0.0002
Epoch 060, Loss: 0.0001
Epoch 080, Loss: 0.0002
Epoch 100, Loss: 0.0003
Epoch 120, Loss: 0.0004
Epoch 140, Loss: 0.0005


In [408]:
#evaluate the model
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
scores["Cora"]["GIN"] = acc
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7390


#### Graph Classification

In [409]:
#Define a GIN for node classification
class GIN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super().__init__()

        self.mlp1 = torch.nn.Sequential(
            torch.nn.Linear(num_node_features, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )
        self.mlp2 = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )

        self.conv1 = GINConv(self.mlp1, eps=0.0, train_eps=True)
        self.conv2 = GINConv(self.mlp2, eps=0.0, train_eps=True)

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, num_classes)
        )

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = global_add_pool(x, batch)
        x = self.fc(x)

        return F.log_softmax(x, dim=1)

In [410]:
# get max degrees for our datasets
imdb_max = find_max_degree(imdb)
enzymes_max = find_max_degree(enzymes)

In [411]:
# make a transformed datasets with the one hot encoded degree features
transformed_imdb = TUDataset(root='/tmp/IMDB', name='IMDB-BINARY', transform=OneHotDegree(imdb_max))
transformed_enzymes = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', transform=OneHotDegree(enzymes_max))

In [412]:
# Helper function to load data, train the model, and evaluate it for graph classification
def train_and_evaluate(dataset, epochs = 150):

    # Loading Data
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GIN(dataset.num_node_features or 1, dataset.num_classes).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()

            out = model(data.x, data.edge_index, data.batch)
            loss = F.nll_loss(out, data.y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        if epoch % 20 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    for data in test_loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
        total += data.num_graphs

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')
    
    return accuracy

In [413]:
train_and_evaluate(transformed_imdb)

Epoch 0, Loss: 0.5068
Epoch 20, Loss: 0.3577
Epoch 40, Loss: 0.3429
Epoch 60, Loss: 0.4972
Epoch 80, Loss: 0.4678
Epoch 100, Loss: 0.4371
Epoch 120, Loss: 0.1724
Epoch 140, Loss: 0.4931
Accuracy: 0.7100


0.71

In [414]:
# Test model on Imdb dataset
scores["Imdb"]["GIN"] = train_and_evaluate(transformed_imdb)

Epoch 0, Loss: 0.5782
Epoch 20, Loss: 0.3879
Epoch 40, Loss: 0.3268
Epoch 60, Loss: 0.3407
Epoch 80, Loss: 0.2464
Epoch 100, Loss: 0.2655
Epoch 120, Loss: 0.3316
Epoch 140, Loss: 0.2222
Accuracy: 0.7500


In [415]:
# Test model on Enzymes dataset
scores["Enzymes"]["GIN"] = train_and_evaluate(transformed_enzymes)

Epoch 0, Loss: 1.8111
Epoch 20, Loss: 1.6782
Epoch 40, Loss: 1.5854
Epoch 60, Loss: 1.3291
Epoch 80, Loss: 1.1300
Epoch 100, Loss: 1.2712
Epoch 120, Loss: 1.0763
Epoch 140, Loss: 0.9789
Accuracy: 0.4083


In [416]:
scores

{'Cora': {'GCN': 0.808, 'GIN': 0.739},
 'Imdb': {'GCN': 0.48, 'GIN': 0.75},
 'Enzymes': {'GCN': 0.2916666666666667, 'GIN': 0.4083333333333333}}

# GAT Performance

#### Node classification GCN baseline

In [417]:
# We define a new GCN that can handle a dynamic number of layers
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
        self.convs.append(GCNConv(hidden_channels, out_channels))

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.relu(x)
        x = self.convs[-1](x, edge_index)
        
        return F.log_softmax(x, dim=1)

In [418]:
#we turn everything into neat helper functions since we are going to be training a lot of models here

def train_model(model, data, optimizer, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
    
    return model

def evaluate_model(model, data):
    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
        train_correct = (pred[data.train_mask] == data.y[data.train_mask]).sum()
        val_correct = (pred[data.val_mask] == data.y[data.val_mask]).sum()
        test_correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
        
        train_acc = float(train_correct) / int(data.train_mask.sum())
        val_acc = float(val_correct) / int(data.val_mask.sum())
        test_acc = float(test_correct) / int(data.test_mask.sum())
        
    return train_acc, val_acc, test_acc

In [419]:
# Test different numbers of layers
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = cora[0].to(device)

results = {}
for num_layers in range(1, 16):
    print(f"Training model with {num_layers} layers")
    
    model = GCN(
        in_channels=cora.num_node_features,
        hidden_channels=16,
        out_channels=cora.num_classes,
        num_layers=num_layers
    ).to(device)
    
    optimizer = Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    
    model = train_model(model, data, optimizer)
    
    train_acc, val_acc, test_acc = evaluate_model(model, data)
    
    results[num_layers] = {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'test_acc': test_acc
    }

print("\n" + "="*50)
print("Summary of Results")
print("="*50)
print("\nNumber of Layers | Train Acc | Val Acc | Test Acc")
print("-"*50)
for layers in range(1, 16):
    r = results[layers]
    print(f"{layers:14d} | {r['train_acc']:.4f} | {r['val_acc']:.4f} | {r['test_acc']:.4f}")

Training model with 1 layers
Training model with 2 layers
Training model with 3 layers
Training model with 4 layers
Training model with 5 layers
Training model with 6 layers
Training model with 7 layers
Training model with 8 layers
Training model with 9 layers
Training model with 10 layers
Training model with 11 layers
Training model with 12 layers
Training model with 13 layers
Training model with 14 layers
Training model with 15 layers

Summary of Results

Number of Layers | Train Acc | Val Acc | Test Acc
--------------------------------------------------
             1 | 1.0000 | 0.7760 | 0.8090
             2 | 1.0000 | 0.7840 | 0.7960
             3 | 1.0000 | 0.7860 | 0.8160
             4 | 1.0000 | 0.7580 | 0.7680
             5 | 1.0000 | 0.7460 | 0.7450
             6 | 1.0000 | 0.7240 | 0.7360
             7 | 1.0000 | 0.7240 | 0.7240
             8 | 1.0000 | 0.6880 | 0.7070
             9 | 1.0000 | 0.6880 | 0.6780
            10 | 0.8429 | 0.5120 | 0.5570
            11 | 

#### Node classification GAT model

In [420]:
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, heads=8):
        super().__init__()
        
        self.convs = torch.nn.ModuleList()
        
        self.convs.append(GATConv(
            in_channels, 
            hidden_channels, 
            heads=heads, 
            concat=True, 
        ))
        
        hidden_in_channels = hidden_channels * heads
        for _ in range(num_layers - 2):
            self.convs.append(GATConv(
                hidden_in_channels,
                hidden_channels,
                heads=heads,
                concat=True,
            ))
            
        self.convs.append(GATConv(
            hidden_in_channels,
            out_channels,
            heads=heads,
            concat=False,
        ))

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs[:-1]:
            x = F.elu(conv(x, edge_index))
        x = self.convs[-1](x, edge_index)
        
        return F.log_softmax(x, dim=1)


In [421]:
# Test different numbers of layers
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = cora[0].to(device)

results = {}
for num_layers in range(1, 16):
    print(f"Training model with {num_layers} layers")
    
    model = GAT(
        in_channels=cora.num_node_features,
        hidden_channels=16,
        out_channels=cora.num_classes,
        num_layers=num_layers
    ).to(device)
    
    optimizer = Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    
    model = train_model(model, data, optimizer)
    
    train_acc, val_acc, test_acc = evaluate_model(model, data)
    
    results[num_layers] = {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'test_acc': test_acc
    }

print("\n" + "="*50)
print("Summary of Results")
print("="*50)
print("\nNumber of Layers | Train Acc | Val Acc | Test Acc")
print("-"*50)
for layers in range(1, 16):
    r = results[layers]
    print(f"{layers:14d} | {r['train_acc']:.4f} | {r['val_acc']:.4f} | {r['test_acc']:.4f}")

Training model with 1 layers
Training model with 2 layers
Training model with 3 layers
Training model with 4 layers
Training model with 5 layers
Training model with 6 layers
Training model with 7 layers
Training model with 8 layers
Training model with 9 layers
Training model with 10 layers
Training model with 11 layers
Training model with 12 layers
Training model with 13 layers
Training model with 14 layers
Training model with 15 layers

Summary of Results

Number of Layers | Train Acc | Val Acc | Test Acc
--------------------------------------------------
             1 | 1.0000 | 0.7460 | 0.7840
             2 | 1.0000 | 0.7440 | 0.7920
             3 | 1.0000 | 0.7600 | 0.7980
             4 | 1.0000 | 0.7700 | 0.7880
             5 | 1.0000 | 0.7700 | 0.7640
             6 | 1.0000 | 0.7640 | 0.7710
             7 | 1.0000 | 0.7520 | 0.7530
             8 | 1.0000 | 0.7700 | 0.7700
             9 | 1.0000 | 0.7540 | 0.7700
            10 | 1.0000 | 0.7580 | 0.7560
            11 | 

In [424]:
scores

{'Cora': {'GCN': 0.808, 'GIN': 0.739},
 'Imdb': {'GCN': 0.48, 'GIN': 0.75},
 'Enzymes': {'GCN': 0.2916666666666667, 'GIN': 0.4083333333333333}}