In [1]:
import os
import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
from tqdm import tqdm

In [3]:
# Create saving directories if they do not exist
if not os.path.exists("./training-runs"):
    os.mkdir("./training-runs")

if not os.path.exists(os.path.join("./training-runs", "gat")):
    os.mkdir(os.path.join("./training-runs", "gat"))

In [4]:
# Create experiment directory for this run
date = datetime.datetime.now().strftime('%Y-%m-%d-%H_%M_%S')
SAVE_PATH = os.path.join("./training-runs", "gat", date)
if not os.path.exists(SAVE_PATH):
    os.mkdir(SAVE_PATH)

# Load dataset

In [5]:
cora = Planetoid(root="./", name="Cora", split="public")
cora_dataset = cora[0]
cora_dataset

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [6]:
# Print size of train, validation, and test set
print(cora_dataset.train_mask.sum())
print(cora_dataset.val_mask.sum())
print(cora_dataset.test_mask.sum())

tensor(140)
tensor(500)
tensor(1000)


# Define Hyperparameters

In [7]:
args = {
    "learning_rate": 0.01,
    "num_epochs": 100,
    "hidden_size": 64,
    "experiment description": "Two-hop GAT Network, Adam optimizer."
}

# Define Network and Optimizer

In [8]:
class GAT(torch.nn.Module):
    def __init__(self, input_features, hidden_size, num_classes, training=True):
        super().__init__()
        # Model definition follows GAT architecture described in "Graph Attention Networks", Velickovic et al.:
        # https://arxiv.org/pdf/1710.10903.pdf

        self.conv1 = GATConv(in_channels=input_features, out_channels=hidden_size // 8, heads=8)
        self.conv2 = GATConv(hidden_size, num_classes, heads=1)

        self.act1 = nn.ELU()
        self.drop1 = nn.Dropout(p=0.6)
        self.drop2 = nn.Dropout(p=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.drop1(x)
        x, (edge_idx1, alphas1) = self.conv1(x, edge_index, return_attention_weights=True)
        x = self.act1(x)
        
        x = self.drop2(x)
        x, (edge_idx2, alphas2) = self.conv2(x, edge_index, return_attention_weights=True)
        return F.softmax(x, dim=1), (edge_idx1, alphas1, edge_idx2, alphas2)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAT(cora_dataset.num_node_features, hidden_size=args["hidden_size"], num_classes=cora.num_classes).to(device)

cora_dataset = cora_dataset.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args["learning_rate"], weight_decay=5e-4)
# optimizer = torch.optim.SGD(model.parameters(), lr=args["learning_rate"], momentum=0.9)

# Define Visualization Code

In [10]:
def plot_loss_curves(train_losses, val_losses):
    assert len(train_losses) == len(val_losses), "Inconsistent plotting sizes."
    
    time = list(range(args["num_epochs"]))
    visual_df = pd.DataFrame({
        "Train Loss": train_losses,
        "Validation Loss": val_losses,
        "Epoch": time
    })

    sns.lineplot(x='Epoch', y='Loss Value', hue='Dataset Split', data=pd.melt(visual_df, ['Epoch'], value_name="Loss Value", var_name="Dataset Split"))
    plt.title("Loss Curves")
    plt.savefig(os.path.join(SAVE_PATH, "loss_curves.png"), bbox_inches='tight', facecolor="white")
    plt.clf()
    plt.close()

In [11]:
def plot_accuracy_curves(train_acc, val_acc):
    assert len(train_acc) == len(val_acc), "Inconsistent plotting sizes."
    
    time = list(range(args["num_epochs"]))
    visual_df = pd.DataFrame({
        "Train Accuracy": train_acc,
        "Validation Accuracy": val_acc,
        "Epoch": time
    })

    sns.lineplot(x='Epoch', y='Accuracy', hue='Dataset Split', data=pd.melt(visual_df, ['Epoch'], value_name="Accuracy", var_name="Dataset Split"))
    plt.title("Accuracy Curves")
    plt.savefig(os.path.join(SAVE_PATH, "accuracy_curves.png"), bbox_inches='tight', facecolor="white")
    plt.clf()
    plt.close()

# Define Training and Evaluation Code

In [12]:
def train(model, cora_dataset):
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    pbar = tqdm(total=args["num_epochs"])
    pbar.set_description(f'Epoch')

    for epoch in range(args["num_epochs"]):
        model.train()
        optimizer.zero_grad()

        out, _ = model(cora_dataset)  # Pass entire graph through model
        loss = F.nll_loss(out[cora_dataset.train_mask], cora_dataset.y[cora_dataset.train_mask])
        loss.backward()
        optimizer.step()

        pred = out.argmax(dim=1)
        correct = (pred[cora_dataset.train_mask] == cora_dataset.y[cora_dataset.train_mask]).sum()
        train_acc = int(correct) / int(cora_dataset.train_mask.sum())

        train_losses.append(loss.item())
        train_accuracies.append(train_acc)

        # Validate once per epoch
        val_loss, val_acc = validation(model, cora_dataset)
        val_losses.append(val_loss.item())
        val_accuracies.append(val_acc)

        pbar.update(epoch)
    
    pbar.close()
    
    # Assert that sizes are all the same
    assert len(train_losses) == len(val_losses) == len(train_accuracies) == len(val_accuracies), "Metric list sizes are inconsistent."
    plot_loss_curves(train_losses, val_losses)
    plot_accuracy_curves(train_accuracies, val_accuracies)


In [13]:
def validation(model, cora_dataset):
    model.eval()
    out, _ = model(cora_dataset)
    pred = out.argmax(dim=1)
    
    val_loss = F.nll_loss(out[cora_dataset.val_mask], cora_dataset.y[cora_dataset.val_mask])
    correct = (pred[cora_dataset.val_mask] == cora_dataset.y[cora_dataset.val_mask]).sum()
    val_acc = int(correct) / int(cora_dataset.val_mask.sum())
    
    return val_loss, val_acc


In [14]:
def test(model, cora_dataset):
    model.eval()
    out, edge_attn_info = model(cora_dataset)
    pred = out.argmax(dim=1)
    correct = (pred[cora_dataset.test_mask] == cora_dataset.y[cora_dataset.test_mask]).sum()
    accuracy = int(correct) / int(cora_dataset.test_mask.sum())
    print(f'Test Set Accuracy: {accuracy:.4f}')
    return accuracy, edge_attn_info

# Driver Code

In [15]:
train(model, cora_dataset)

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 0 starting...


Epoch: : 210it [00:02, 166.61it/s]                     

Epoch 20 starting...


Epoch: : 820it [00:03, 376.57it/s]

Epoch 40 starting...


Epoch: : 1830it [00:05, 566.04it/s]

Epoch 60 starting...


Epoch: : 3240it [00:07, 830.05it/s]

Epoch 80 starting...


Epoch: : 4950it [00:09, 527.58it/s] 


In [16]:
test_acc, edge_attn_info = test(model, cora_dataset)

# Save test accuracy so that we log it somewhere. Train and val accuracy are kept in the accuracy curves
args["test accuracy"] = test_acc

Test Set Accuracy: 0.8080


With Graph Attention Networks, we can extract attention scores computed between nodes in the graph. These scores are an interpretable similarity measure computed by the attention mechanism defined in the "Graph Attention Networks" publication.

These attention scores can serve as edge weights if we so desire, or we could train auxillary classifiers to classify the scores into link predictions. Any future analysis of link-level tasks in the graph-structured data could be done through these attention scores.

In [17]:
(edge_idx1, alphas1, edge_idx2, alphas2) = edge_attn_info
print(edge_idx1.shape, alphas1.shape, edge_idx2.shape, alphas2.shape)

torch.Size([2, 13264]) torch.Size([13264, 8]) torch.Size([2, 13264]) torch.Size([13264, 1])


In [18]:
print("GAT Layer 1 Attention Scores:")
for i in range(15):
    print("Average GAT attention score from node {} to node {} is {:.4f}".format(edge_idx1[0,i], edge_idx1[1,i], alphas1[i,:].mean()))

GAT Layer 1 Attention Scores:
Average GAT attention score from node 0 to node 633 is 0.2268
Average GAT attention score from node 0 to node 1862 is 0.1792
Average GAT attention score from node 0 to node 2582 is 0.2378
Average GAT attention score from node 1 to node 2 is 0.2019
Average GAT attention score from node 1 to node 652 is 0.3429
Average GAT attention score from node 1 to node 654 is 0.4538
Average GAT attention score from node 2 to node 1 is 0.2340
Average GAT attention score from node 2 to node 332 is 0.1857
Average GAT attention score from node 2 to node 1454 is 0.4882
Average GAT attention score from node 2 to node 1666 is 0.1591
Average GAT attention score from node 2 to node 1986 is 0.0195
Average GAT attention score from node 3 to node 2544 is 0.5116
Average GAT attention score from node 4 to node 1016 is 0.1894
Average GAT attention score from node 4 to node 1256 is 0.1314
Average GAT attention score from node 4 to node 1761 is 0.1523


In [19]:
print("GAT Layer 2 Attention Scores:")
for i in range(15):
    print("Average GAT attention score from node {} to node {} is {:.4f}".format(edge_idx2[0,i], edge_idx2[1,i], alphas2[i,:].mean()))

GAT Layer 2 Attention Scores:
Average GAT attention score from node 0 to node 633 is 0.2662
Average GAT attention score from node 0 to node 1862 is 0.2137
Average GAT attention score from node 0 to node 2582 is 0.2581
Average GAT attention score from node 1 to node 2 is 0.1513
Average GAT attention score from node 1 to node 652 is 0.3237
Average GAT attention score from node 1 to node 654 is 0.5085
Average GAT attention score from node 2 to node 1 is 0.2778
Average GAT attention score from node 2 to node 332 is 0.1754
Average GAT attention score from node 2 to node 1454 is 0.5079
Average GAT attention score from node 2 to node 1666 is 0.1422
Average GAT attention score from node 2 to node 1986 is 0.0142
Average GAT attention score from node 3 to node 2544 is 0.4969
Average GAT attention score from node 4 to node 1016 is 0.1699
Average GAT attention score from node 4 to node 1256 is 0.1081
Average GAT attention score from node 4 to node 1761 is 0.1235


In [20]:
alphas1.shape

torch.Size([13264, 8])

Small Exercise: Find largest attention score, see if two nodes have the same class label

In [21]:
alphas1_average = alphas1.mean(dim=1)
alphas1_average.shape

torch.Size([13264])

In [22]:
torch.argmax(alphas1_average)

tensor(13061)

In [23]:
print("Average GAT attention score from node {} to node {} is {:.4f}".format(edge_idx1[0,1840], edge_idx1[1,1840], alphas1[1840,:].mean()))

Average GAT attention score from node 447 to node 2638 is 0.5437


In [24]:
print(cora_dataset.y[447])
print(cora_dataset.y[2638])

tensor(6)
tensor(6)


The high similarity score indeed indicated that the two nodes had the same class label. This means that when the GAT layer was computing attention between the node embeddings of nodes 447 and 2638, which were one-hot encoded words from scientific papers, the attention mechanism computed higher similarity between the word dictionary embeddings because the two papers had the same subject.

In [25]:
# Save training configuration and experiment description
with open(os.path.join(SAVE_PATH, 'config.json'), 'w', encoding='utf-8') as f:
    json.dump(args, f, ensure_ascii=False, indent=4)

# Print model definition
open(os.path.join(SAVE_PATH, "model_definition.txt"), 'a').close()
print(model)

GAT(
  (conv1): GATConv(1433, 8, heads=8)
  (conv2): GATConv(64, 7, heads=1)
  (act1): ELU(alpha=1.0)
  (drop1): Dropout(p=0.6, inplace=False)
  (drop2): Dropout(p=0.6, inplace=False)
)


Best performance: run6, 80.7% accuracy