# Graph Convolutional Network (GCN)

We demonstrate the use of a GCN for node attribute inference on the CORA paper citation dataset.

**References**

[Semi-Supervised Classification with Graph Convolutional Networks](https://www.thejournal.club/c/paper/101516/), T. N. Kipf and M. Welling, ICLR 2017


Copyright 2010-2021 Commonwealth Scientific and Industrial Research Organisation (CSIRO).

All Rights Reserved.

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from sklearn import preprocessing, feature_extraction, model_selection

import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F

### Loading the CORA network

This time we are going to load the dataset from `dgl.data`.

In [None]:
# Create the CORA dataset object provided by dgl.data.
cora_dataset = dgl.data.CoraGraphDataset()

The first time you run the above command, DGL will download the dataset and store it locally.

The dataset comes with train, validation, and test splits.

In [None]:
# Inspect the Python type of the dataset object.
type(cora_dataset)

In [None]:
# The dataset consists of a single graph, so its length is expected to be 1.
len(cora_dataset)

In [None]:
# Indexing the dataset returns the graph stored at that position; display it.
cora_dataset[0]

In [None]:
# Keep a handle to the first (index 0) graph for the rest of the notebook.
cora_graph = cora_dataset[0]

We can access node and edge data via the **`ndata`** and **`edata`** member variables.

In [None]:
# Show the training mask stored in the node data, together with its length.
cora_graph.ndata['train_mask'], len(cora_graph.ndata['train_mask'])

In [None]:
# Node labels: each node carries one of 7 class labels, encoded as integers in [0, 6].
# np.unique lists the distinct label values that actually occur.
cora_graph.ndata['label'], np.unique(cora_graph.ndata['label'])

In [None]:
# Node features, stored in the node data under the 'feat' key.
cora_graph.ndata['feat']

In [None]:
# As before, we have 2708 nodes with a 1433-dimensional feature vector for each,
# so the expected shape is (2708, 1433).
cora_graph.ndata['feat'].shape

In [None]:
# Display the graph object.
cora_graph

In [None]:
# In-degrees of the graph's nodes (in_degrees() is called without a node selection).
node_degrees = cora_graph.in_degrees()

In [None]:
# Show the in-degree values.
node_degrees

In [None]:
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt

In [None]:
# Default size (width, height) for all figures created from here on.
rcParams["figure.figsize"] = (7, 5)

In [None]:
# Histogram of the node in-degrees. The trailing semicolons suppress the
# notebook's echo of the returned plot objects.
sns.histplot(node_degrees);
plt.xlabel("Node degree");

In [None]:
# Degrees of the most highly connected nodes (in-degree above 70).
node_degrees[node_degrees > 70]

### Splitting the data

The data is already split into train, validation and test sets. Let's check their sizes. The splits are from the paper [Revisiting Semi-Supervised Learning with Graph Embeddings](https://www.thejournal.club/c/paper/90881/) by Z. Yang et al., ICML 2016.

In [None]:
# Each split is a boolean node mask, so summing it counts the selected nodes.
n_train = cora_graph.ndata['train_mask'].sum().item()
n_val = cora_graph.ndata['val_mask'].sum().item()
n_test = cora_graph.ndata['test_mask'].sum().item()

print(f"Number of train examples     : {n_train}")
print(f"Number of validation examples: {n_val}")
print(f"Number of test examples      : {n_test}")

### Define the GCN model

In [None]:
# A graph convolutional layer as defined by Kipf and Welling.
from dgl.nn import GraphConv 

In [None]:
class GCN(nn.Module):
    """Stack of graph convolutional layers for node classification.

    The network consists of ``len(h_feats)`` hidden ``GraphConv`` layers
    followed by one ``GraphConv`` output layer that produces ``num_classes``
    values (logits) per node.
    """

    def __init__(self, in_feats: int, h_feats: list[int], num_classes: int, dropout: float=0):
        """

        :param in_feats: <int> Dimensionality of node input features
        :param h_feats: <list> Dimensionality of hidden layers. An empty list
            yields a single layer mapping the inputs directly to the classes.
        :param num_classes: <int> Number of output classes
        :param dropout: <float> The amount of dropout for all but the last
            layer. It should be a value in [0.0, 1.0]
        """
        super().__init__()

        self.dropout = dropout
        self.conv_layers = nn.ModuleList()

        # Consecutive entries of this list are the (input, output) widths of
        # each layer: in_feats -> h_feats[0] -> ... -> h_feats[-1] -> num_classes.
        layer_dims = [in_feats, *h_feats, num_classes]
        for dim_in, dim_out in zip(layer_dims[:-1], layer_dims[1:]):
            self.conv_layers.append(GraphConv(dim_in, dim_out))

    def forward(self, g, in_feat):
        """Compute the per-node outputs of the last layer.

        :param g: The graph, passed unchanged to every convolutional layer.
        :param in_feat: Node input features.
        :return: Output of the last layer (no activation or dropout applied).
        """
        h = in_feat
        last = len(self.conv_layers) - 1

        for i, layer in enumerate(self.conv_layers):
            h = layer(g, h)
            if i < last:
                # F.dropout defaults to training=True; forwarding the module's
                # mode makes dropout a no-op after model.eval().
                h = F.dropout(F.relu(h), p=self.dropout, training=self.training)

        return h


#### Instantiate a GCN model with 1 hidden graph convolutional layer and 1 output layer, which is also graph convolutional. The node embeddings produced by the hidden layer will be 16-dimensional, as this is the number of neurons in the hidden layer.

In [None]:
# Input width comes from the feature matrix; one hidden layer of 16 units;
# the output width is the number of classes reported by the dataset.
gcn_model = GCN(in_feats=cora_graph.ndata['feat'].shape[1], h_feats=[16], num_classes=cora_dataset.num_classes)

In [None]:
# Let's have a look at the model layers (displays the module's repr).
gcn_model

In [None]:
def train(g, model, epochs=200, lr=0.01, weight_decay=0.0005, verbose=True):
    """Train ``model`` for node classification on ``g`` with the Adam optimizer.

    The graph must provide ``'feat'``, ``'label'``, ``'train_mask'``,
    ``'val_mask'`` and ``'test_mask'`` entries in ``g.ndata``. The model is
    called as ``model(g, features)`` and is left in eval mode on return
    (when at least one epoch is run).

    :param g: Graph holding features, labels and split masks in ``ndata``.
    :param model: Module mapping ``(g, features)`` to per-node logits.
    :param epochs: Number of full-batch training epochs.
    :param lr: Learning rate for Adam.
    :param weight_decay: Weight decay passed to Adam.
    :param verbose: If True, print a progress line every 10 epochs.
    :return: Tuple ``(losses, accs, best_test_acc)``. ``losses`` and ``accs``
        are dicts keyed by ``"train"``, ``"val"`` and ``"test"`` with one
        entry per epoch (floats for losses, 0-dim tensors for accuracies).
        ``best_test_acc`` is a 0-dim tensor holding the test accuracy at the
        epoch with the best validation accuracy (0.0 if it never improved).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    splits = ("train", "val", "test")
    losses = {split: [] for split in splits}
    accs = {split: [] for split in splits}

    features = g.ndata['feat']
    labels = g.ndata['label']
    masks = {split: g.ndata[f'{split}_mask'] for split in splits}

    # Tensors rather than Python ints, so that callers can always call
    # .item() on the result, even if the validation accuracy never improves.
    best_val_acc = torch.zeros((), device=labels.device)
    best_test_acc = torch.zeros((), device=labels.device)

    for e in range(epochs):
        model.train()
        # Forward
        logits = model(g, features)

        # Compute loss
        # Note that we only need the loss over the nodes in the training set for
        # updating the model parameters but we compute it for the validation and test nodes
        # for reporting.
        loss = torch.nn.functional.cross_entropy(logits[masks["train"]], labels[masks["train"]])
        losses["train"].append(loss.item())
        with torch.no_grad():
            # Reporting only: no need to record these for back-propagation.
            for split in ("val", "test"):
                split_loss = torch.nn.functional.cross_entropy(logits[masks[split]], labels[masks[split]])
                losses[split].append(split_loss.item())

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute accuracy on training/validation/test with the updated
        # parameters, in eval mode and without building an autograd graph.
        model.eval()
        with torch.no_grad():
            pred = model(g, features).argmax(1)

        epoch_accs = {
            split: (pred[masks[split]] == labels[masks[split]]).float().mean()
            for split in splits
        }
        for split in splits:
            accs[split].append(epoch_accs[split])
        val_acc = epoch_accs["val"]
        test_acc = epoch_accs["test"]

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        if verbose and e % 10 == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss.item(), val_acc, best_val_acc, test_acc, best_test_acc))

    return losses, accs, best_test_acc

In [None]:
# Train the GCN defined above for 500 epochs, keeping the per-epoch loss and
# accuracy histories for plotting.
gcn_losses, gcn_accs, best_test_acc = train(cora_graph, gcn_model, epochs=500, lr=0.01, weight_decay=0.0007)

In [None]:
# Overlay the per-epoch loss curves of the three data splits.
for split in ("train", "val", "test"):
    plt.plot(gcn_losses[split], label=split)
plt.title("GCN Loss")
plt.legend();

In [None]:
# Overlay the per-epoch accuracy curves of the three data splits.
for split in ("train", "val", "test"):
    plt.plot(gcn_accs[split], label=split)
plt.title("GCN Accuracy")
plt.legend();

### Accuracy as a function of number of GCN layers

In [None]:
# Build one untrained GCN per depth. The model created for a given num_layers
# has num_layers hidden layers of 16 units each, plus the output layer that
# the GCN class always appends.
max_layers = 12
gcn_models = []
for num_layers in range(1, max_layers+1):
    gcn_models.append(GCN(in_feats=cora_graph.ndata['feat'].shape[1], h_feats=[16]*num_layers, num_classes=cora_dataset.num_classes))

In [None]:
# Inspect the third model in the list (index 2), built with h_feats=[16]*3.
gcn_models[2]

In [None]:
# Train every model in gcn_models and record its best test accuracy in percent.
test_accs = []
for model in gcn_models:
    # Only the best test accuracy is needed here. The per-epoch histories are
    # discarded instead of being bound to gcn_losses / gcn_accs / best_test_acc,
    # which would overwrite the results of the single-hidden-layer run.
    _, _, model_best_test_acc = train(cora_graph, model, epochs=500, lr=0.01, weight_decay=0.0005, verbose=False)
    # float() accepts both a 0-dim tensor and a plain number.
    test_accs.append(float(model_best_test_acc)*100)

In [None]:
# NOTE: test_accs is already expressed in percent (the values are multiplied by
# 100 when they are collected), so no further rescaling is needed here.


In [None]:
# The k-th entry of test_accs (k = 0, 1, ...) belongs to a model with k+1
# hidden layers plus one output layer, i.e. k+2 graph convolutional layers in
# total. Plot every entry, including the deepest model's, against that count.
layer_counts = range(2, len(test_accs) + 2)
plt.plot(layer_counts, test_accs)
plt.xlabel("Number of Layers")
plt.ylabel("Accuracy (%)")
plt.title("Test Accuracy");

### Visualise the embeddings

We are going to visualise the node representations as output by both the first and second graph convolutional layers.

In [None]:
# Show the layers of gcn_model again before extracting embeddings from it.
gcn_model

In [None]:
# First graph convolutional layer of gcn_model.
graph_conv_layer = gcn_model.conv_layers[0]

In [None]:
# Layer-1 embeddings: the first layer is called directly, so the ReLU and
# dropout that GCN.forward applies after it are NOT included here.
node_embeddings_1 = graph_conv_layer(cora_graph, cora_graph.ndata["feat"]).detach().numpy()
# Layer-2 embeddings: the output of the full model.
node_embeddings_2 = gcn_model(cora_graph, cora_graph.ndata["feat"]).detach().numpy()
node_embeddings_1.shape, node_embeddings_2.shape

We see that, for each of the 2708 nodes in the graph, there is one 16D vector output from the first convolutional layer and one 7D vector output from the second convolutional layer.

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Class used to project the embeddings to 2D for plotting.
# NOTE(review): PCA is imported as well and could presumably be substituted
# here, since only n_components and fit_transform are used.
transform = TSNE

# A fresh transformer is fitted for each set of embeddings.
trans = transform(n_components=2)
node_embeddings_transformed_1 = trans.fit_transform(node_embeddings_1)

trans = transform(n_components=2)
node_embeddings_transformed_2 = trans.fit_transform(node_embeddings_2)

In [None]:
# Shapes of the two projections (n_components=2 was requested for both).
node_embeddings_transformed_1.shape, node_embeddings_transformed_2.shape

In [None]:
# Helper method for plotting node embeddings
def plot_embeddings(node_embeddings, ax, node_labels, title, x_label="$X_1$", y_label="$X_2$", alpha=0.7, figsize=(7,7)):
    """Scatter-plot 2D node embeddings on ``ax``, coloured by node label.

    :param node_embeddings: 2D array-like; column 0 is plotted on the x-axis
        and column 1 on the y-axis.
    :param ax: Axes-like object to draw on.
    :param node_labels: Per-node values used to colour the points.
    :param title: Title of the plot.
    :param x_label: Label of the x-axis.
    :param y_label: Label of the y-axis.
    :param alpha: Opacity of the points.
    :param figsize: Unused; accepted only for backward compatibility. The
        figure size is set by whoever creates ``ax``.
    """
    ax.scatter(node_embeddings[:, 0],
               node_embeddings[:, 1],
               c=node_labels,
               cmap="jet", alpha=alpha)
    ax.set(aspect="equal", xlabel=x_label, ylabel=y_label)
    ax.set_title(title)

In [None]:
# Side-by-side scatter plots of the projected embeddings of both layers.
fig, (ax_1, ax_2) = plt.subplots(1, 2, figsize=(15,15))
panels = (
    (ax_1, node_embeddings_transformed_1, 'Layer 1 embeddings'),
    (ax_2, node_embeddings_transformed_2, 'Layer 2 embeddings'),
)
for axis, embedding, panel_title in panels:
    plot_embeddings(embedding,
                    axis,
                    cora_graph.ndata['label'],
                    title=panel_title,
                    x_label="$X_1$",
                    y_label="$X_2$",
                    alpha=0.7,
                    figsize=(8,8))

plt.show()

## Exercises

### 1. Can you improve classification performance?

Consider using more layers, wider layers, or both.

In [None]:
# A much deeper GCN with the same layer width as before.
deep_gcn_model = GCN(in_feats=cora_graph.ndata['feat'].shape[1], 
                     h_feats=[16]*20,    # 20 hidden graph convolutional layers; GCN adds the output layer on top
                     num_classes=cora_dataset.num_classes)

In [None]:
# Train the deep model for 500 epochs and keep its per-epoch histories.
deep_gcn_losses, deep_gcn_accs, deep_gcn_best_test_acc = train(cora_graph, deep_gcn_model, epochs=500, lr=0.01, weight_decay=0.0005)

In [None]:
# Overlay the deep model's per-epoch loss curves for the three data splits.
for split in ("train", "val", "test"):
    plt.plot(deep_gcn_losses[split], label=split)
plt.title("Deep GCN Loss")
plt.legend();

In [None]:
# Overlay the deep model's per-epoch accuracy curves for the three data splits.
for split in ("train", "val", "test"):
    plt.plot(deep_gcn_accs[split], label=split)
# Titled "Deep GCN" to match the loss plot of the same model and to
# distinguish it from the shallow model's "GCN Accuracy" plot.
plt.title("Deep GCN Accuracy")
plt.legend();

In [None]:
# Output of the full deep model for every node, converted to a NumPy array.
deep_node_embeddings = deep_gcn_model(cora_graph, cora_graph.ndata["feat"]).detach().numpy()
deep_node_embeddings.shape

In [None]:
# Project the deep model's outputs to 2D with a freshly fitted transformer.
trans = transform(n_components=2)
deep_node_embeddings_transformed = trans.fit_transform(deep_node_embeddings)

In [None]:
# Scatter plot of the projected deep-model embeddings, coloured by node label.
# The size of the figure is set here, by plt.subplots.
fig, ax = plt.subplots(1, 1, figsize=(9,9))
plot_embeddings(deep_node_embeddings_transformed,
                ax,
                cora_graph.ndata['label'], 
                title='Visualization of Deep GCN embeddings for cora dataset', 
                x_label="$X_1$", 
                y_label="$X_2$", 
                alpha=0.7, 
                figsize=(8,8))
plt.show()

## Graph Attention Network (GAT)

We demonstrate the use of a GAT for node attribute inference on the CORA paper citation dataset.

**References**

[Graph Attention Networks](https://www.thejournal.club/c/paper/134548/), P. Velickovic, G. Cucurull, A. Casanova, A. Romero, P. Lio, Y. Bengio, ICLR 2018




In [None]:
# A graph attention convolutional layer
from dgl.nn import GATConv

In [None]:
class GAT(nn.Module):
    """Stack of graph attention layers for node classification.

    The network consists of ``len(h_feats)`` hidden ``GATConv`` layers followed
    by one ``GATConv`` output layer with ``num_classes`` features per head.
    """

    def __init__(self, in_feats: int, h_feats: list[int], attention_heads: list[int], num_classes: int, 
                 feat_dropout :float=0, attention_dropout: float=0, concat_heads: bool=False):
        """

        :param in_feats: <int> Dimensionality of node input features.
        :param h_feats: <list> Dimensionality of hidden layers.
        :param attention_heads: <list> Number of attention heads for each layer,
            i.e. one entry per hidden layer plus one for the output layer.
        :param num_classes: <int> Number of output classes.
        :param feat_dropout: <float> The amount of dropout for layer input.
        :param attention_dropout: <float> The amount of dropout for attention coefficients.
        :param concat_heads: <bool> If True, the attention head outputs of the hidden
            layers are concatenated; if False they are averaged. The heads of the
            output layer are always averaged.
        :raises ValueError: If ``attention_heads`` does not have exactly
            ``len(h_feats) + 1`` entries.
        """
        super().__init__()

        if len(attention_heads) != len(h_feats) + 1:
            raise ValueError(
                f"attention_heads must have one entry per layer ({len(h_feats) + 1}) "
                f"but given {attention_heads}"
            )

        self.concat_heads = concat_heads
        self.conv_layers = nn.ModuleList()

        layer_in = in_feats
        for layer_out, heads in zip([*h_feats, num_classes], attention_heads):
            self.conv_layers.append(GATConv(layer_in,
                                            layer_out,
                                            num_heads=heads,
                                            feat_drop=feat_dropout,
                                            attn_drop=attention_dropout))
            # When concatenating, the next layer receives the outputs of all
            # heads of THIS layer side by side.
            layer_in = layer_out * heads if concat_heads else layer_out

    def forward(self, g, in_feat):
        """Compute the per-node outputs of the last layer.

        :param g: The graph, passed unchanged to every attention layer.
        :param in_feat: Node input features.
        :return: Output of the last layer with ``num_classes`` values per node.
        """
        h = in_feat
        last = len(self.conv_layers) - 1

        for i, layer in enumerate(self.conv_layers):
            h = layer(g, h)  # output tensor is N, H, D_out where H is number of heads
            if self.concat_heads and i < last:
                h = h.reshape(h.shape[0], -1)
            else:
                # Average the node embeddings across attention heads. This is
                # always done for the output layer so that the result keeps
                # num_classes columns regardless of the number of heads.
                h = h.mean(dim=-2)

            if i < last:
                h = F.elu(h)

        return h

In [None]:
# With concat_heads=True the 8 heads x 8 features of the hidden layer are
# concatenated into a 64-dimensional input for the output layer.
gat_model = GAT(in_feats=cora_graph.ndata['feat'].shape[1], 
                h_feats=[8], # Dimensionality of node embeddings for each attention head
                attention_heads=[8,1], # Number of attention heads for each graph convolutional layer
                num_classes=cora_dataset.num_classes,
                feat_dropout=0.6,
                attention_dropout=0.6,
                concat_heads=True)

In [None]:
# Display the layers of the GAT model.
gat_model

In [None]:
# Train the GAT with the same train() helper used for the GCN, with a
# learning rate of 0.005.
gat_losses, gat_accs, gat_best_acc = train(cora_graph, gat_model, epochs=500, lr=0.005, weight_decay=0.0005)

In [None]:
# Overlay the GAT's per-epoch loss curves for the three data splits.
for split in ("train", "val", "test"):
    plt.plot(gat_losses[split], label=split)
plt.title("GAT Loss")
plt.legend();

In [None]:
# Overlay the GAT's per-epoch accuracy curves for the three data splits.
for split in ("train", "val", "test"):
    plt.plot(gat_accs[split], label=split)
plt.title("GAT Accuracy")
plt.legend();

## Exercises

\[1\] How does GAT performance change as a function of the GNN's depth? Does a GNN with GAT convolutional layers suffer from the same oversmoothing problem as a deep GNN with GCN layers?

### GCNII

In [None]:
from dgl.nn import GCN2Conv
from torch.nn import Linear
import math

In [None]:
class GCNII(nn.Module):
    """GCNII network for node classification.

    A linear layer first maps the input features to the hidden width. The
    result is refined by ``len(h_feats)`` ``GCN2Conv`` layers, each of which
    also receives that initial representation. A final linear layer maps the
    hidden representation to ``num_classes`` logits per node.
    """

    def __init__(self, in_feats: int, h_feats: list[int], num_classes: int, dropout: float=0):
        """

        :param in_feats: <int> Dimensionality of node input features
        :param h_feats: <list> Dimensionality of hidden layers; all hidden layers must have
            the same number of hidden units.
        :param num_classes: <int> Number of output classes
        :param dropout: <float> The amount of dropout applied to the input of
            every layer. It should be a value in [0.0, 1.0]
        :raises ValueError: If the entries of ``h_feats`` are not all equal
            (this includes an empty list).
        """
        super().__init__()

        if len(set(h_feats)) != 1:
            raise ValueError(f"All hidden layers must have the same number of hidden units but given {h_feats}")

        hidden = h_feats[0]
        self.dropout = dropout

        self.linear = Linear(in_feats, hidden)

        # The second argument of GCN2Conv is the 1-based index of the layer in
        # the stack, not an output width: GCN2Conv keeps the feature width
        # unchanged.
        self.conv_layers = nn.ModuleList(
            [GCN2Conv(hidden, layer=depth) for depth in range(1, len(h_feats) + 1)]
        )

        # Because the convolutions cannot change the width, the projection to
        # class logits needs its own layer.
        self.output = Linear(hidden, num_classes)

    def reset_parameters(self):
        """Re-initialise the linear layers and the convolution weights."""
        self.linear.reset_parameters()
        self.output.reset_parameters()
        for layer in self.conv_layers:
            std = 1.0 / math.sqrt(layer._in_feats)
            torch.nn.init.uniform_(layer.weight1, -std, std)

    def forward(self, g, in_feat):
        """Compute per-node class logits.

        :param g: The graph, passed unchanged to every convolutional layer.
        :param in_feat: Node input features.
        :return: Output of the final linear layer, ``num_classes`` values per node.
        """
        # F.dropout defaults to training=True; forwarding the module's mode
        # makes dropout a no-op after model.eval().
        h_0 = self.linear(F.dropout(in_feat, p=self.dropout, training=self.training))
        h = h_0

        for layer in self.conv_layers:
            h = layer(g, F.dropout(h, p=self.dropout, training=self.training), h_0)
            h = F.relu(h)

        return self.output(F.dropout(h, p=self.dropout, training=self.training))

In [None]:
# A shallow GCNII model with a dropout rate of 0.2.
gcn2_model = GCNII(in_feats=cora_graph.ndata['feat'].shape[1], 
                   h_feats=[16]*2,    # Two hidden layers of 16 units each
                   dropout=0.2,
                   num_classes=cora_dataset.num_classes)

In [None]:
# Re-initialise the weights, then train without weight decay (weight_decay=0).
gcn2_model.reset_parameters()
gcn2_losses, gcn2_accs, best_gcn2_test_acc = train(cora_graph, gcn2_model, epochs=500, lr=0.01, weight_decay=0.000)

In [None]:
# A deep GCNII model; this rebinds gcn2_model, replacing the shallow model.
gcn2_model = GCNII(in_feats=cora_graph.ndata['feat'].shape[1], 
                   h_feats=[16]*32,    # 32 hidden layers of 16 units each
                   dropout=0.2,
                   num_classes=cora_dataset.num_classes)

In [None]:
# Re-initialise the weights, then train the deep model with the same settings
# as the shallow one; the result variables are overwritten.
gcn2_model.reset_parameters()
gcn2_losses, gcn2_accs, best_gcn2_test_acc = train(cora_graph, gcn2_model, epochs=500, lr=0.01, weight_decay=0.000)