### Cora dataset loader

I use the well-known PyTorch Geometric library to load and pre-process the Cora dataset.

The dataset consists of 2708 Machine Learning publications classified into seven classes (i.e., ML topics).

The node representation is a bag-of-words feature vector of dimension 1433. 1433 is the size of a pre-built dictionary of unique words (collected from all papers in the dataset). Each dimension of the (normalized) feature vector indicates the absence/presence of the corresponding word from the dictionary. 

Further, two documents are connected if there exists a citation link between them.

In [56]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the Cora citation dataset (stored under data/Planetoid) with the
# NormalizeFeatures transform attached.
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

# Bind the dataset's first graph object to `data`. The node/edge prints below
# and the later training and evaluation cells all read `data`, which was
# otherwise never assigned in this notebook.
data = dataset[0]

print('==== Dataset info ====')
print(f'Dataset: {dataset}:')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
print(f'Number of nodes = {data.num_nodes}')
print(f'Number of edges = {data.num_edges}')

==== Dataset info ====
Dataset: Cora():
Number of features: 1433
Number of classes: 7
Number of nodes = 2708
Number of edges = 10556


### Training a GNN on the node classification task
The following GCN model is based on the GCNConv layer defined in [this paper](https://arxiv.org/abs/1609.02907).

In [57]:
import torch
from torch_geometric.nn import GCNConv
from torch.nn import Linear
import torch.nn.functional as F


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Layer sizes come from the module-level ``dataset``: the first ``GCNConv``
    maps ``dataset.num_features`` to ``hidden_channels`` and the second maps
    ``hidden_channels`` to ``dataset.num_classes``.
    """

    def __init__(self, hidden_channels):
        """Create the two convolution layers.

        Args:
            hidden_channels: Width of the hidden node representation.
        """
        super().__init__()
        # Seed torch's global RNG immediately before the layers are built,
        # presumably so that weight initialisation is repeatable across runs.
        torch.manual_seed(99)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        """Compute per-node class scores (no softmax is applied here).

        Args:
            x: Node feature input passed to the first convolution.
            edge_index: Graph connectivity passed to both convolutions.

        Returns:
            The output of the second convolution.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

# Build a GCN with a 64-unit hidden layer and print its layer summary.
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(1433, 64)
  (conv2): GCNConv(64, 7)
)


##### Training
Next, I train the model for 100 epochs using the Adam optimizer, a cross-entropy loss function, and a hidden layer of 64 units.

In [58]:
# Fresh GCN for the training run. trainGCN() reads `model`, `optimizer` and
# `criterion` from module scope, so they must be bound before it is called.
model = GCN(hidden_channels=64)
# Adam with learning rate 0.01 and weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def trainGCN():
    """Run one optimisation step on the whole graph and return its loss.

    Uses whichever ``model``, ``optimizer``, ``criterion`` and ``data`` are
    bound at module scope when the function is called.

    Returns:
        The loss value computed on the nodes selected by ``data.train_mask``.
    """
    model.train()
    optimizer.zero_grad()  # discard gradients left over from the previous step
    train_nodes = data.train_mask
    logits = model(data.x, data.edge_index)  # forward pass over every node
    step_loss = criterion(logits[train_nodes], data.y[train_nodes])
    step_loss.backward()
    optimizer.step()
    return step_loss

# Train for 100 epochs (numbered 1..100), printing the loss after every step.
for epoch in range(1, 101):
    loss = trainGCN()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 1.9459
Epoch: 002, Loss: 1.9349
Epoch: 003, Loss: 1.9220
Epoch: 004, Loss: 1.9034
Epoch: 005, Loss: 1.8840
Epoch: 006, Loss: 1.8627
Epoch: 007, Loss: 1.8372
Epoch: 008, Loss: 1.8158
Epoch: 009, Loss: 1.7804
Epoch: 010, Loss: 1.7525
Epoch: 011, Loss: 1.7291
Epoch: 012, Loss: 1.6872
Epoch: 013, Loss: 1.6664
Epoch: 014, Loss: 1.6238
Epoch: 015, Loss: 1.5680
Epoch: 016, Loss: 1.5415
Epoch: 017, Loss: 1.4953
Epoch: 018, Loss: 1.4602
Epoch: 019, Loss: 1.4221
Epoch: 020, Loss: 1.3935
Epoch: 021, Loss: 1.3299
Epoch: 022, Loss: 1.2709
Epoch: 023, Loss: 1.2322
Epoch: 024, Loss: 1.2015
Epoch: 025, Loss: 1.1567
Epoch: 026, Loss: 1.0978
Epoch: 027, Loss: 1.0719
Epoch: 028, Loss: 1.0344
Epoch: 029, Loss: 0.9860
Epoch: 030, Loss: 0.9684
Epoch: 031, Loss: 0.9192
Epoch: 032, Loss: 0.8783
Epoch: 033, Loss: 0.8499
Epoch: 034, Loss: 0.8134
Epoch: 035, Loss: 0.8067
Epoch: 036, Loss: 0.7590
Epoch: 037, Loss: 0.6944
Epoch: 038, Loss: 0.6844
Epoch: 039, Loss: 0.6591
Epoch: 040, Loss: 0.6249


### Check test accuracy on unseen data

In [59]:
def testGCN():
    """Return the classification accuracy of ``model`` on the test nodes.

    Uses whichever ``model`` and ``data`` are bound at module scope when the
    function is called.

    Returns:
        Fraction of nodes selected by ``data.test_mask`` whose predicted
        class equals the label in ``data.y``.
    """
    model.eval()
    # Inference only: disable autograd so no computation graph is recorded.
    with torch.no_grad():
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # highest-scoring class for each node
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return test_acc

# Evaluate the trained GCN on the test nodes and report accuracy to 3 decimals.
test_acc = testGCN()
print(f'Test Accuracy: {test_acc:.3f}')

Test Accuracy: 0.816


### Training an MLP
Here is a basic MLP baseline model to which we can compare the performance of GCN. Note that it only operates on input node features. 

GCNConv layers are replaced by Linear layers.

In [60]:
class MLP(torch.nn.Module):
    """Two-layer perceptron baseline that sees node features only.

    Layer sizes come from the module-level ``dataset``: the first ``Linear``
    maps ``dataset.num_features`` to ``hidden_channels`` and the second maps
    ``hidden_channels`` to ``dataset.num_classes``.
    """

    def __init__(self, hidden_channels):
        """Create the two linear layers.

        Args:
            hidden_channels: Width of the hidden representation.
        """
        super().__init__()
        # Seed torch's global RNG immediately before the layers are built,
        # presumably so that weight initialisation is repeatable across runs.
        torch.manual_seed(99)
        self.lin1 = Linear(dataset.num_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x):
        """Compute per-node class scores (no softmax is applied here).

        Args:
            x: Node feature input passed to the first linear layer.

        Returns:
            The output of the second linear layer.
        """
        hidden = F.relu(self.lin1(x))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.lin2(hidden)

# Build an MLP with a 64-unit hidden layer and print its layer summary.
# This rebinds the module-level `model` name.
model = MLP(hidden_channels=64)
print(model)

MLP(
  (lin1): Linear(in_features=1433, out_features=64, bias=True)
  (lin2): Linear(in_features=64, out_features=7, bias=True)
)


#### Training

For fair comparison with the GCN model, I used the same optimizer and loss function as well as similar hyperparameters. 

In [61]:
# Fresh MLP for the training run. trainMLP() reads `model`, `criterion` and
# `optimizer` from module scope, so they must be bound before it is called.
model = MLP(hidden_channels=64)
criterion = torch.nn.CrossEntropyLoss()
# Adam with learning rate 0.01 and weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def trainMLP():
    """Run one optimisation step on all node features and return its loss.

    Uses whichever ``model``, ``optimizer``, ``criterion`` and ``data`` are
    bound at module scope when the function is called.

    Returns:
        The loss value computed on the nodes selected by ``data.train_mask``.
    """
    model.train()
    optimizer.zero_grad()  # discard gradients left over from the previous step
    train_nodes = data.train_mask
    logits = model(data.x)  # forward pass over every node's features
    step_loss = criterion(logits[train_nodes], data.y[train_nodes])
    step_loss.backward()
    optimizer.step()
    return step_loss

# Train for 100 epochs (numbered 1..100), printing the loss after every step.
for epoch in range(1, 101):
    loss = trainMLP()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 1.9487
Epoch: 002, Loss: 1.9428
Epoch: 003, Loss: 1.9370
Epoch: 004, Loss: 1.9252
Epoch: 005, Loss: 1.9165
Epoch: 006, Loss: 1.9018
Epoch: 007, Loss: 1.8876
Epoch: 008, Loss: 1.8699
Epoch: 009, Loss: 1.8528
Epoch: 010, Loss: 1.8347
Epoch: 011, Loss: 1.7980
Epoch: 012, Loss: 1.7851
Epoch: 013, Loss: 1.7552
Epoch: 014, Loss: 1.7110
Epoch: 015, Loss: 1.6771
Epoch: 016, Loss: 1.6514
Epoch: 017, Loss: 1.6263
Epoch: 018, Loss: 1.5816
Epoch: 019, Loss: 1.5288
Epoch: 020, Loss: 1.4921
Epoch: 021, Loss: 1.4151
Epoch: 022, Loss: 1.4242
Epoch: 023, Loss: 1.3401
Epoch: 024, Loss: 1.2995
Epoch: 025, Loss: 1.2481
Epoch: 026, Loss: 1.1687
Epoch: 027, Loss: 1.1455
Epoch: 028, Loss: 1.0847
Epoch: 029, Loss: 1.0007
Epoch: 030, Loss: 0.9831
Epoch: 031, Loss: 0.9263
Epoch: 032, Loss: 0.8724
Epoch: 033, Loss: 0.8298
Epoch: 034, Loss: 0.7901
Epoch: 035, Loss: 0.7319
Epoch: 036, Loss: 0.7021
Epoch: 037, Loss: 0.6447
Epoch: 038, Loss: 0.6146
Epoch: 039, Loss: 0.5788
Epoch: 040, Loss: 0.5327


##### MLP test accuracy 

In [62]:
def testMLP():
    """Return the classification accuracy of ``model`` on the test nodes.

    Uses whichever ``model`` and ``data`` are bound at module scope when the
    function is called.

    Returns:
        Fraction of nodes selected by ``data.test_mask`` whose predicted
        class equals the label in ``data.y``.
    """
    model.eval()
    # Inference only: disable autograd so no computation graph is recorded.
    with torch.no_grad():
        out = model(data.x)
    pred = out.argmax(dim=1)  # highest-scoring class for each node
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return test_acc

# Evaluate the trained MLP on the test nodes and report accuracy to 4 decimals.
test_acc = testMLP()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.5900


# Answering question #4

#### MLP vs. GCN model comparison 

The GCN significantly outperformed the MLP, by more than 20 percentage points of classification accuracy (81.6% vs. 59.0% on the test set). As expected, GCNs are better suited to handling graph data.

#### Training GCN with more dimensions/layers

In the following, I define GCN_plus which has an additional message passing layer compared to the previous GCN. I increased the dimension of the first hidden layer to 256 and set the second to 64. 

##### Explanations:
After running the new GCN_plus, we can observe that it learns faster than the previous GCN. GCN_plus needed only about 20 epochs to train (21 in the run below), while GCN needed 100 (the training objective was around 0.2 and the test accuracy was similar in both cases, i.e., ~81.5%). However, training GCN_plus for 100 epochs led to overfitting, and the test accuracy dropped to ~79%.

The Cora dataset is relatively small (~2700 samples) and only about 5% of it is used for training (140 samples). Hence, there is no need for a deeper network to improve the classification performance.


In [72]:
class GCN_plus(torch.nn.Module):
    """Three-layer graph convolutional network for node classification.

    Input and output sizes come from the module-level ``dataset``; the two
    hidden widths are supplied by the caller.
    """

    def __init__(self, hidden_channels_1, hidden_channels_2):
        """Create the three convolution layers.

        Args:
            hidden_channels_1: Output width of the first convolution.
            hidden_channels_2: Output width of the second convolution.
        """
        super().__init__()
        # Seed torch's global RNG immediately before the layers are built,
        # presumably so that weight initialisation is repeatable across runs.
        torch.manual_seed(99)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels_1)
        self.conv2 = GCNConv(hidden_channels_1, hidden_channels_2)
        self.conv3 = GCNConv(hidden_channels_2, dataset.num_classes)

    def forward(self, x, edge_index):
        """Compute per-node class scores (no softmax is applied here).

        Args:
            x: Node feature input passed to the first convolution.
            edge_index: Graph connectivity passed to every convolution.

        Returns:
            The output of the third convolution.
        """
        hidden = x
        # Both hidden layers share the same conv -> ReLU -> dropout pattern.
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))
            hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv3(hidden, edge_index)

# Build the deeper model with hidden widths 256 and 64 and print its summary.
model = GCN_plus(hidden_channels_1=256, hidden_channels_2=64)
print(model)

# trainGCN() and testGCN() read `model`, `optimizer` and `criterion` from
# module scope, so rebinding them here makes those helpers operate on GCN_plus.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# NOTE: range(1, 22) runs 21 epochs (numbered 1..21).
for epoch in range(1, 22):
    loss = trainGCN()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    
# Evaluate the trained GCN_plus on the test nodes.
test_acc = testGCN()
print(f'Test Accuracy: {test_acc:.3f}')

GCN_plus(
  (conv1): GCNConv(1433, 256)
  (conv2): GCNConv(256, 64)
  (conv3): GCNConv(64, 7)
)
Epoch: 001, Loss: 1.9461
Epoch: 002, Loss: 1.9366
Epoch: 003, Loss: 1.9192
Epoch: 004, Loss: 1.8829
Epoch: 005, Loss: 1.8501
Epoch: 006, Loss: 1.7981
Epoch: 007, Loss: 1.7434
Epoch: 008, Loss: 1.6468
Epoch: 009, Loss: 1.5595
Epoch: 010, Loss: 1.4614
Epoch: 011, Loss: 1.3250
Epoch: 012, Loss: 1.1640
Epoch: 013, Loss: 1.0587
Epoch: 014, Loss: 0.8973
Epoch: 015, Loss: 0.7660
Epoch: 016, Loss: 0.6445
Epoch: 017, Loss: 0.5792
Epoch: 018, Loss: 0.4742
Epoch: 019, Loss: 0.3859
Epoch: 020, Loss: 0.3351
Epoch: 021, Loss: 0.2629
Test Accuracy: 0.818
