<a href="https://colab.research.google.com/github/QingfangLiu/DS_learning/blob/main/GNN_learning_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the path of the Python interpreter backing this kernel.
# For local runs this is expected to live inside the 'pyg_env' virtual
# environment that is used for running the GNN code.
import sys
print(sys.executable)

/usr/local/bin/python3


In [2]:
# Print the installed NumPy version; this setup is expected to use NumPy 1.x.
# NOTE(review): the saved output of this cell shows 2.1.3 -- confirm whether
# the 1.x requirement still applies.
import numpy as np
print(np.__version__)

2.1.3


In [6]:
import torch
import torch_geometric
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid

In [50]:
# Load the Cora dataset through PyG's Planetoid loader, using /tmp/Cora as its root directory
dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [51]:
# Display the dataset object's repr
dataset

Cora()

In [78]:
# Number of graphs held by the dataset (the saved output is 1; that graph is dataset[0])
len(dataset)

1

In [54]:
# Number of target classes for the node-classification task
dataset.num_classes

7

In [77]:
# Check whether the graph is undirected
dataset[0].is_undirected()

True

In [65]:
dataset[0].keys() # names of the fields actually stored on the graph object (as opposed to derived properties)

['x', 'edge_index', 'y', 'test_mask', 'val_mask', 'train_mask']

In [69]:
# List every attribute and method name reachable on the graph object
# (dunder and private names included), not only the stored data fields
print(dir(dataset[0]))

['__abstractmethods__', '__annotations__', '__call__', '__cat_dim__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__inc__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_edge_attr_cls', '_edge_to_layout', '_edges_to_layout', '_get_edge_index', '_get_tensor', '_get_tensor_size', '_multi_get_tensor', '_put_edge_index', '_put_tensor', '_remove_edge_index', '_remove_tensor', '_store', '_tensor_attr_cls', '_to_type', 'apply', 'apply_', 'batch', 'clone', 'coalesce', 'concat', 'contains_isolated_nodes', 'contains_self_loops', 'contiguous', 'coo', 'cpu', 'csc'

In [67]:
# Report a few summary properties of the graph (computed on the fly by the
# graph object rather than stored as fields)
for label, attr in (
    ('# of node features:', 'num_node_features'),
    ('# of nodes:', 'num_nodes'),
    ('# of edges:', 'num_edges'),
):
    print(label, getattr(dataset[0], attr))

# of node features: 1433
# of nodes: 2708
# of edges: 10556


In [53]:
# Shape of the edge index tensor: 2 rows, one column per edge
dataset[0].edge_index.shape

torch.Size([2, 10556])

In [73]:
dataset[0].edge_index  # graph connectivity in COO format, shape [2, num_edges]: first row = source nodes, second row = target nodes

tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
        [   0,    0,    0,  ..., 2707, 2707, 2707]])

In [75]:
# Shape of the node feature matrix: one row per node, one column per feature
dataset[0].x.shape

torch.Size([2708, 1433])

In [61]:
# train_mask has one entry per node; the training loop uses it to select the training nodes
dataset[0].train_mask.shape

torch.Size([2708])

In [57]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The network applies GCNConv -> ReLU -> dropout -> GCNConv and returns
    per-node log-probabilities over the classes.

    Args:
        in_channels: Number of input features per node. When ``None`` it is
            read from the notebook-level ``dataset.num_node_features`` so the
            original ``GCN()`` call keeps working.
        hidden_channels: Width of the hidden layer (16 by default).
        out_channels: Number of output classes. When ``None`` it is read from
            the notebook-level ``dataset.num_classes``.
        dropout: Dropout probability applied after the first layer while the
            module is in training mode. 0.5 is the default of ``F.dropout``,
            which is what the model used before this was configurable.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None, dropout=0.5):
        super().__init__()
        # Only touch the global `dataset` when the caller did not give sizes,
        # so the model can also be built for other graphs.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)  # input features -> hidden
        self.conv2 = GCNConv(hidden_channels, out_channels)  # hidden -> class scores

    def forward(self, data):
        """Return log-probabilities for every node of ``data``.

        Args:
            data: Graph object exposing ``x`` (node features) and
                ``edge_index`` (graph connectivity).

        Returns:
            Tensor of log-softmax scores taken over dimension 1 (classes).
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode; model.eval() turns it off.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)  # map to class scores

        return F.log_softmax(x, dim=1)

In [59]:
# Train the GCN on the single Cora graph (full-batch: one forward/backward per epoch).

# Hyperparameters, named so they are easy to find and tune
SEED = 42             # fixed seed so weight init and dropout masks repeat across runs
NUM_EPOCHS = 200
LEARNING_RATE = 0.01
WEIGHT_DECAY = 5e-4   # weight decay passed to Adam

# Seed before GCN() is constructed so parameter initialization is covered too.
# NOTE(review): on CUDA some graph ops may still be nondeterministic -- confirm
# if bit-exact reproducibility is required there.
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device) # initialize model and move it to the selected device
data = dataset[0].to(device) # load the 1st graph from the dataset and move to the same device
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

model.train() # training mode: enables the dropout inside GCN.forward
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad() # clear previous gradients
    out = model(data) # forward pass over all nodes
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask]) # compute loss only on training nodes using train_mask
    loss.backward() # backpropagation
    optimizer.step() # update weights

In [81]:
# Evaluate classification accuracy on the held-out test nodes.
model.eval() # evaluation mode: dropout in GCN.forward becomes a no-op
with torch.no_grad(): # inference only -- no need to record the autograd graph
    pred = model(data).argmax(dim=1) # predicted class = highest log-probability per node
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum() # correct predictions among test nodes
acc = int(correct) / int(data.test_mask.sum()) # fraction of test nodes classified correctly
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8020
