# Semi-Supervised Classification with Graph Convolutional Networks: Implementation

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader



In [3]:
# Download the Cora archive, unpack it (creates the cora/ directory with
# README, cora.cites and cora.content), then delete the archive.
!wget https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
!tar -xvzf cora.tgz
!rm -r cora.tgz


--2024-07-22 09:27:35--  https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
Resolving linqs-data.soe.ucsc.edu (linqs-data.soe.ucsc.edu)... 128.114.47.74
Connecting to linqs-data.soe.ucsc.edu (linqs-data.soe.ucsc.edu)|128.114.47.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 168052 (164K) [application/x-gzip]
Saving to: ‘cora.tgz’


2024-07-22 09:27:41 (154 KB/s) - ‘cora.tgz’ saved [168052/168052]

cora/
cora/README
cora/cora.cites
cora/cora.content


In [4]:
# Read the two Cora tables. Both files are tab-separated and have no header
# row, so the reader options are shared between the two calls.
_tsv_options = dict(sep='\t', header=None)
content = pd.read_csv('cora/cora.content', **_tsv_options)
cites = pd.read_csv('cora/cora.cites', **_tsv_options)

In [5]:
# Preview the first 20 rows: column 0 holds the paper id and the last column
# holds the class label.
content.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
5,1126012,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Probabilistic_Methods
6,1107140,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Theory
7,1102850,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks
8,31349,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks
9,1106418,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Theory


In [6]:
# Preview the citation table: each row is a pair of paper ids.
cites.head()

Unnamed: 0,0,1
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960


In [7]:
# Separate the table into model inputs and targets: every column between the
# paper id (first) and the class name (last) is a feature column.
feature_columns = content.iloc[:, 1:-1]
label_column = content.iloc[:, -1]

features = torch.tensor(feature_columns.values, dtype=torch.float32)
labels = label_column.values

In [8]:
# Show the raw (string) class labels.
print(labels)

['Neural_Networks' 'Rule_Learning' 'Reinforcement_Learning' ...
 'Genetic_Algorithms' 'Case_Based' 'Neural_Networks']


In [9]:
# List the distinct class names present in the dataset.
print(pd.unique(labels))

['Neural_Networks' 'Rule_Learning' 'Reinforcement_Learning'
 'Probabilistic_Methods' 'Theory' 'Genetic_Algorithms' 'Case_Based']


In [10]:
# (number of papers, number of feature columns)
print(features.shape)

torch.Size([2708, 1433])


In [11]:
# Encode the string class names as integer ids so they can be used as
# classification targets. LabelEncoder is imported with the other
# dependencies in the import cell at the top of the notebook.
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)
# nll_loss expects integer class targets, hence dtype=torch.long.
tensor_encoded_labels = torch.tensor(encoded_labels, dtype=torch.long)

In [12]:
# Show the encoder output, first as the NumPy array and then as a tensor.
print(type(encoded_labels))
print(encoded_labels)
print(tensor_encoded_labels)
# Class names in encoder order: the position of a name in `classes_` is the
# integer that was assigned to it.
print(le.classes_)
mapped_levels = {class_name: code for code, class_name in enumerate(le.classes_)}
print(mapped_levels)

<class 'numpy.ndarray'>
[2 5 4 ... 1 0 2]
tensor([2, 5, 4,  ..., 1, 0, 2])
['Case_Based' 'Genetic_Algorithms' 'Neural_Networks'
 'Probabilistic_Methods' 'Reinforcement_Learning' 'Rule_Learning' 'Theory']
{'Case_Based': 0, 'Genetic_Algorithms': 1, 'Neural_Networks': 2, 'Probabilistic_Methods': 3, 'Reinforcement_Learning': 4, 'Rule_Learning': 5, 'Theory': 6}


In [13]:
# Exploratory look at the raw graph data. NOTE: `edge_index`, `num_nodes` and
# `adj_matrix` are all re-assigned by the next graph-building cell; here the
# edges are still expressed in raw paper ids, not row positions.
idx = torch.tensor(content[0].values, dtype=torch.long)
print(idx)

raw_pairs = torch.tensor(cites.values, dtype=torch.long)
edge_index = raw_pairs.t()  # shape (2, number_of_citations)
print(edge_index)

num_nodes = features.size(0)
print(num_nodes)

# Empty (all-zero) square matrix with one row and one column per paper.
adj_matrix = torch.zeros((num_nodes, num_nodes))
print(adj_matrix.shape)

tensor([  31336, 1061127, 1106406,  ..., 1128978,  117328,   24043])
tensor([[     35,      35,      35,  ...,  853118,  853155,  954315],
        [   1033,  103482,  103515,  ..., 1140289,  853118, 1155073]])
2708
torch.Size([2708, 2708])


In [14]:
# Raw paper ids in the same order as the rows of `content`.
print(content[0].values)

[  31336 1061127 1106406 ... 1128978  117328   24043]


In [15]:
# Map every raw paper id to its row position in `content`, so that edges can
# be expressed as indices into the feature matrix.
node_ids = content[0].values
node_id_map = {node_id: i for i, node_id in enumerate(node_ids)}

# An id that is absent from the map would be translated to None/NaN and
# corrupt (or crash) the tensor conversion below, so fail early and clearly.
unknown_ids = set(cites.values.ravel().tolist()).difference(node_id_map)
if unknown_ids:
    raise ValueError(
        f'cora.cites references {len(unknown_ids)} paper id(s) missing from cora.content, '
        f'e.g. {sorted(unknown_ids)[:5]}'
    )

# Translate both columns of the citation table; the result has shape (2, number_of_citations).
edge_index = cites.map(node_id_map.get).values.T
edge_index = torch.tensor(edge_index, dtype=torch.long)

# Create adjacency matrix
num_nodes = features.shape[0]
adj_matrix = torch.zeros((num_nodes, num_nodes), dtype=torch.float32)

# Undirected alternative (kept for reference):
# adj_matrix[edge_index[0], edge_index[1]] = 1
# adj_matrix[edge_index[1], edge_index[0]] = 1

# Directed graph. According to cora/README, a line "paper1 paper2" in
# cora.cites means the link is "paper2->paper1". The row index is therefore
# taken from the second column and the column index from the first.
adj_matrix[edge_index[1], edge_index[0]] = 1

In [16]:
# Edges are now expressed as row positions instead of raw paper ids.
print(edge_index)
# Dense adjacency matrix (PyTorch abbreviates the printout).
print(adj_matrix)

tensor([[ 163,  163,  163,  ..., 1887, 1902,  837],
        [ 402,  659, 1696,  ..., 2258, 1887, 1686]])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [17]:
# Add self-loops so that every node also aggregates its own features.
# NOTE: this cell rebinds `adj_matrix`; re-running it without first re-running
# the cell that builds the adjacency matrix would add the identity twice.
adj_matrix = adj_matrix + torch.eye(num_nodes)

# Row-normalise the adjacency matrix: D^-1 (A + I).
# Because D is diagonal, multiplying by its inverse is the same as dividing
# each row by its own sum, which avoids inverting a dense
# num_nodes x num_nodes matrix. The self-loops guarantee every row sum is at
# least 1, so the division is always well defined.
row_sums = torch.sum(adj_matrix, dim=1)
deg = torch.diag(row_sums)  # degree matrix, kept available under its original name
adj_matrix = adj_matrix / row_sums[:, None]

In [18]:
# Split the data: 20% of the nodes are held out for testing, then 10% of the
# remaining nodes are held out for validation. train_test_split is imported
# with the other dependencies in the import cell at the top of the notebook.
train_indices, test_indices = train_test_split(range(len(labels)), test_size=0.2, random_state=42)
train_indices, val_indices = train_test_split(train_indices, test_size=0.1, random_state=42)


# One boolean mask per split; the model always sees the full graph and the
# masks only select which nodes contribute to the loss / accuracy.
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_indices] = True

# Sanity check: every node belongs to exactly one split.
assert torch.all(train_mask.int() + val_mask.int() + test_mask.int() == 1), \
    'train/val/test masks must partition the nodes'

print(f'Features shape: {features.shape}')
print(f'Labels shape: {labels.shape}')
print(f'Adjacency matrix shape: {adj_matrix.shape}')

Features shape: torch.Size([2708, 1433])
Labels shape: (2708,)
Adjacency matrix shape: torch.Size([2708, 2708])


In [19]:
class GCNLayer(nn.Module):
    """Single graph-convolution layer: a linear projection followed by
    neighbourhood aggregation through the adjacency matrix.

    Args:
        in_features: width of the incoming node features.
        out_features: width of the node features produced by the layer.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x, adj):
        """Return ``adj @ linear(x)``.

        Args:
            x: 2-D node-feature matrix with ``in_features`` columns.
            adj: 2-D matrix whose column count equals the row count of ``x``
                (``torch.mm`` requires 2-D operands).

        Returns:
            Tensor with one row per row of ``adj`` and ``out_features`` columns.
        """
        projected = self.linear(x)
        return adj.mm(projected)

class GCN(nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_features: width of the input node features.
        hidden_features: width of the hidden representation.
        out_features: number of output classes.
    """

    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.conv1 = GCNLayer(in_features, hidden_features)
        self.conv2 = GCNLayer(hidden_features, out_features)

    def forward(self, x, adj):
        """Return per-node log-probabilities over the output classes.

        Args:
            x: node-feature matrix passed to the first layer.
            adj: adjacency matrix passed unchanged to both layers.

        Returns:
            Tensor of log-softmax scores, normalised along dimension 1.
        """
        hidden = F.relu(self.conv1(x, adj))
        logits = self.conv2(hidden, adj)
        return F.log_softmax(logits, dim=1)

In [20]:
# Seed PyTorch's global RNG before the model is built so that the weight
# initialisation, and therefore the training run, is repeatable.
torch.manual_seed(42)

# Two-layer GCN: one input per feature column, 16 hidden units, one output
# per class known to the label encoder.
model = GCN(features.shape[1], 16, len(le.classes_))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    """Run one full-batch optimisation step and return the training loss.

    The forward pass covers every node in the graph, but only the nodes
    selected by ``train_mask`` contribute to the negative log-likelihood.

    Returns:
        float: the loss value computed before the parameter update.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(features, adj_matrix)
    train_loss = F.nll_loss(
        log_probs[train_mask],
        tensor_encoded_labels[train_mask],
    )

    train_loss.backward()
    optimizer.step()
    return train_loss.item()

def test():
    """Evaluate the current model on the train, validation and test masks.

    Returns:
        tuple: ``(train_acc, val_acc, test_acc)``, each the fraction of nodes
        in the corresponding mask whose predicted class equals its label.
    """
    model.eval()
    # Evaluation never back-propagates, so run the forward pass without
    # building an autograd graph (saves memory and time).
    with torch.no_grad():
        output = model(features, adj_matrix)
    preds = output.argmax(dim=1)
    correct = preds == tensor_encoded_labels

    def _accuracy(mask):
        # Share of correctly classified nodes among those selected by `mask`.
        return correct[mask].sum().item() / mask.sum().item()

    return _accuracy(train_mask), _accuracy(val_mask), _accuracy(test_mask)

In [21]:

# Train for 200 epochs. Each epoch performs one optimisation step (train())
# and then re-evaluates accuracy on all three splits (test()).
for epoch in range(200):
    loss = train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 000, Loss: 1.9253, Train Acc: 0.4161, Val Acc: 0.4240, Test Acc: 0.4373
Epoch: 001, Loss: 1.8598, Train Acc: 0.4520, Val Acc: 0.4562, Test Acc: 0.4539
Epoch: 002, Loss: 1.7820, Train Acc: 0.4813, Val Acc: 0.4839, Test Acc: 0.4779
Epoch: 003, Loss: 1.6954, Train Acc: 0.5003, Val Acc: 0.4977, Test Acc: 0.4889
Epoch: 004, Loss: 1.6063, Train Acc: 0.5146, Val Acc: 0.5069, Test Acc: 0.5074
Epoch: 005, Loss: 1.5167, Train Acc: 0.5434, Val Acc: 0.5207, Test Acc: 0.5351
Epoch: 006, Loss: 1.4265, Train Acc: 0.5798, Val Acc: 0.5438, Test Acc: 0.5590
Epoch: 007, Loss: 1.3367, Train Acc: 0.6198, Val Acc: 0.5668, Test Acc: 0.6107
Epoch: 008, Loss: 1.2482, Train Acc: 0.6752, Val Acc: 0.6498, Test Acc: 0.6605
Epoch: 009, Loss: 1.1608, Train Acc: 0.7245, Val Acc: 0.6912, Test Acc: 0.6937
Epoch: 010, Loss: 1.0744, Train Acc: 0.7737, Val Acc: 0.7189, Test Acc: 0.7306
Epoch: 011, Loss: 0.9899, Train Acc: 0.8061, Val Acc: 0.7558, Test Acc: 0.7620
Epoch: 012, Loss: 0.9086, Train Acc: 0.8338, Val Acc