This work is highly inspired in the following repositories: 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/ogbn_products_sage.py

https://github.com/PacktPublishing/Hands-On-Graph-Neural-Networks-Using-Python/blob/main/Chapter08/chapter8.ipynb 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py


# Imports

In [1]:
import torch
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from torch_geometric.loader import NeighborLoader
import time
from sklearn.metrics import f1_score 

In [2]:
if torch.backends.mps.is_available() and False:
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    device = torch.device('cpu')

# Data: CORA - small version 

In [3]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]
data = data.to(device)

## Visualize Information from graph

In [None]:
# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Training nodes: {sum(data.train_mask).item()}')
print(f'Evaluation nodes: {sum(data.val_mask).item()}')
print(f'Test nodes: {sum(data.test_mask).item()}')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')

# Hyperparameters

In [4]:
learning_rate = 0.01 # variable to change/play around with for experiments --> 0.0001
epochs = 10
aggregator = 'mean' # variable to change/play around with for experiments
dropout_rate = 0.4
normalization = True
activation_function = True
bias = True
batch =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128

# NeighborLoader

In [5]:
train_loader = NeighborLoader(data, 
                            input_nodes=data.train_mask, # ensure that the sampling only happens in the training set 
                            batch_size = batch,
                            num_neighbors=[neighborhood_1, neighborhood_2], 
                            shuffle=True)

## Visualize Information from sampling/subgraphs

In [None]:
# Print each subgraph
for i, subgraph in enumerate(train_loader):
    print(f'Subgraph {i}: {subgraph}')

# Make model for Link prediction

Initialization:
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.SAGEConv.html

https://medium.com/@juyi.lin/neighborloader-introduction-ccb870cc7294

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

https://github.com/PacktPublishing/Hands-On-Graph-Neural-Networks-Using-Python/blob/main/Chapter08/chapter8.ipynb

In [6]:
def accuracy(pred_y, y): 
    return ((pred_y == y).sum()/len(y)).item()

In [13]:
class GraphSAGE_local(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout, aggr='mean', normalization = True, activation_function = True, bias = True):

        super().__init__()
        # as K = 2, we have 2 layers
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, out_channels = hidden_channels, project = activation_function, bias = bias)
        self.conv2 = SAGEConv(hidden_channels, out_channels = out_channels, project = activation_function, bias = bias, normalization = normalization)
    

    def forward(self, matrix_nodes_features, edge_index):
      # matrix_nodes_features is a matrix from the data where row = nodes, columns = feature
      # edge_index: This is a tensor that describes the connectivity of the graph. Each column in this matrix represents an edge. The first row contains the indices of the source nodes, and the second row contains the indices of the target nodes.
    
        h = self.conv1(matrix_nodes_features, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=self.dropout, training = self.training)

        h = self.conv2(h, edge_index)
        h = F.relu(h) # TODO: maybe remove this 
        h = F.dropout(h,  p=self.dropout, training = self.training) # TODO: maybe remove this
        h = F.log_softmax(h, dim = 1)
        return h
    
    def fit(self, loader, epochs):
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        criterion = torch.nn.CrossEntropyLoss()
        self.train()
        times = []

        for epoch in range(epochs+1):
            start = time.time()
            train_loss, train_acc, val_loss, val_acc = 0, 0, 0, 0 
            
            for batch in loader:
                batch = batch.to(device)
                optimizer.zero_grad()
                out = self(batch.x, batch.edge_index) # obtain the probability of belonging to each class or label for each node 
                
                loss = criterion(out[batch.train_mask],  batch.y[batch.train_mask]) 
                
                # Train data
                train_loss += loss.item()
                train_acc += accuracy(out[batch.train_mask].argmax(dim = 1), batch.y[batch.train_mask])

                loss.backward()
                optimizer.step()

                # Validation data
                val_loss += criterion(out[batch.val_mask], batch.y[batch.val_mask])
                val_acc += accuracy((out[batch.val_mask]).argmax(dim = 1), batch.y[batch.val_mask]) 

            # All following values are average per batch 
            print(f'Epoch {epoch:>3} | Train Loss: {loss/len(loader):.3f} | Train Acc: {train_acc/len(loader)*100:>6.2f}% | Val Loss: {val_loss/len(train_loader):.2f} | Val Acc: {val_acc/len(train_loader)*100:.2f}%')
          
            times.append(time.time() - start)
        print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")
        
    @torch.no_grad()
    def test(self, data):
        self.eval()
        out = self(data.x, data.edge_index)
        y = data.y[data.test_mask]
        y_prediction = out.argmax(dim = 1)[data.test_mask]

        acc = accuracy(y_prediction, y)
        f1_macro = f1_score(y, y_prediction, average = 'macro')
        f1_micro =  f1_score(y, y_prediction, average = 'micro')
        return acc, f1_macro, f1_micro

# Test method
The test function evaluates the model's performance on unseen data. 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

# Running code

# 1. Create model for node classification


In [14]:
labels_all_nodes = data.y
number_classes  = labels_all_nodes.unique().size(0)

model = GraphSAGE_local(in_channels = data.num_node_features,
                  hidden_channels= embedding_dimension,
                  out_channels = number_classes,
                  dropout= dropout_rate,
                  aggr = aggregator,
                  normalization = normalization,
                  activation_function = activation_function,
                  bias = bias)
model.to(device)
print(model)

GraphSAGE_local(
  (conv1): SAGEConv(1433, 128, aggr=mean)
  (conv2): SAGEConv(128, 7, aggr=mean)
)


# 2. Training mdel for certain number of epochs and testing it 

In [15]:
model.fit(train_loader, epochs)


Epoch   0 | Train Loss: 1.948 | Train Acc:  12.14% | Val Loss: 1.95 | Val Acc: 16.02%
Epoch   1 | Train Loss: 1.920 | Train Acc:  16.43% | Val Loss: 1.94 | Val Acc: 23.48%
Epoch   2 | Train Loss: 1.828 | Train Acc:  25.00% | Val Loss: 1.90 | Val Acc: 20.43%
Epoch   3 | Train Loss: 1.753 | Train Acc:  26.43% | Val Loss: 1.86 | Val Acc: 25.54%
Epoch   4 | Train Loss: 1.730 | Train Acc:  36.43% | Val Loss: 1.87 | Val Acc: 28.70%
Epoch   5 | Train Loss: 1.406 | Train Acc:  46.43% | Val Loss: 1.82 | Val Acc: 33.33%
Epoch   6 | Train Loss: 1.400 | Train Acc:  40.71% | Val Loss: 1.80 | Val Acc: 32.89%
Epoch   7 | Train Loss: 1.207 | Train Acc:  48.57% | Val Loss: 1.73 | Val Acc: 39.22%
Epoch   8 | Train Loss: 1.210 | Train Acc:  49.29% | Val Loss: 1.68 | Val Acc: 41.49%
Epoch   9 | Train Loss: 1.275 | Train Acc:  41.43% | Val Loss: 1.69 | Val Acc: 37.87%
Epoch  10 | Train Loss: 1.009 | Train Acc:  54.29% | Val Loss: 1.61 | Val Acc: 41.70%
Median time per epoch: 0.0339s


## 3. Calculate accuracy on test data


In [17]:
acc, f1_macro, f1_micro = model.test(data)
print(f'Model accuarcy: {acc*100:.2f}%' )
print(f'Model f1 Macro: {f1_macro:.2f}%' )
print(f'Model f1 Micro: {f1_micro:.2f}%' )

Model accuarcy: 59.50%
Model f1 Macro: 0.47%
Model f1 Micro: 0.59%
