This work is inspired/based in the following repositories: 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

https://github.com/PacktPublishing/Hands-On-Graph-Neural-Networks-Using-Python/blob/main/Chapter10/chapter10.ipynb

In [13]:
import torch
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from torch_geometric.loader import LinkNeighborLoader
import copy
from tqdm import tqdm 
from sklearn.linear_model import LogisticRegression
import time
from torch_geometric.transforms import RandomLinkSplit

# Local 
import graph_handler
import graph_sage_node_classification
from k_cross_validation import k_fold

In [14]:
if torch.backends.mps.is_available() and False:
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    device = torch.device('cpu')

# Read in Data: CORA - small version 

In [37]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]
data = data.to(device)

## Visualize Information from graph

In [16]:
graph_handler.visualize_information_graph(dataset)

Dataset: Cora()
-------------------
Number of graphs: 1
Number of nodes: 2708
Number of features: 1433
Number of classes: 7

Graph:
------
Training nodes: 140
Evaluation nodes: 500
Test nodes: 1000
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


In [39]:
data.num_nodes

2708

# Hyperparameters

In [17]:
# PARAMETERS TO CHANGE FOR EXPERIMENTS 
learning_rate = 0.0001
aggregator = 'mean'

# FIXED PARAMETERS
epochs = 10
dropout_rate = 0.4
normalization = True
activation_function = True
bias = True
batch =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128
k = 5 # k-cross validation

# Training model and obtaining results 

In [18]:
# Store the metrics for each fold
micro_f1_scores, macro_f1_scores, accuracy_scores = [], [], []

# Split data 
https://medium.com/stanford-cs224w/a-tour-of-pygs-data-loaders-9f2384e48f8f

https://zqfang.github.io/2021-08-12-graph-linkpredict/ --> trasductie disjoint mode 

In [19]:
transform = RandomLinkSplit(
    num_val=0.2,  # fraction of data held out for validation
    num_test=0.2, # fraction of data held out for test
    is_undirected= True, 
    add_negative_train_samples= True, 
    neg_sampling_ratio= 1.0, # Need to include negative sampling edges, the edges not existed in the original graph.
    key = 'edge_label', 
    disjoint_train_ratio = 0.9
)
train_data, val_data, test_data = transform(data)
train_data, val_data, test_data =  train_data.to(device), val_data.to(device), test_data.to(device)

# NeighborLoader

In [20]:
train_loader = LinkNeighborLoader(train_data, 
                            num_neighbors=[neighborhood_1, neighborhood_2],
                            shuffle=True,
                            neg_sampling_ratio= 1.0, 
                            batch_size = batch)

## Visualize Information from sampling/subgraphs

In [None]:
# Print each subgraph
for i, subgraph in enumerate(train_loader):
    print(f'Subgraph {i}: {subgraph}')

# Make model for Link prediction

Initialization:
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.SAGEConv.html

https://medium.com/@juyi.lin/neighborloader-introduction-ccb870cc7294

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [21]:
class GraphSAGE_local(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout, aggr='mean', normalization = True, activation_function = True, bias = True):

        super().__init__()
        # as K = 2, we have 2 layers
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, out_channels = hidden_channels, project = activation_function, bias = bias)
        self.conv2 = SAGEConv(hidden_channels, out_channels = out_channels, project = activation_function, bias = bias, normalize = normalization)
        #self.conv3 = SAGEConv(128, out_channels = out_channels, normalize = normalization, project = activation_function, bias = bias)

    def forward(self, matrix_nodes_features, edge_index):
      # matrix_nodes_features is a matrix from the data where row = nodes, columns = feature
      # edge_index: This is a tensor that describes the connectivity of the graph. Each column in this matrix represents an edge. The first row contains the indices of the source nodes, and the second row contains the indices of the target nodes.
    
        x = self.conv1(matrix_nodes_features, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training = self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training = self.training)

        #x = self.conv3(x, edge_index)
        #x = F.relu(x)
        #x = F.dropout(x, p=self.dropout)

        # returning only x allows to apply any task-specific post-processing (in this case: link prediction and node classification)
        return x 
    

# Train model with unsupervised loss 
Where model is trainned to fit training data 
 The function iterates over batches of data from train_loader. Each batch contains a subset of the entire training dataset
 
 For each batch, the model computes the node embeddings h, then calculates the embeddings for the source h_src and destination h_dst nodes of each edge. It then predicts whether an edge should exist between node pairs (pred)

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [30]:
def train(model, optimizer, train_loader, device):
    model.train() # set model into training mode, doesnt do anything else 
    
    total_loss = 0
    
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        h = model(data.x, data.edge_index)
        h_src = h[data.edge_label_index[0]]
        h_dst = h[data.edge_label_index[1]]
        
        pred = (h_src * h_dst).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(pred, data.edge_label)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.size(0)

    return total_loss / len(train_loader)


# Test method
The test function evaluates the model's performance on unseen data. 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [31]:
@torch.no_grad()
def test(model, data):
    model.eval()
    out = model(data.x, data.edge_index)

    clf = LogisticRegression()
    clf.fit(out[data.train_mask], data.y[data.train_mask])

    train_acc = clf.score(out[data.train_mask], data.y[data.train_mask]) 
    val_acc = clf.score(out[data.val_mask], data.y[data.val_mask]) 
    test_acc = clf.score(out[data.test_mask], data.y[data.test_mask])
    return train_acc, val_acc, test_acc

# TODO: this test maybe should return other things 

# Running code

## 1. Create GraphSAGE model for link prediction

In [32]:
model = GraphSAGE_local(in_channels = data.num_node_features,
                  hidden_channels= data.num_node_features,
                  out_channels = embedding_dimension,
                  dropout= dropout_rate,
                  aggr = aggregator,
                  normalization = normalization,
                  activation_function = activation_function,
                  bias = bias)

model = model.to(device)
print(model)

GraphSAGE_local(
  (conv1): SAGEConv(1433, 1433, aggr=mean)
  (conv2): SAGEConv(1433, 128, aggr=mean)
)


## 2. Training model for certain number for epochs and testing it 

In [33]:
times = []
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs+1):
    start = time.time()

    loss = train(model, optimizer, train_loader, device)

    train_acc, val_acc, test_acc = test(model, data)
    
    print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {train_acc*100:>6.2f}% | Val Acc: {val_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%')
    
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")


Epoch   0 | Train Loss: 441.883 | Train Acc:  82.14% | Val Acc: 32.40% | Test Acc: 31.60%
Epoch   1 | Train Loss: 410.926 | Train Acc:  81.43% | Val Acc: 34.20% | Test Acc: 33.70%
Epoch   2 | Train Loss: 393.371 | Train Acc:  83.57% | Val Acc: 34.80% | Test Acc: 35.30%
Epoch   3 | Train Loss: 370.712 | Train Acc:  83.57% | Val Acc: 35.40% | Test Acc: 37.80%
Epoch   4 | Train Loss: 361.608 | Train Acc:  85.00% | Val Acc: 34.40% | Test Acc: 38.40%
Epoch   5 | Train Loss: 352.054 | Train Acc:  85.00% | Val Acc: 36.40% | Test Acc: 39.80%
Epoch   6 | Train Loss: 343.511 | Train Acc:  84.29% | Val Acc: 36.40% | Test Acc: 41.00%
Epoch   7 | Train Loss: 340.351 | Train Acc:  83.57% | Val Acc: 38.00% | Test Acc: 41.40%
Epoch   8 | Train Loss: 340.431 | Train Acc:  84.29% | Val Acc: 38.20% | Test Acc: 42.50%
Epoch   9 | Train Loss: 335.183 | Train Acc:  85.00% | Val Acc: 38.20% | Test Acc: 43.30%
Epoch  10 | Train Loss: 337.397 | Train Acc:  86.43% | Val Acc: 37.40% | Test Acc: 43.70%
Median tim

Data(x=[2423, 1433], edge_index=[2, 7907], y=[2423], train_mask=[2423], val_mask=[2423], test_mask=[2423], n_id=[2423], e_id=[7907], input_id=[256], edge_label_index=[2, 512], edge_label=[512])
