https://medium.com/analytics-vidhya/ohmygraphs-graphsage-in-pyg-598b5ec77e7b

In [1]:
import torch
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from torch_geometric.loader import LinkNeighborLoader
import copy
from tqdm import tqdm 
from sklearn.linear_model import LogisticRegression
import time
from torch_geometric.transforms import RandomLinkSplit


In [2]:
if torch.backends.mps.is_available() and False:
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    device = torch.device('cpu')

# Data: CORA - small version 

In [3]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]
data = data.to(device)


## Visualize Information from graph

In [None]:
# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Training nodes: {sum(data.train_mask).item()}')
print(f'Evaluation nodes: {sum(data.val_mask).item()}')
print(f'Test nodes: {sum(data.test_mask).item()}')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')

# Hyperparameters

In [4]:
learning_rate = 0.0001 # variable to change/play around with for experiments
epochs = 10
aggregator = 'mean' # variable to change/play around with for experiments
dropout_rate = 0.4
normalization = True
activation_function = True
bias = True
batch =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128

# Split data 
https://medium.com/stanford-cs224w/a-tour-of-pygs-data-loaders-9f2384e48f8f

https://zqfang.github.io/2021-08-12-graph-linkpredict/ --> trasductie disjoint mode 

In [5]:
transform = RandomLinkSplit(
    num_val=0.2,  # fraction of data held out for validation
    num_test=0.2, # fraction of data held out for test
    is_undirected= True, 
    add_negative_train_samples= True, 
    neg_sampling_ratio= 1.0, # Need to include negative sampling edges, the edges not existed in the original graph.
    key = 'edge_label', 
    disjoint_train_ratio = 0.9
)
train_data, val_data, test_data = transform(data)
train_data, val_data, test_data =  train_data.to(device), val_data.to(device), test_data.to(device)

# NeighborLoader

In [6]:
train_loader = LinkNeighborLoader(train_data, 
                            num_neighbors=[neighborhood_1, neighborhood_2],
                            shuffle=True,
                            neg_sampling_ratio= 1.0, 
                            batch_size = batch)

## Visualize Information from sampling/subgraphs

In [None]:
# Print each subgraph
for i, subgraph in enumerate(train_loader):
    print(f'Subgraph {i}: {subgraph}')

# Make model for Link prediction

Initialization:
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.SAGEConv.html

https://medium.com/@juyi.lin/neighborloader-introduction-ccb870cc7294

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [7]:
class GraphSAGE_local(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout, aggr='mean', normalization = True, activation_function = True, bias = True):

        super().__init__()
        # as K = 2, we have 2 layers
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, out_channels = hidden_channels, project = activation_function, bias = bias)
        self.conv2 = SAGEConv(hidden_channels, out_channels = out_channels, project = activation_function, bias = bias, normalize = normalization)
        #self.conv3 = SAGEConv(128, out_channels = out_channels, normalize = normalization, project = activation_function, bias = bias)

    def forward(self, matrix_nodes_features, edge_index):
      # matrix_nodes_features is a matrix from the data where row = nodes, columns = feature
      # edge_index: This is a tensor that describes the connectivity of the graph. Each column in this matrix represents an edge. The first row contains the indices of the source nodes, and the second row contains the indices of the target nodes.
    
        x = self.conv1(matrix_nodes_features, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training = self.training)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training = self.training)

        #x = self.conv3(x, edge_index)
        #x = F.relu(x)
        #x = F.dropout(x, p=self.dropout)

        # returning only x allows to apply any task-specific post-processing (in this case: link prediction and node classification)
        return x 
    

# Train model with unsupervised loss 
Where model is trainned to fit training data 
 The function iterates over batches of data from train_loader. Each batch contains a subset of the entire training dataset
 
 For each batch, the model computes the node embeddings h, then calculates the embeddings for the source h_src and destination h_dst nodes of each edge. It then predicts whether an edge should exist between node pairs (pred)

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [8]:
def train(model, optimizer, train_loader, device):
    model.train() # set model into training mode, doesnt do anything else 
    
    total_loss = 0
    
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        
        h = model(batch.x, batch.edge_index)
        h_src = h[batch.edge_label_index[0]]
        h_dst = h[batch.edge_label_index[1]]
        
        pred = (h_src * h_dst).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(pred, batch.edge_label)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.size(0)

    return total_loss / len(train_loader)


# Test method
The test function evaluates the model's performance on unseen data. 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [9]:
@torch.no_grad()
def test(model, data):
    model.eval()
    out = model(data.x, data.edge_index)
    
    clf = LogisticRegression()
    clf.fit(out[data.train_mask], data.y[data.train_mask])

    train_acc = clf.score(out[data.train_mask], data.y[data.train_mask]) 
    val_acc = clf.score(out[data.val_mask], data.y[data.val_mask]) 
    test_acc = clf.score(out[data.test_mask], data.y[data.test_mask])
    return train_acc, val_acc, test_acc

# TODO: this test maybe should return other things 

# Running code

## 1. Create GraphSAGE model for link prediction

In [10]:
model = GraphSAGE_local(in_channels = data.num_node_features,
                  hidden_channels= data.num_node_features,
                  out_channels = embedding_dimension, # TODO: check results using 128 instead 
                  dropout= dropout_rate,
                  aggr = aggregator,
                  normalization = normalization,
                  activation_function = activation_function,
                  bias = bias)

model = model.to(device)
print(model)

GraphSAGE_local(
  (conv1): SAGEConv(1433, 1433, aggr=mean)
  (conv2): SAGEConv(1433, 128, aggr=mean)
)


## 2. Training model for certain number for epochs and testing it 

In [11]:
times = []
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs+1):
    start = time.time()
    loss = train(model, optimizer, train_loader, device)
    train_acc, val_acc, test_acc = test(model, data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
          f'Val: {val_acc:.4f}, Train: {train_acc:.4f}, Test: {test_acc:.4f}')
    
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")


Epoch: 000, Loss: 440.5940, Val: 0.3180, Train: 0.7357, Test: 0.3190
Epoch: 001, Loss: 408.4323, Val: 0.3300, Train: 0.7786, Test: 0.3260
Epoch: 002, Loss: 387.4183, Val: 0.3500, Train: 0.8214, Test: 0.3320
Epoch: 003, Loss: 372.9583, Val: 0.3440, Train: 0.8429, Test: 0.3400
Epoch: 004, Loss: 358.2499, Val: 0.3520, Train: 0.8357, Test: 0.3560
Epoch: 005, Loss: 352.2124, Val: 0.3740, Train: 0.8500, Test: 0.3640
Epoch: 006, Loss: 343.6900, Val: 0.3760, Train: 0.8643, Test: 0.3590
Epoch: 007, Loss: 339.8025, Val: 0.3780, Train: 0.8643, Test: 0.3570
Epoch: 008, Loss: 336.5369, Val: 0.3860, Train: 0.8571, Test: 0.3580
Epoch: 009, Loss: 337.3190, Val: 0.3880, Train: 0.8500, Test: 0.3550
Epoch: 010, Loss: 338.2898, Val: 0.4100, Train: 0.8500, Test: 0.3610
Median time per epoch: 0.3126s


Data(x=[2423, 1433], edge_index=[2, 7907], y=[2423], train_mask=[2423], val_mask=[2423], test_mask=[2423], n_id=[2423], e_id=[7907], input_id=[256], edge_label_index=[2, 512], edge_label=[512])


# Get results for test data

In [12]:
model.eval()
out = model(data.x, data.edge_index).to(device)

clf = LogisticRegression()
clf.fit(out[data.train_mask], data.y[data.train_mask])
val_acc = clf.score(out[data.val_mask], data.y[data.val_mask])
print(val_acc)

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.