This work is highly inspired in the following repositories: 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/ogbn_products_sage.py

https://github.com/PacktPublishing/Hands-On-Graph-Neural-Networks-Using-Python/blob/main/Chapter08/chapter8.ipynb 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py


# Imports

In [2]:
import torch
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from torch_geometric.loader import NeighborLoader
import time
from sklearn.metrics import f1_score 

In [3]:
if torch.backends.mps.is_available() and False:
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    device = torch.device('cpu')

# Data: CORA - small version 

In [4]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]
data = data.to(device)

## Visualize Information from graph

In [5]:
# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Training nodes: {sum(data.train_mask).item()}')
print(f'Evaluation nodes: {sum(data.val_mask).item()}')
print(f'Test nodes: {sum(data.test_mask).item()}')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')

Dataset: Cora()
-------------------
Number of graphs: 1
Number of nodes: 2708
Number of features: 1433
Number of classes: 7

Graph:
------
Training nodes: 140
Evaluation nodes: 500
Test nodes: 1000
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


# Hyperparameters

In [29]:
learning_rate = 0.01 # variable to change/play around with for experiments --> 0.0001
epochs = 10
aggregator = 'mean' # variable to change/play around with for experiments
dropout_rate = 0.4
normalization = True
activation_function = True
bias = True
batch =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128
num_negative_samples = 20 
weight_decay = 0.0005 #0.0 

AttributeError: 'GlobalStorage' object has no attribute 'edge_label'

# NeighborLoader

In [31]:
train_loader = NeighborLoader(data, 
                            input_nodes=data.train_mask, # ensure that the sampling only happens in the training set 
                            batch_size = batch,
                            num_neighbors=[neighborhood_1, neighborhood_2], 
                            shuffle=True)

# Make model for Link prediction

Initialization:
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.SAGEConv.html

https://medium.com/@juyi.lin/neighborloader-introduction-ccb870cc7294

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

https://github.com/PacktPublishing/Hands-On-Graph-Neural-Networks-Using-Python/blob/main/Chapter08/chapter8.ipynb

In [6]:
def accuracy(pred_y, y): 
    return ((pred_y == y).sum()/len(y)).item()

In [36]:
class GraphSAGE_local(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout, aggr='mean', normalization = True, activation_function = True, bias = True):

        super().__init__()
        # as K = 2, we have 2 layers
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, out_channels = hidden_channels, project = activation_function, bias = bias)
        self.conv2 = SAGEConv(hidden_channels, out_channels = out_channels, project = activation_function, bias = bias, normalization = normalization)
    

    def forward(self, matrix_nodes_features, edge_index):
      # matrix_nodes_features is a matrix from the data where row = nodes, columns = feature
      # edge_index: This is a tensor that describes the connectivity of the graph. Each column in this matrix represents an edge. The first row contains the indices of the source nodes, and the second row contains the indices of the target nodes.
    
        h = self.conv1(matrix_nodes_features, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=self.dropout, training = self.training)

        h = self.conv2(h, edge_index)
        h = F.relu(h) 
        h = F.dropout(h,  p=self.dropout, training = self.training)
        #h = F.log_softmax(h, dim = 1)
        return h
    
    def fit(self, loader, epochs):
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        #criterion = UnsupervisedGraphSAGELoss(margin=1.0, negative_sample_size=num_negative_samples)
        self.train()
        times = []

        for epoch in range(epochs+1):
            start = time.time()
            train_loss, train_acc, val_loss, val_acc = 0, 0, 0, 0 
            
            for batch in loader:
                print(batch.edge_label)
                batch = batch.to(device)
                optimizer.zero_grad()
                embeddings = self.forward(batch.x, batch.edge_index) # obtain the probability of belonging to each class or label for each node 
                
                # TODO 
                z_u = embeddings  # Target nodes
                z_v = 0  # Context nodes (positive samples), to be obtained as per your data
                z_neg_v = 0

                # Calculate the contrastive loss
                loss =  0
                
                # Backpropagation
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            
            avg_loss = total_loss / len(loader)
            print(f"Epoch {epoch}: Loss {avg_loss}")
        
    @torch.no_grad()
    def test(self, data):
        self.eval()
        out = self(data.x, data.edge_index)
        y = data.y[data.test_mask]
        y_prediction = out.argmax(dim = 1)[data.test_mask]

        acc = accuracy(y_prediction, y)
        f1_macro = f1_score(y, y_prediction, average = 'macro')
        f1_micro =  f1_score(y, y_prediction, average = 'micro')
        return acc, f1_macro, f1_micro

# Test method
The test function evaluates the model's performance on unseen data. 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

# Running code

# 1. Create model for node classification


In [37]:
labels_all_nodes = data.y
number_classes  = labels_all_nodes.unique().size(0)

model = GraphSAGE_local(in_channels = data.num_node_features,
                  hidden_channels= data.num_node_features,
                  out_channels = embedding_dimension,
                  dropout= dropout_rate,
                  aggr = aggregator,
                  normalization = normalization,
                  activation_function = activation_function,
                  bias = bias)
model.to(device)
print(model)

GraphSAGE_local(
  (conv1): SAGEConv(1433, 1433, aggr=mean)
  (conv2): SAGEConv(1433, 128, aggr=mean)
)


# 2. Training mdel for certain number of epochs and testing it 

In [38]:
model.fit(train_loader, epochs)


AttributeError: 'GlobalStorage' object has no attribute 'edge_label'

## 3. Calculate accuracy on test data


In [17]:
acc, f1_macro, f1_micro = model.test(data)
print(f'Model accuarcy: {acc*100:.2f}%' )
print(f'Model f1 Macro: {f1_macro:.2f}%' )
print(f'Model f1 Micro: {f1_micro:.2f}%' )

Model accuarcy: 59.50%
Model f1 Macro: 0.47%
Model f1 Micro: 0.59%
