https://medium.com/analytics-vidhya/ohmygraphs-graphsage-in-pyg-598b5ec77e7b

In [1]:
import torch
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from torch_geometric.loader import LinkNeighborLoader
import copy
from tqdm import tqdm 
from sklearn.linear_model import LogisticRegression
import time
from torch_geometric.transforms import RandomLinkSplit


In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


# Data: CORA

In [39]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]
data = data.to(device)


# Split data 
https://medium.com/stanford-cs224w/a-tour-of-pygs-data-loaders-9f2384e48f8f

https://zqfang.github.io/2021-08-12-graph-linkpredict/ --> trasductie disjoint mode 

In [38]:
transform = RandomLinkSplit(
    num_val=0.2,  # fraction of data held out for validation
    num_test=0.2, # fraction of data held out for test
    is_undirected= True, 
    add_negative_train_samples= True, 
    neg_sampling_ratio= 1.0, # Need to include negative sampling edges, the edges not existed in the original graph.
    key = 'edge_label', 
    disjoint_train_ratio = 0.9
)
train_data, val_data, test_data = transform(data)

# Hyperparameters

In [25]:
learning_rate = 0.0001 # variable to change/play around with for experiments
epochs = 10
aggregator = 'mean' # variable to change/play around with for experiments
dropout_rate = 0.4
normalization = True
activation_function = True
bias = True
batch =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128

# NeighborLoader

In [26]:
train_loader = LinkNeighborLoader(train_data, 
                            num_neighbors=[neighborhood_1, neighborhood_2],
                            shuffle=True,
                            neg_sampling_ratio= 1.0, 
                            batch_size = batch)

# TODO: not sure if subgraph is needed
subgraph_loader = LinkNeighborLoader(copy.copy(train_data), 
                                num_neighbors=[-1], 
                                shuffle=True,
                               neg_sampling_ratio= 1.0, 
                                batch_size = batch )

# No need to maintain these features during evaluation:
del subgraph_loader.data.x, subgraph_loader.data.y
# Add global node index information.
subgraph_loader.data.num_nodes = data.num_nodes
subgraph_loader.data.n_id = torch.arange(data.num_nodes)

# Make model for Link prediction

Initialization:
https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.SAGEConv.html

https://medium.com/@juyi.lin/neighborloader-introduction-ccb870cc7294

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [27]:
class GraphSAGE_local(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout, aggr='mean', normalization = True, activation_function = True, bias = True):

        super().__init__()
        # as K = 2, we have 2 layers
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, out_channels = hidden_channels, project = activation_function, bias = bias)
        self.conv2 = SAGEConv(hidden_channels, out_channels = out_channels, project = activation_function, bias = bias)
        #self.conv3 = SAGEConv(128, out_channels = out_channels, normalize = normalization, project = activation_function, bias = bias)


    def forward(self, matrix_nodes_features, edge_index):
      # matrix_nodes_features is a matrix from the data where row = nodes, columns = feature
      # edge_index: This is a tensor that describes the connectivity of the graph. Each column in this matrix represents an edge. The first row contains the indices of the source nodes, and the second row contains the indices of the target nodes.
    
        x = self.conv1(matrix_nodes_features, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout)

        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout)

        #x = self.conv3(x, edge_index)
        #x = F.relu(x)
        #x = F.dropout(x, p=self.dropout)

        # returning only x allows to apply any task-specific post-processing (in this case: link prediction and node classification)
        return x 
    
    @torch.no_grad()
    def inference(self, x_all, subgraph_loader):
        # designed to handle large graphs that don't fit into memory all at once. 
        # Instead of processing the entire graph at once, it processes the graph in smaller subgraphs or batches
        
        for i, conv in enumerate([self.conv1, self.conv2, self.conv3]):
            xs = []
            for batch in subgraph_loader:
                x = x_all[batch.n_id.to(x_all.device)]
                x = conv(x, batch.edge_index) 
                #x = x_all[batch.n_id.to(x_all.device)].to(device)
                #x = conv(x, batch.edge_index.to(device))
                if i < len(self.convs) - 1:
                    x = x.relu_()
                xs.append(x[:batch.batch_size].cpu()) # we only need the representations of the target nodes
            x_all = torch.cat(xs, dim=0)
        return x_all

# Create model for link prediction


In [28]:
labels_all_nodes = data.y
number_classes  = labels_all_nodes.unique().size(0)

model = GraphSAGE_local(in_channels = data.num_node_features,
                  hidden_channels= data.num_node_features,
                  out_channels = number_classes, # TODO: check results using 128 instead 
                  dropout= dropout_rate,
                  aggr = aggregator,
                  normalization = normalization,
                  activation_function = activation_function,
                  bias = bias)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#model.to(device)

# Train model with unsupervised loss 
Where model is trainned to fit training data 
 The function iterates over batches of data from train_loader. Each batch contains a subset of the entire training dataset
 For each batch, the model computes the node embeddings h, then calculates the embeddings for the source h_src and destination h_dst nodes of each edge. It then predicts whether an edge should exist between node pairs (pred)

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [29]:
def train(epoch):
    model.train() # set model into training mode, doesnt do anything else 
    
    total_loss = 0
    
    for batch in train_loader:
        #batch = batch.to(device)
        optimizer.zero_grad()
        
        h = model(batch.x, batch.edge_index)
        h_src = h[batch.edge_label_index[0]]
        h_dst = h[batch.edge_label_index[1]]
        pred = (h_src * h_dst).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(pred, batch.edge_label)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.size(0)

    return total_loss / data.num_nodes


# Test method
The test function evaluates the model's performance on unseen data. 

https://github.com/pyg-team/pytorch_geometric/blob/master/examples/graph_sage_unsup.py

In [30]:
@torch.no_grad()
def test():
    model.eval()
    out = model(train_data.x, train_data.edge_index).cpu()

    clf = LogisticRegression()
    clf.fit(out[data.train_mask], data.y[data.train_mask])

    train_acc = clf.score(out, train_data.y) #TODO: fix this data 
    val_acc = clf.score(out, train_data.y) # TODO: fix this data 

    return train_acc, val_acc

times = []

In [None]:
@torch.no_grad()
def test():
    model.eval()
    out = model(data.x, data.edge_index).cpu()

    clf = LogisticRegression()
    clf.fit(out[data.train_mask], data.y[data.train_mask])

    train_acc = clf.score(out[data.train_mask], data.y[data.train_mask]) #TODO: fix this data 
    val_acc = clf.score(out[data.val_mask], data.y[data.val_mask]) # TODO: fix this data 

    return train_acc, val_acc

times = []

# Run training

In [31]:
times = []
for epoch in range(1, epochs):
    start = time.time()
    loss = train(epoch)
    train_acc, val_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
          f'Val: {val_acc:.4f}, Train: {train_acc:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")


Epoch: 001, Loss: 3.2272, Val: 0.1935, Train: 0.1935
Epoch: 002, Loss: 3.2028, Val: 0.1370, Train: 0.1370
Epoch: 003, Loss: 3.1907, Val: 0.1551, Train: 0.1551
Epoch: 004, Loss: 3.1608, Val: 0.1702, Train: 0.1702
Epoch: 005, Loss: 3.1326, Val: 0.2208, Train: 0.2208
Epoch: 006, Loss: 3.0997, Val: 0.2153, Train: 0.2153
Epoch: 007, Loss: 3.0658, Val: 0.2223, Train: 0.2223
Epoch: 008, Loss: 3.0370, Val: 0.2057, Train: 0.2057
Epoch: 009, Loss: 3.0359, Val: 0.2530, Train: 0.2530
Median time per epoch: 2.0081s


In [None]:
train_data.edge_label 

Data(x=[2423, 1433], edge_index=[2, 7907], y=[2423], train_mask=[2423], val_mask=[2423], test_mask=[2423], n_id=[2423], e_id=[7907], input_id=[256], edge_label_index=[2, 512], edge_label=[512])


# Get results for test data

In [None]:
model.eval()
out = model(data.x, data.edge_index).to(device)

clf = LogisticRegression()
clf.fit(out[data.train_mask], data.y[data.train_mask])
val_acc = clf.score(out[data.val_mask], data.y[data.val_mask])
print(val_acc)