Student: OUARDIGHI Omar \
Github Repo :


Some of the code is adapted from the following tutorial for link prediction: https://docs.dgl.ai/en/0.6.x/tutorials/blitz/4_link_predict.html#sphx-glr-tutorials-blitz-4-link-predict-py

In [1]:
!pip install pyg_lib torch_geometric torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cpu.html

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cpu.html


In [2]:
# install DGL in Colab
!pip install dgl -f https://data.dgl.ai/wheels/repo.html

Looking in links: https://data.dgl.ai/wheels/repo.html


In [3]:
import networkx as nx
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, roc_curve
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

import scipy.sparse as sp
import dgl

# Data Loading

In [4]:
G_fb = nx.read_edgelist("/content/1912.edges", create_using = nx.Graph(), nodetype = int)
num_edges = G_fb.number_of_edges()
print("Number of edges:", num_edges)

Number of edges: 1235


In [5]:
node_features = np.loadtxt("/content/1912.feat", dtype=float)
# Iterate over rows and add features to nodes in the graph
for node_id, *features in node_features:
    node_id = int(node_id)
    features = list(map(float, features))

    # Check if the node exists in the graph before modifying it
    if G_fb.has_node(node_id):
        G_fb.nodes[node_id]['features'] = features
    else:
        print(f"Node {node_id} does not exist in the graph.")


Node 1933 does not exist in the graph.
Node 1949 does not exist in the graph.
Node 1956 does not exist in the graph.
Node 1961 does not exist in the graph.
Node 1969 does not exist in the graph.
Node 2008 does not exist in the graph.
Node 58 does not exist in the graph.
Node 2048 does not exist in the graph.
Node 2051 does not exist in the graph.


In [6]:
# Convert the NetworkX graph to a DGL graph
g = dgl.from_networkx(G_fb)
node_features_dict = nx.get_node_attributes(G_fb, 'features')

# Ensure that node features are aligned with node IDs
node_features = [node_features_dict.get(node_id, [0.0]* 480) for node_id in G_fb.nodes()]

g.ndata['feat'] = torch.tensor(node_features, dtype=torch.float32)

## Edge Spilitting

In [7]:


# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.num_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.num_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.num_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.num_edges())
test_neg_u, test_neg_v = (
    neg_u[neg_eids[:test_size]],
    neg_v[neg_eids[:test_size]],
)
train_neg_u, train_neg_v = (
    neg_u[neg_eids[test_size:]],
    neg_v[neg_eids[test_size:]],
)

In [8]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [9]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.num_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.num_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.num_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.num_nodes())

# Modeling

In [10]:
from dgl.nn import SAGEConv
import torch.nn as nn

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, h_feats, "mean")

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [11]:
import dgl.function as fn


class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v("h", "h", "score"))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata["score"][:, 0]

In [12]:
train_g

Graph(num_nodes=147, num_edges=2223,
      ndata_schemes={'feat': Scheme(shape=(480,), dtype=torch.float32)}
      edata_schemes={})

In [13]:
model = GraphSAGE(train_g.ndata["feat"].shape[1], 16)

pred = DotPredictor()


def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    )
    return F.binary_cross_entropy_with_logits(scores, labels)


def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

In [14]:
import itertools

# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()), lr=0.01
)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata["feat"])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score

with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print("AUC", compute_auc(pos_score, neg_score))



In epoch 0, loss: 1.0254733562469482
In epoch 5, loss: 0.6855635643005371
In epoch 10, loss: 0.6629875302314758
In epoch 15, loss: 0.6120024919509888
In epoch 20, loss: 0.5624158382415771
In epoch 25, loss: 0.5396445989608765
In epoch 30, loss: 0.5092583298683167
In epoch 35, loss: 0.4851110577583313
In epoch 40, loss: 0.45005401968955994
In epoch 45, loss: 0.4125015139579773
In epoch 50, loss: 0.3971270024776459
In epoch 55, loss: 0.3890410363674164
In epoch 60, loss: 0.37836721539497375
In epoch 65, loss: 0.3723665773868561
In epoch 70, loss: 0.3676365613937378
In epoch 75, loss: 0.36239737272262573
In epoch 80, loss: 0.35784998536109924
In epoch 85, loss: 0.353302925825119
In epoch 90, loss: 0.3487794101238251
In epoch 95, loss: 0.34436407685279846
AUC 0.9419429920175711


In [15]:
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)

    # Assuming you have ground truth labels for positive and negative examples
    true_labels_pos = torch.ones_like(pos_score)
    true_labels_neg = torch.zeros_like(neg_score)

    all_true_labels = torch.cat([true_labels_pos, true_labels_neg], dim=0)
    all_scores = torch.cat([pos_score, neg_score], dim=0)

    # Threshold scores to get binary predictions
    predictions = (all_scores >= 0.5).float()

    # Calculate accuracy and precision
    correct_predictions = (predictions == all_true_labels).float()
    accuracy = torch.mean(correct_predictions).item()

    true_positive = torch.sum(correct_predictions * (all_true_labels == 1).float()).item()
    predicted_positive = torch.sum(predictions).item()



    print("AUC:", compute_auc(pos_score, neg_score))
    print("Accuracy:", accuracy)
    print("Precision:", precision)

AUC: 0.9419429920175711
Accuracy: 0.8825910687446594
Precision: 0.8436363636363636


## pGNNNet model

In [16]:
# Define the pGNNNet model
from pgnn_conv import pGNNConv
class pGNNNet(torch.nn.Module):
    def __init__(self,
                 in_channels,
                 num_hid=16,
                 mu=0.1,
                 p=2,
                 K=2,
                 dropout=0.5,
                 cached=True):
        super(pGNNNet, self).__init__()
        self.dropout = dropout
        self.lin1 = torch.nn.Linear(in_channels, num_hid)
        self.conv1 = pGNNConv(num_hid, 1, mu, p, K, cached=cached)  # Output size is 1 for binary classification

    def forward(self, x, edge_index, edge_weight=None):
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv1(x, edge_index, edge_weight)
        return x


In [17]:
test_data

NameError: ignored

In [None]:

# Instantiate the model
model = pGNNNet(in_channels=node_feature_matrix.shape[1], num_hid=16, mu=0.1, p=2, K=2, dropout=0.5)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Convert data to PyTorch DataLoader
train_loader = DataLoader(train_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)




In [None]:
model = pGNNNet(train_g.ndata["feat"].shape[1])

pred = DotPredictor()

In [None]:
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()), lr=0.01
)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata["feat"].float())
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score

with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print("AUC", compute_auc(pos_score, neg_score))
