In [71]:
!pip install dgl==2.4.0 -f https://data.dgl.ai/wheels/torch-2.4/repo.html

Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

  Traceback (most recent call last):
    File "/home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site.py", line 195, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named '_distutils_hack'

Remainder of file ignored
Looking in links: https://data.dgl.ai/wheels/torch-2.4/repo.html

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [72]:
import random
import networkx as nx
import numpy as np
import torch
import dgl
from dgl.data import CoraGraphDataset
from sklearn.metrics import roc_auc_score
from node2vec import Node2Vec
from sklearn.neural_network import MLPClassifier
from torch.nn.functional import embedding

In [73]:
# Load dataset
dataset = CoraGraphDataset()
graph = dataset[0]
print(graph)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [96]:
# Convert graph to undirected
g = nx.Graph(graph.to_networkx())
g.remove_edges_from(nx.selfloop_edges(g))
print("Connected: ", nx.is_connected(g))

Connected:  False


In [75]:
# Keep largest connected component
largest_connected_component = max(nx.connected_components(g), key=len)
g = g.subgraph(largest_connected_component).copy()

print("Nodes: ", g.number_of_nodes())
print("Edges: ", g.number_of_edges())
print("Connected: ", nx.is_connected(g))

Nodes:  2485
Edges:  5069
Connected:  True


In [76]:
# Train test
edges = list(g.edges())
random.shuffle(edges)

num_test = int(0.1 * len(edges))

training_graph = g.copy()
test_pos_edges = []

for (u, v) in edges:
    if len(test_pos_edges) == num_test:
        break

    training_graph.remove_edge(u, v)

    if nx.is_connected(training_graph):
        test_pos_edges.append((u, v))
    else:
        training_graph.add_edge(u, v)

print("Training edges: ", training_graph.number_of_edges())
print("Positive test edges: ", len(test_pos_edges))
print("Training graph connected: ", nx.is_connected(training_graph))

Training edges:  4563
Positive test edges:  506
Training graph connected:  True


In [77]:
# Negative sampling
nodes = list(g.nodes())
test_neg_edges = set()

while len(test_neg_edges) < len(test_pos_edges):
    u, v = random.sample(nodes, 2)
    if g.has_edge(u, v):
        continue
    test_neg_edges.add((u, v))

test_neg_edges = list(test_neg_edges)

print("Negative test edges: ", len(test_neg_edges))

Negative test edges:  506


In [78]:
# Make to tensors
test_pos_u = torch.tensor([u for u, v in test_pos_edges])
test_pos_v = torch.tensor([v for u, v in test_pos_edges])

test_neg_u = torch.tensor([u for u, v in test_neg_edges])
test_neg_v = torch.tensor([v for u, v in test_neg_edges])

In [99]:
# Auc
def compute_auc(pos_scores, neg_scores):
    scores = torch.cat([pos_scores, neg_scores]).numpy()
    labels = np.concatenate([
        np.ones(len(pos_scores)), np.zeros(len(neg_scores))
    ])
    #print(labels)
    return roc_auc_score(labels, scores)

In [80]:
# Final check
assert nx.is_connected(training_graph)
assert len(test_neg_edges) == len(test_pos_edges)

In [100]:
################################### Heuristics #####################################
def common_neighbors(g, edges):
    scores = []
    for (u, v) in edges:
        cn = len(list(nx.common_neighbors(g, u, v)))
        scores.append(cn)
    return torch.tensor(scores)

pos_scores = common_neighbors(training_graph, test_pos_edges)
neg_scores = common_neighbors(training_graph, test_neg_edges)
auc = compute_auc(pos_scores, neg_scores)
print("Common Neighbours AUC:", auc)

def jaccard(g, edges):
    scores = []
    for (u, v) in edges:
        nu = set(g.neighbors(u))
        nv = set(g.neighbors(v))
        union = nu | nv
        if len(union) == 0:
            scores.append(0)
        else:
            scores.append(len(nu & nv) / len(union))
    return torch.tensor(scores)

pos_scores = jaccard(training_graph, test_pos_edges)
neg_scores = jaccard(training_graph, test_neg_edges)
auc = compute_auc(pos_scores, neg_scores)
print("Jaccard AUC:", auc)

def adamic_adar(g, edges):
    scores = []
    for (u, v) in edges:
        score = 0
        for w in nx.common_neighbors(g, u, v):
            deg = g.degree(w)
            if deg > 1:
                score += 1 / np.log(deg)
        scores.append(score)
    return torch.tensor(scores)

pos_scores = adamic_adar(training_graph, test_pos_edges)
neg_scores = adamic_adar(training_graph, test_neg_edges)
auc = compute_auc(pos_scores, neg_scores)
print("Adamic Adar AUC:", auc)

[1. 1. 1. ... 0. 0. 0.]
Common Neighbours AUC: 0.7515505632020497
[1. 1. 1. ... 0. 0. 0.]
Jaccard AUC: 0.7510018122451529
[1. 1. 1. ... 0. 0. 0.]
Adamic Adar AUC: 0.7523824774641066


In [82]:
def hadamard(u, v, embeddings):
    return embeddings[u] * embeddings[v]

In [83]:
def make_edge_dataset(pos_edges, neg_edges, embeddings):
    X = []
    y = []

    for u, v in pos_edges:
        X.append(hadamard(u, v, embeddings))
        y.append(1)

    for u, v in neg_edges:
        X.append(hadamard(u, v, embeddings))
        y.append(0)

    return np.array(X), np.array(y)


In [84]:
################################### Embeddings #####################################
node2vec = Node2Vec(training_graph, dimensions=64, walk_length=30, num_walks=200, workers=12)
model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities: 100%|██████████| 2485/2485 [00:00<00:00, 3765.37it/s]
Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

  Traceback (most recent call last):
    File "/home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site.py", line 195, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named '_distutils_hack'

Remainder of file ignored
Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

  Traceback (most recent call last):
    File "/home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site.py", line 195, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named '_distutils_hack'

Remainder of file ignored
Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-p

In [85]:
embeddings = {}

for node in g.nodes():
    embeddings[node] = model.wv[str(node)]

In [86]:
def sample_negative_edges(g, num_samples):
    neg_edges = set()
    nodes = list(g.nodes())
    while len(neg_edges) < num_samples:
        u, v = random.sample(nodes, 2)
        if g.has_edge(u, v):
            continue
        neg_edges.add((u, v))
    return list(neg_edges)


In [87]:
train_pos_edges = list(training_graph.edges())
train_neg_edges = sample_negative_edges(training_graph, len(train_pos_edges))

X_train, y_train = make_edge_dataset(train_pos_edges, train_neg_edges, embeddings)
X_test, y_test   = make_edge_dataset(test_pos_edges, test_neg_edges, embeddings)

print(X_train.shape, y_train.shape)


(9126, 64) (9126,)


In [88]:
clf = MLPClassifier(hidden_layer_sizes=(64,), max_iter=300, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,hidden_layer_sizes,"(64,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,300
,shuffle,True


In [89]:
y_score = clf.predict_proba(X_test)[:,1]
print("Test AUC:", roc_auc_score(y_test, y_score))

Test AUC: 0.9141448858754238


In [90]:
g = dataset[0]

print(g)

Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [91]:
def score_edges(h, edges):
    u, v = edges
    return (h[u] * h[v]).sum(dim=1)  # dot product

In [92]:
def train(g, model, features,epochs=100, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    edges = g.edges()
    num_edges = g.num_edges()

    for epoch in range(epochs):
        model.train()

        # Node embeddings
        h = model(g, features)

        # Positive edges
        pos_u, pos_v = edges

        # Negative sampling (online!)
        neg_u, neg_v = dgl.sampling.global_uniform_negative_sampling(
            g, num_edges
        )

        pos_score = score_edges(h, (pos_u, pos_v))
        neg_score = score_edges(h, (neg_u, neg_v))

        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([
            torch.ones_like(pos_score),
            torch.zeros_like(neg_score)
        ])

        loss = loss_fn(scores, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 5 == 0:
            print(f"Epoch {epoch} | Loss: {loss.item():.4f}")

In [93]:
def eval_auc(model, g_train, features):
    model.eval()
    with torch.no_grad():
        h = model(g_train, features)

        pos_score = torch.sigmoid(score_edges(h, (test_pos_u, test_pos_v)))
        neg_score = torch.sigmoid(score_edges(h, (test_neg_u, test_neg_v)))

        auc = compute_auc(pos_score, neg_score)
        print("GCN Link Prediction Test AUC:", auc)


In [94]:
train_g_dgl = dgl.from_networkx(training_graph)
train_g_dgl = dgl.add_self_loop(train_g_dgl)

train_g_dgl.ndata["feat"] = g.ndata["feat"][list(training_graph.nodes())]


In [95]:
from dgl.nn.pytorch import GraphConv


class GCN(torch.nn.Module):
    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, h_feats)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = torch.relu(h)
        h = self.conv2(g, h)
        return h


# Create the model with given dimensions
in_feats = train_g_dgl.ndata["feat"].shape[1]
model = GCN(in_feats, 64)

train(train_g_dgl, model, train_g_dgl.ndata["feat"])
eval_auc(model, train_g_dgl, train_g_dgl.ndata["feat"])

Epoch 0 | Loss: 0.6930
Epoch 5 | Loss: 0.6782
Epoch 10 | Loss: 0.6630
Epoch 15 | Loss: 0.6280
Epoch 20 | Loss: 0.5529
Epoch 25 | Loss: 0.5089
Epoch 30 | Loss: 0.4911
Epoch 35 | Loss: 0.4851
Epoch 40 | Loss: 0.4806
Epoch 45 | Loss: 0.4733
Epoch 50 | Loss: 0.4629
Epoch 55 | Loss: 0.4633
Epoch 60 | Loss: 0.4621
Epoch 65 | Loss: 0.4563
Epoch 70 | Loss: 0.4567
Epoch 75 | Loss: 0.4584
Epoch 80 | Loss: 0.4603
Epoch 85 | Loss: 0.4563
Epoch 90 | Loss: 0.4487
Epoch 95 | Loss: 0.4532


IndexError: index 2518 is out of bounds for dimension 0 with size 2485