In [1]:
import random
import networkx as nx
import numpy as np
import torch
from dgl.data import CoraGraphDataset
from sklearn.metrics import roc_auc_score
from node2vec import Node2Vec
from torch.nn.functional import embedding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
# The dataset object is indexed like a sequence; element 0 is the graph used
# by every later cell.
dataset = CoraGraphDataset()
graph = dataset[0]
# Printing the graph shows its node/edge counts and the node-data schemes.
print(graph)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [3]:
# Convert graph to undirected
# Wrapping the exported NetworkX graph in nx.Graph produces a simple undirected
# graph: reciprocal and parallel edges collapse into a single edge.
g = nx.Graph(graph.to_networkx())
# Remove self-loops so every remaining edge joins two distinct nodes.
g.remove_edges_from(nx.selfloop_edges(g))

In [4]:
# Keep largest connected component
# connected_components yields one node set per component; max(..., key=len)
# selects the one with the most nodes.
largest_connected_component = max(nx.connected_components(g), key=len)
# .copy() turns the subgraph into a standalone graph that later cells can copy
# and modify.
g = g.subgraph(largest_connected_component).copy()

# Sanity check: report the size of the retained component and confirm it is connected.
print("Nodes: ", g.number_of_nodes())
print("Edges: ", g.number_of_edges())
print("Connected: ", nx.is_connected(g))

Nodes:  2485
Edges:  5069
Connected:  True


In [5]:
# Train/test split
# Hold out roughly 10% of the edges as positive test examples. An edge is only
# held out if the training graph remains connected without it.

# Fixed seed so the shuffle below (and any later use of the global `random`
# generator) gives the same result on every Restart & Run All.
random.seed(42)

edges = list(g.edges())
random.shuffle(edges)

# Target number of held-out (positive test) edges: 10% of all edges.
num_test = int(0.1 * len(edges))

training_graph = g.copy()
test_pos_edges = []

for (u, v) in edges:
    if len(test_pos_edges) == num_test:
        break

    # Tentatively remove the edge; the removal is kept only if the graph
    # is still connected afterwards.
    training_graph.remove_edge(u, v)

    if nx.is_connected(training_graph):
        test_pos_edges.append((u, v))
    else:
        # (u, v) was a bridge: put it back so no node gets cut off.
        training_graph.add_edge(u, v)

print("Training edges: ", training_graph.number_of_edges())
print("Positive test edges: ", len(test_pos_edges))
print("Training graph connected: ", nx.is_connected(training_graph))

Training edges:  4563
Positive test edges:  506
Training graph connected:  True


In [6]:
# Negative sampling
# Draw as many node pairs without an edge in the full graph `g` as there are
# positive test edges, so the evaluation set is balanced.
nodes = list(g.nodes())
test_neg_edges = set()

while len(test_neg_edges) < len(test_pos_edges):
    # random.sample picks two distinct nodes, so self-pairs cannot occur.
    u, v = random.sample(nodes, 2)
    if g.has_edge(u, v):
        continue
    # g is undirected, so (u, v) and (v, u) are the same pair. Store the pair
    # in canonical order so the set cannot hold the same non-edge twice.
    test_neg_edges.add((min(u, v), max(u, v)))

# Sorted so the resulting list has a stable, reproducible order.
test_neg_edges = sorted(test_neg_edges)

print("Negative test edges: ", len(test_neg_edges))

Negative test edges:  506


In [7]:
# Convert to tensors
# Split each edge list into parallel tensors: one of first endpoints (u) and
# one of second endpoints (v).
test_pos_u = torch.tensor([src for src, _ in test_pos_edges])
test_pos_v = torch.tensor([dst for _, dst in test_pos_edges])

test_neg_u = torch.tensor([src for src, _ in test_neg_edges])
test_neg_v = torch.tensor([dst for _, dst in test_neg_edges])

In [8]:
# AUC
def compute_auc(pos_scores, neg_scores):
    """Compute the ROC AUC of scores that should rank positives above negatives.

    Args:
        pos_scores: Tensor of scores for the positive (true) edges.
        neg_scores: Tensor of scores for the negative (sampled) edges.

    Returns:
        The result of ``roc_auc_score`` with positive edges labelled 1 and
        negative edges labelled 0.
    """
    # detach()/cpu() keep the NumPy conversion from raising when the scores
    # track gradients or live on an accelerator; for plain CPU tensors they
    # are no-ops.
    scores = torch.cat([pos_scores, neg_scores]).detach().cpu().numpy()
    # Labels follow the concatenation order: positives first, then negatives.
    labels = np.concatenate([
        np.ones(len(pos_scores)), np.zeros(len(neg_scores))
    ])
    return roc_auc_score(labels, scores)

In [9]:
# Final check
# Guard the evaluation setup before scoring: the training graph must be
# connected, and there must be as many negative as positive test edges.
assert nx.is_connected(training_graph)
assert len(test_neg_edges) == len(test_pos_edges)

In [11]:
################################### Heuristics #####################################
def common_neighbors(g, edges):
    """Score each node pair by its number of common neighbours in ``g``.

    Args:
        g: NetworkX graph in which neighbourhoods are looked up.
        edges: Iterable of ``(u, v)`` node pairs to score.

    Returns:
        A tensor with one common-neighbour count per pair, in input order.
    """
    counts = [
        sum(1 for _ in nx.common_neighbors(g, u, v))
        for (u, v) in edges
    ]
    return torch.tensor(counts)

# Score the positive and negative test pairs on the training graph, then
# report how well the common-neighbour count separates them.
pos_scores, neg_scores = (
    common_neighbors(training_graph, pairs)
    for pairs in (test_pos_edges, test_neg_edges)
)
auc = compute_auc(pos_scores, neg_scores)
print("Common Neighbours AUC:", auc)

def jaccard(g, edges):
    """Score each node pair by the Jaccard coefficient of its neighbourhoods.

    The score is ``|N(u) & N(v)| / |N(u) | N(v)|``; a pair whose combined
    neighbourhood is empty scores 0.

    Args:
        g: Graph object exposing ``neighbors(node)``.
        edges: Iterable of ``(u, v)`` node pairs to score.

    Returns:
        A tensor with one score per pair, in input order.
    """
    scores = []
    for (u, v) in edges:
        neigh_u = set(g.neighbors(u))
        neigh_v = set(g.neighbors(v))
        combined = neigh_u | neigh_v
        # Guard against dividing by zero when neither node has a neighbour.
        scores.append(len(neigh_u & neigh_v) / len(combined) if combined else 0)
    return torch.tensor(scores)

# Score the positive and negative test pairs on the training graph, then
# report how well the Jaccard coefficient separates them.
pos_scores, neg_scores = (
    jaccard(training_graph, pairs)
    for pairs in (test_pos_edges, test_neg_edges)
)
auc = compute_auc(pos_scores, neg_scores)
print("Jaccard AUC:", auc)

def adamic_adar(g, edges):
    """Score each node pair with the Adamic-Adar index.

    Each common neighbour ``w`` contributes ``1 / log(degree(w))``; common
    neighbours with degree 1 or less are skipped because ``log`` would be
    zero or undefined for them.

    Args:
        g: NetworkX graph in which neighbourhoods and degrees are looked up.
        edges: Iterable of ``(u, v)`` node pairs to score.

    Returns:
        A tensor with one score per pair, in input order.
    """
    scores = []
    for (u, v) in edges:
        shared_degrees = (g.degree(w) for w in nx.common_neighbors(g, u, v))
        scores.append(sum(1 / np.log(d) for d in shared_degrees if d > 1))
    return torch.tensor(scores)

# Score the positive and negative test pairs on the training graph, then
# report how well the Adamic-Adar index separates them.
pos_scores, neg_scores = (
    adamic_adar(training_graph, pairs)
    for pairs in (test_pos_edges, test_neg_edges)
)
auc = compute_auc(pos_scores, neg_scores)
print("Adamic Adar AUC:", auc)

Common Neighbours AUC: 0.7793825868237279
Jaccard AUC: 0.7780175444078176
Adamic Adar AUC: 0.7802242653376869


In [12]:
################################### Embeddings #####################################
# Train the random-walk embeddings on training_graph, NOT on g: g still
# contains the held-out positive test edges, so walking it would leak the
# links being predicted into the embeddings and inflate the AUC.
# training_graph has the same node set as g (only edges were removed), so every
# test node still gets an embedding.
node2vec = Node2Vec(training_graph, dimensions=64, walk_length=30, num_walks=200, workers=2)
# window / min_count are passed to fit() as given; min_count=1 is presumably so
# that no node is dropped from the vocabulary — TODO confirm against node2vec docs.
model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities: 100%|██████████| 2485/2485 [00:01<00:00, 2115.38it/s]
Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

Error processing line 1 of /home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site-packages/distutils-precedence.pth:

  Traceback (most recent call last):
    File "/home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site.py", line 195, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named '_distutils_hack'

Remainder of file ignored
  Traceback (most recent call last):
    File "/home/thomas/miniconda3/envs/LinkPrediction/lib/python3.10/site.py", line