In [219]:
!pip install --upgrade pip

!pip install --no-cache-dir torch==2.0.1+cpu torchvision==0.15.2+cpu torchaudio==2.0.2+cpu --index-url https://download.pytorch.org/whl/cpu

!pip install torchdata==0.6.1

!pip install dgl==1.1.2 -f https://data.dgl.ai/wheels/repo.html

!pip install numpy==1.26.4

Looking in indexes: https://download.pytorch.org/whl/cpu
Looking in links: https://data.dgl.ai/wheels/repo.html


In [220]:
import random
import networkx as nx
import numpy as np
import torch
import dgl
from dgl.data import CoraGraphDataset
from sklearn.metrics import roc_auc_score
from node2vec import Node2Vec
from sklearn.neural_network import MLPClassifier
import multiprocessing

In [221]:
# Load dataset
dataset = CoraGraphDataset()
graph = dataset[0]
print(graph)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [222]:
# Convert graph to networkx
g = nx.Graph(graph.to_networkx())
g.remove_edges_from(nx.selfloop_edges(g))
print("Nodes:", g.number_of_nodes())
print("Edges:", g.number_of_edges())
print("Connected: ", nx.is_connected(g))

Nodes: 2708
Edges: 5278
Connected:  False


In [223]:
# Keep largest connected component
largest_connected_component = max(nx.connected_components(g), key=len)
g = g.subgraph(largest_connected_component).copy()

print("Nodes:", g.number_of_nodes())
print("Edges:", g.number_of_edges())
print("Graph connected:", nx.is_connected(g))

Nodes: 2485
Edges: 5069
Graph connected: True


In [224]:
# Train test
edges = list(g.edges())
random.shuffle(edges)

num_test = int(0.1 * len(edges))

training_graph = g.copy()
test_pos_edges = []

for (u, v) in edges:
    if len(test_pos_edges) == num_test:
        break

    training_graph.remove_edge(u, v)

    if nx.is_connected(training_graph):
        test_pos_edges.append((u, v))
    else:
        training_graph.add_edge(u, v)

print("Training edges: ", training_graph.number_of_edges())
print("Positive test edges: ", len(test_pos_edges))
print("Training graph connected: ", nx.is_connected(training_graph))

Training edges:  4563
Positive test edges:  506
Training graph connected:  True


In [225]:
# Negative sampling
nodes = list(g.nodes())
test_neg_edges = set()

while len(test_neg_edges) < len(test_pos_edges):
    u, v = random.sample(nodes, 2)
    if g.has_edge(u, v):
        continue
    test_neg_edges.add((u, v))

test_neg_edges = list(test_neg_edges)

print("Negative test edges: ", len(test_neg_edges))

Negative test edges:  506


In [226]:
# Make to tensors
test_pos_u = torch.tensor([u for u, v in test_pos_edges])
test_pos_v = torch.tensor([v for u, v in test_pos_edges])

test_neg_u = torch.tensor([u for u, v in test_neg_edges])
test_neg_v = torch.tensor([v for u, v in test_neg_edges])

In [227]:
# Auc
def compute_auc(pos_scores, neg_scores):
    scores = torch.cat([pos_scores, neg_scores]).numpy()
    labels = np.concatenate([
        np.ones(len(pos_scores)), np.zeros(len(neg_scores))
    ])
    #print(labels)
    return roc_auc_score(labels, scores)

In [228]:
# Final check
print("Training graph is connected:", nx.is_connected(training_graph))
print("Length of positive edges is equal to negative edges:", len(test_pos_edges) == len(test_neg_edges))

Training graph is connected: True
Length of positive edges is equal to negative edges: True


In [229]:
################################### Heuristics #####################################
def common_neighbors(g, edges):
    scores = []
    for (u, v) in edges:
        cn = len(list(nx.common_neighbors(g, u, v)))
        scores.append(cn)
    return torch.tensor(scores)

pos_scores = common_neighbors(training_graph, test_pos_edges)
neg_scores = common_neighbors(training_graph, test_neg_edges)
auc = compute_auc(pos_scores, neg_scores)
print("Common Neighbours AUC:", auc)

def jaccard(g, edges):
    scores = []
    for (u, v) in edges:
        nu = set(g.neighbors(u))
        nv = set(g.neighbors(v))
        union = nu | nv
        if len(union) == 0:
            scores.append(0)
        else:
            scores.append(len(nu & nv) / len(union))
    return torch.tensor(scores)

pos_scores = jaccard(training_graph, test_pos_edges)
neg_scores = jaccard(training_graph, test_neg_edges)
auc = compute_auc(pos_scores, neg_scores)
print("Jaccard AUC:", auc)

def adamic_adar(g, edges):
    scores = []
    for (u, v) in edges:
        score = 0
        for w in nx.common_neighbors(g, u, v):
            deg = g.degree(w)
            if deg > 1:
                score += 1 / np.log(deg)
        scores.append(score)
    return torch.tensor(scores)

pos_scores = adamic_adar(training_graph, test_pos_edges)
neg_scores = adamic_adar(training_graph, test_neg_edges)
auc = compute_auc(pos_scores, neg_scores)
print("Adamic Adar AUC:", auc)

Common Neighbours AUC: 0.7586901841928478
Jaccard AUC: 0.7563897264447188
Adamic Adar AUC: 0.7613206736552672


In [230]:
def hadamard(u, v, embeddings):
    return embeddings[u] * embeddings[v]

In [231]:
def make_edge_dataset(pos_edges, neg_edges, embeddings):
    X = []
    y = []

    for u, v in pos_edges:
        X.append(hadamard(u, v, embeddings))
        y.append(1)

    for u, v in neg_edges:
        X.append(hadamard(u, v, embeddings))
        y.append(0)

    return np.array(X), np.array(y)


In [232]:
################################### Embeddings #####################################
node2vec = Node2Vec(training_graph, dimensions=128, walk_length=80, num_walks=10, workers=multiprocessing.cpu_count())
model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities: 100%|██████████| 2485/2485 [00:00<00:00, 8186.38it/s]
Generating walks (CPU: 1): 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
Generating walks (CPU: 2): 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
Generating walks (CPU: 3): 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Generating walks (CPU: 4): 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
Generating walks (CPU: 5): 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Generating walks (CPU: 6): 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Generating walks (CPU: 7): 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Generating walks (CPU: 11): 0it [00:00, ?it/s]
Generating walks (CPU: 8): 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
Generating walks (CPU: 9): 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Generating walks (CPU: 12): 0it [00:00, ?it/s]
Generating walks (CPU: 10): 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


In [233]:
embeddings = {}

for node in training_graph.nodes():
    embeddings[node] = model.wv[str(node)]

In [234]:
def sample_negative_edges(g, num_samples):
    neg_edges = set()
    nodes = list(g.nodes())
    while len(neg_edges) < num_samples:
        u, v = random.sample(nodes, 2)
        if g.has_edge(u, v):
            continue
        neg_edges.add((u, v))
    return list(neg_edges)


In [235]:
train_pos_edges = list(training_graph.edges())
train_neg_edges = sample_negative_edges(training_graph, len(train_pos_edges))

X_train, y_train = make_edge_dataset(train_pos_edges, train_neg_edges, embeddings)
X_test, y_test   = make_edge_dataset(test_pos_edges, test_neg_edges, embeddings)

print(X_train.shape, y_train.shape)


(9126, 128) (9126,)


In [236]:
clf = MLPClassifier(hidden_layer_sizes=(64,), max_iter=300, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,hidden_layer_sizes,"(64,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,300
,shuffle,True


In [237]:
y_score = clf.predict_proba(X_test)[:,1]
print("Test AUC:", roc_auc_score(y_test, y_score))

Test AUC: 0.9086417535034135
