# **Graph Link Prediction**

# 1. Preprocessing

## 1.1. Libraries

In [3]:
import torch
import torch.nn as nn
import dgl
import random
import numpy as np
import networkx as nx
from karateclub import Node2Vec
from dgl.data import CoraGraphDataset
from sklearn.metrics import roc_auc_score
import torch.optim as optim
from sklearn.model_selection import train_test_split

## 1.2. Load dataset

Φορτώνουμε το **CoraGraphDataset** dataset απο την βιβλιοθήκη **DGL**, το μετατρέπουμε σε **undirected graph** και παίρνουμε το μεγαλύτερο **connected component**

In [4]:
dataset = CoraGraphDataset()
g = dataset[0]

nx_g = g.to_networkx()
G_und = nx_g.to_undirected()

G_simple = nx.Graph(G_und)

components = list(nx.connected_components(G_simple))
giant = max(components, key=len)
print("Original nodes:", g.num_nodes(), "Original edges:", g.num_edges())
print("Number of components (undirected):", len(components))
print("Largest component size:", len(giant))

G = G_simple.subgraph(giant).copy()

print("Nodes in largest component (undirected):", G.number_of_nodes())
print("Edges in largest component (undirected):", G.number_of_edges())


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Original nodes: 2708 Original edges: 10556
Number of components (undirected): 78
Largest component size: 2485
Nodes in largest component (undirected): 2485
Edges in largest component (undirected): 5069


## 1.3. G_train preparation

Παίρνουμε προτεινόμενες ακμές από το γραφήμα **G** κάνοντας ελεγχος για **self-loops**, **multiple edges** και εάν έχει **bridges** μεταξύ ακμών

In [5]:
# Check for self-loops and multiple edges for candidate edges
edges = set()

for u,v in G.edges():
    if u != v:
        edges.add((min(u, v), max(u, v)))

# Get G's bridges
bridges = set()
for u,v in nx.bridges(G):
    edge = (min(u, v), max(u, v))
    bridges.add(edge)

# Check if candidate edges are bridges
candidate_edges = []

for e in edges:
    if e not in bridges:
        candidate_edges.append(e)

candidate_num_edges = len(candidate_edges)
percentage_to_remove = 0.10

num_to_remove = int(candidate_num_edges * percentage_to_remove)

edges_removed = []

G_train = G.copy()

candidate_edges_shuffled = candidate_edges.copy()
random.shuffle(candidate_edges_shuffled)

Στην συνέχεια αφαιρούμε 10% των προτεινόμενων ακμών στο **G_Train** και μετα την αφαίρεση ακμών ελέγχουμε εάν το υπολειπόμενο γράφημα **G_Train** παραμένει **connected**

In [6]:
for u, v in candidate_edges_shuffled:
    if len(edges_removed) == num_to_remove:
        break

    # Check degree first
    if G_train.degree(u) <= 1 or G_train.degree(v) <= 1:
        continue

    # Check if edge is currently a bridge
    if (u, v) in nx.bridges(G_train) or (v, u) in nx.bridges(G_train):
        continue

    G_train.remove_edge(u, v)
    edges_removed.append((u, v))

assert nx.is_connected(G_train), "Training graph is disconnected"

# Test edges
test_positive_edges = edges_removed
nodes = list(G_train.nodes())

## 1.4. Negative Sampling

Παίρνουμε και τις **αρνητικές ακμές**(ακμές που δεν υπάρχουν μεταξύ κόμβων), έτσι ώστε να βάλουμε **labels** για θετίκες[1] και αρνητικές ακμες[0] που θα χρειαστούν για το **training** του μοντελού μας πιο μετά

In [7]:
test_negative_edges = []

for _ in range(num_to_remove):
    u,v = random.sample(nodes, 2)

    if not G_train.has_edge(u,v):
        test_negative_edges.append((u,v))

all_test_edges = test_positive_edges + test_negative_edges

test_labels_heu = [1]*len(test_positive_edges) + [0]*len(test_negative_edges)

# 2. Heuristic methods

Τρέχουμε 3 ευριστικές μεθόδους **Common Neighbors**, **Adamic–Adar Index** και **Jaccard Coefficient** πάνω στο υπολειπόμενο γράφημα **G_train** χρησιμοποιώντας τις **labeled** συνολικές ακμές

In [8]:
common_neighbors_scores = []

for u,v in all_test_edges:
    common_neighbors = list(nx.common_neighbors(G_train, u,v))
    score = len(common_neighbors)
    common_neighbors_scores.append(score)

aa_index_scores = []

aa_index = list(nx.adamic_adar_index(G_train, all_test_edges))
for u,v,score in aa_index:
    aa_index_scores.append(score)

jaccard_scores = []

jaccard = list(nx.jaccard_coefficient(G_train, all_test_edges))
for u,v,score in jaccard:
    jaccard_scores.append(score)

common_neighboors_auc = roc_auc_score(test_labels_heu, common_neighbors_scores)
aa_index_auc = roc_auc_score(test_labels_heu, aa_index_scores)
jaccard_auc = roc_auc_score(test_labels_heu, jaccard_scores)

print("Common Neighbors AUC:", common_neighboors_auc)
print("Adamic-Adar AUC:", aa_index_auc)
print("Jaccard AUC:", jaccard_auc)


Common Neighbors AUC: 0.756887906020227
Adamic-Adar AUC: 0.7575828004244489
Jaccard AUC: 0.7564113399254396


# 3. Feature extraction and Shallow embeddings (Node2Vec)

In [9]:
# Shallow embeddings
model = Node2Vec(dimensions=128,walk_length=80,walk_number=10,p=1,q=1)


**Reindexing** διότι o **Node2Vec** απαιτεί από τους κόμβους του γραφήματος **G** να είναι αριθμημένοι σωστά απο **0...Ν-1**

In [10]:
# Reindexing
mapping = {}

for new_index, old_node in enumerate(G.nodes()):
    mapping[old_node] = new_index

G_reindexed = nx.relabel_nodes(G, mapping)

model.fit(G_reindexed)
embeddings = model.get_embedding()
print("Embeddings shape:", embeddings.shape)
print("First node embedding (first 5 values):", embeddings[0][:5])
print("Embedding mean/std:", embeddings.mean(), embeddings.std())


[[ 0.64758724 -0.13368495 -0.9804726  ...  0.43957293  0.6912305
  -0.4849253 ]
 [ 2.144873    2.0424497  -0.04787344 ... -1.7461174  -0.4515117
  -0.18907961]
 [ 0.60790217  0.6730887  -0.38599363 ... -0.01238189 -0.45242012
   0.7232726 ]
 ...
 [-1.364041   -0.9063817   0.04086403 ... -0.27620184 -1.2397317
  -0.50661546]
 [ 0.76336634 -1.1451117  -0.05941312 ... -0.3435472  -0.27481073
   0.6760372 ]
 [ 0.53560096 -0.52513343 -1.0378287  ...  0.01657413 -0.90016294
  -0.48899803]]
Embeddings shape: (2485, 128)
First node embedding (first 5 values): [ 0.64758724 -0.13368495 -0.9804726   0.88186586  0.80770665]
Embedding mean/std: -0.018136656 0.84856576


Υπολόγιζουμε το **Hadamard product** **$h_{uv} = z_u \odot z_v$** για κάθε ζευγος **$(u, v)$** **positive** και **negative** ακμών

In [11]:
# Hadamard product for test edges
positive_edge_embeddings = []
negative_edge_embeddings = []

for u,v in  test_positive_edges:
    u_i = mapping[u]
    v_i = mapping[v]
    positive_edge_product = embeddings[u_i] * embeddings[v_i]
    positive_edge_embeddings.append(positive_edge_product)

for u,v in test_negative_edges:
    u_i = mapping[u]
    v_i = mapping[v]
    negative_edge_product = embeddings[u_i] * embeddings[v_i]
    negative_edge_embeddings.append(negative_edge_product)



## 3.1. Test Set

Δημιουργούμε για το **test** σύνολο **feature embedings** **(X_train)** και **labels** **(Y_train)** για το τελικο **evaluation**

In [12]:
# Test set
X_test = np.array(positive_edge_embeddings + negative_edge_embeddings, dtype=np.float32)
X_test = torch.tensor(X_test)
#print("X_test: ",X_test)

test_labels = [1]*len(positive_edge_embeddings) + [0]*len(negative_edge_embeddings)
Y_test = np.array(test_labels, dtype=np.float32)
Y_test = torch.tensor(Y_test).unsqueeze(1)
#print("Y_test : ",Y_test)

## 3.2. Training edges and Train/Evaluation split sets

Χρησιμοποιούμε το γράφημα **G_train** για να παρουμε θετικές και αρνητικές ακμές

In [13]:
# Training edges
train_positive_edges = list(G_train.edges())
train_nodes = list(G_train.nodes())
train_negative_edges = []

while len(train_negative_edges) < len(train_positive_edges):
    u,v = random.sample(train_nodes, 2)
    if not G_train.has_edge(u,v):
        train_negative_edges.append((u, v))


Χωρίζουμε τα **train_positive_edges** και **train_negative_edges** σε **80% train** και **20% validation**

In [14]:
# Evaluation and Train split sets

train_possitive, val_possitive = train_test_split(train_positive_edges,
                                        test_size = 0.2,
                                        random_state = 42,
                                        shuffle = True)

train_negative, val_negative = train_test_split(train_negative_edges,
                                        test_size = 0.2,
                                        random_state = 42,
                                        shuffle = True)

Δημιουργούμε για το **train** σύνολο **feature embedings** **(X_train)** και **labels** **(Y_train)** για να εκπαιδεύσουμε το μοντέλο μας

In [17]:
X_train = []
Y_train = []

for u,v in train_possitive:
    u_i = mapping[u]
    v_i = mapping[v]
    edge_embedding = embeddings[u_i] * embeddings[v_i]
    X_train.append(edge_embedding)
    Y_train.append(1)

for u,v in train_negative:
    u_i = mapping[u]
    v_i = mapping[v]
    edge_embedding = embeddings[u_i] * embeddings[v_i]
    X_train.append(edge_embedding)
    Y_train.append(0)

X_train = torch.tensor(np.array(X_train), dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.float32).unsqueeze(1)

perm = torch.randperm(X_train.size(0))

X_train = X_train[perm]
Y_train = Y_train[perm]

print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("First 5 labels:", Y_train[:5].T)
print("Positive/Negative ratio:",Y_train.sum().item(), "/", len(Y_train))

X_train shape: torch.Size([7370, 128])
Y_train shape: torch.Size([7370, 1])
First 5 labels: tensor([[0., 0., 0., 1., 1.]])
Positive/Negative ratio: 3685.0 / 7370


Δημιουργούμε για το **validation** σύνολο **feature embedings** **(X_train)** και **labels** **(Y_train)** για το συνεχές **monitoring** των αποτελεσμάτων

In [18]:
X_val = []
Y_val = []

for u, v in val_possitive:
    u_i = mapping[u]
    v_i = mapping[v]
    edge_embedding = embeddings[u_i] * embeddings[v_i]
    X_val.append(edge_embedding)
    Y_val.append(1)

for u, v in val_negative:
    u_i = mapping[u]
    v_i = mapping[v]
    edge_embedding = embeddings[u_i] * embeddings[v_i]
    X_val.append(edge_embedding)
    Y_val.append(0)

X_val = torch.tensor(np.array(X_val), dtype=torch.float32)
Y_val = torch.tensor(Y_val, dtype=torch.float32).unsqueeze(1)

# Shuffle
perm = torch.randperm(X_val.size(0))

X_val = X_val[perm]
Y_val = Y_val[perm]

print("X_val shape:", X_val.shape)
print("Y_val shape:", Y_val.shape)
print("First 5 labels:", Y_val[:5].T)
print("Positive/Negative ratio:", Y_val.sum().item(), "/", len(Y_val))

X_val shape: torch.Size([1844, 128])
Y_val shape: torch.Size([1844, 1])
First 5 labels: tensor([[0., 1., 1., 0., 0.]])
Positive/Negative ratio: 922.0 / 1844


# 4) Simple MLP setup

To μοντέλο MLP εχει 3 Linear Layers για **criterion** χρησιμοποιούμε των **BCEWithLogitsLoss** της βιβλιοθήκης **torch.nn** και για **optimizer** τον **Adam** της βιβλιοθήκης **torch.optim**


In [19]:
class MLP(nn.Module):
    def __init__(self,embedding_dimension):
        super(MLP,self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(embedding_dimension, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64,1)
        )

    def forward(self,x):
        return self.linear_relu_stack(x)

# Setup model
embedding_dimension = embeddings.shape[1]
model_mlp = MLP(embedding_dimension=embedding_dimension)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_mlp.parameters(), lr = 0.001)

# 5) Train MLP model

Περνάμε στις παραμετρους τα **feature embedings** **(X_train)** και **labels** **(Y_train)** του συνολου **validation** και **train** τον **optimizer** και **criterion** και των αριθμο passthrough στο **training set** **(epochs)**

In [24]:
def train(model, X_train, Y_train, X_val, Y_val, criterion, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        logits = model(X_train)
        loss = criterion(logits, Y_train)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_logits = model(X_val)
            val_probs = torch.sigmoid(val_logits)
            val_auc = roc_auc_score(Y_val.cpu().numpy(), val_probs.cpu().numpy())

        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Val AUC: {val_auc:.4f}")

# Train model
train(model = model_mlp,
      X_train = X_train,
      Y_train = Y_train,
      X_val = X_val,
      Y_val = Y_val,
      criterion = criterion,
      optimizer = optimizer,
      epochs = 5
)

Epoch 000 | Loss: 0.1313 | Val AUC: 0.9403
Epoch 001 | Loss: 0.1271 | Val AUC: 0.9406
Epoch 002 | Loss: 0.1230 | Val AUC: 0.9409
Epoch 003 | Loss: 0.1188 | Val AUC: 0.9412
Epoch 004 | Loss: 0.1148 | Val AUC: 0.9414


# 6) Evaluate MLP model

Τέλος βγαζουμε το **AUC** για το **test set** και **validation set**

In [25]:
def evaluate(model, X, Y):
    model.eval()
    with torch.no_grad():
        logits = model(X)
        probs = torch.sigmoid(logits)
        auc = roc_auc_score(Y.cpu().numpy(), probs.cpu().numpy())
    return auc

# Final evaluation
val_auc = evaluate(model_mlp, X_val, Y_val)
test_auc = evaluate(model_mlp, X_test, Y_test)

print("Final Val AUC:", val_auc)
print("Final Test AUC:", test_auc)

Final Val AUC: 0.941364618084801
Final Test AUC: 0.9517283150688791
