# Link Prediction

Importing dataset and ceating a directed graph

In [None]:
import pandas as pd
import networkx as nx

# Loading subgraph CSV file into a pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/BayesDiff/subgraph.csv')
df

Unnamed: 0,source,target
0,7843,17781
1,7843,62478
2,7843,77999
3,7843,96745
4,7843,120708
...,...,...
359109,51317,240101
359110,51317,249270
359111,51317,252528
359112,51317,255700


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Creating a new empty graph
G = nx.Graph()

# Adding edges from DataFrame to the graph
for index, row in df.iterrows():
    G.add_edge(row['source'], row['target'])

# Printing the number of nodes and edges in the graph
print('Number of nodes =', G.number_of_nodes())
print('Number of edges =', G.number_of_edges())

Number of nodes = 13142
Number of edges = 318785


**Adamic Adar Link Prediction**

In [None]:
from sklearn.model_selection import train_test_split
from networkx.algorithms import link_prediction

# Converting the directed graph to an undirected graph
undirected_graph = G.to_undirected()

# Spliting the graph into training and test sets
train_set, test_set = train_test_split(list(undirected_graph.edges()), test_size=0.2)

# Generating a list of missing edges to predict
edges_missing = list(nx.non_edges(undirected_graph))

# Calculating the Adamic-Adar index for each missing edge
adamic_scores = link_prediction.adamic_adar_index(undirected_graph, edges_missing)

# Converting the generator object to a dictionary and sort the scores in descending order
adamic_scores_dict = {(u, v): s for u, v, s in adamic_scores}
sorted_scores = sorted(adamic_scores_dict.items(), key=lambda x: x[1], reverse=True)

In [None]:
print("Number of edges in the test set =", len(test_set))

Number of edges in the test set = 63757


In [None]:
# Printing the top 10 predictions
print("Top 10 link predictions =")
for edge, score in sorted_scores[:10]:
    print(f"{edge}: {score}")

Top 10 link predictions =
(84906, 90543): 372.82350339947175
(82476, 151707): 342.7012835279808
(3164, 82476): 342.67065371310366
(82476, 90543): 342.5657754187254
(3164, 204315): 327.74201826681434
(204315, 82476): 327.57088925184775
(235570, 204315): 327.32246375344926
(204315, 151707): 327.32246375344926
(204315, 259439): 327.32246375344926
(204315, 90543): 327.1869556441939


**Preferential attachment**

In [None]:
# Calculating the preferential attachment score for each missing edge

undirected_graph = G.to_undirected()
edges_missing = list(nx.non_edges(undirected_graph))
pref_att_scores = {}
for u, v in edges_missing:
    pref_att_scores[(u, v)] = list(nx.preferential_attachment(undirected_graph, [(u, v)]))[0][2]

# Sorting the scores in descending order
sorted_scores = sorted(pref_att_scores.items(), key=lambda x: x[1], reverse=True)

# Printing the top 10 predictions
print("Top 10 link predictions =")
for edge, score in sorted_scores[:10]:
    print(f"{edge}: {score}")

Top 10 link predictions =
(137632, 226411): 11249070
(77999, 226411): 11249070
(176790, 226411): 11249070
(17781, 226411): 11249070
(181701, 226411): 11249070
(247241, 226411): 11249070
(183004, 226411): 11249070
(120708, 226411): 11249070
(221087, 226411): 11249070
(62478, 226411): 11249070


Graph Neural Networks

In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data

data = pd.read_csv('/content/drive/MyDrive/SMA/subgraph.csv')

# Converting data to torch tensor
edge_index = torch.tensor(data.values.T, dtype=torch.long)

# Assuming edge_index is already defined and available
num_nodes = edge_index.max().item() + 1  # Maximum node index + 1

# Initializing node features as the degree of each node
node_features = torch.zeros((num_nodes, 1))
for edge in edge_index.t():
    node_features[edge[0]] += 1
    node_features[edge[1]] += 1  # Count degree for both directions if the graph is undirected

# Creating graph data
graph_data = Data(x=node_features, edge_index=edge_index)

In [None]:
import torch
from torch_geometric.utils import negative_sampling
from torch_geometric.data import Data

# Assuming edge_index and num_nodes are already defined
# Spliting edges into training and testing sets
num_edges = edge_index.size(1)
num_training_edges = int(num_edges * 0.9)  # 90% of edges for training

# Shuffle edges to randomize
perm = torch.randperm(num_edges)
train_edge_index = edge_index[:, perm[:num_training_edges]]
test_edge_index = edge_index[:, perm[num_training_edges:]]

# Generating negative samples
train_neg_edge_index = negative_sampling(edge_index=train_edge_index, num_nodes=num_nodes, num_neg_samples=num_training_edges)
test_neg_edge_index = negative_sampling(edge_index=test_edge_index, num_nodes=num_nodes, num_neg_samples=test_edge_index.size(1))

train_data = Data(x=node_features, edge_index=train_edge_index, neg_edge_index=train_neg_edge_index)
test_data = Data(x=node_features, edge_index=test_edge_index, neg_edge_index=test_neg_edge_index)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, BatchNorm, SAGEConv, GATConv

class LinkPredictor(nn.Module):
    def __init__(self, num_features, hidden_channels, out_channels, dropout=0.5):
        super(LinkPredictor, self).__init__()
        self.conv1 = GATConv(num_features, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.dropout1 = nn.Dropout(dropout)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.dropout2 = nn.Dropout(dropout)
        self.conv3 = GATConv(hidden_channels, out_channels)
        self.bn3 = BatchNorm(out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.bn1(self.conv1(x, edge_index)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.conv2(x, edge_index)))
        x = self.dropout2(x)
        x = F.relu(self.bn3(self.conv3(x, edge_index)))
        return x

    def decode(self, z, pos_edge_index, neg_edge_index):
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=1)
        return logits

# Initializing the model with modified hyperparameters
model1 = LinkPredictor(num_features=1, hidden_channels=64, out_channels=32, dropout=0.5)
model2 = LinkPredictor(num_features=1, hidden_channels=64, out_channels=32, dropout=0.5)
model3 = LinkPredictor(num_features=1, hidden_channels=64, out_channels=32, dropout=0.5)

Evaluation for Model trained with learning rate 0.001

In [None]:
from torch.optim.lr_scheduler import StepLR, ExponentialLR

optimizer = torch.optim.Adam(model1.parameters(), lr=0.001, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

def train(model, data, neg_edge_index, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    z = model(data.x, data.edge_index)  # Use passed 'data' instead of 'graph_data'
    pos_edge_index = data.edge_index
    logits = model.decode(z, pos_edge_index, neg_edge_index)
    labels = torch.cat([torch.ones(pos_edge_index.size(1)), torch.zeros(neg_edge_index.size(1))], dim=0)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

# Training the model
for epoch in range(100):
    loss = train(model1, train_data, train_neg_edge_index, optimizer, criterion)
    print(f'Epoch {epoch+1}: Loss {loss}')

Epoch 1: Loss 0.49148327112197876
Epoch 2: Loss 0.4968384802341461
Epoch 3: Loss 0.49441033601760864
Epoch 4: Loss 0.4919246435165405
Epoch 5: Loss 0.488951712846756
Epoch 6: Loss 0.49987584352493286
Epoch 7: Loss 0.48839327692985535
Epoch 8: Loss 0.485343873500824
Epoch 9: Loss 0.4806666970252991
Epoch 10: Loss 0.47513872385025024
Epoch 11: Loss 0.47523462772369385
Epoch 12: Loss 0.47916725277900696
Epoch 13: Loss 0.47987622022628784
Epoch 14: Loss 0.4689079523086548
Epoch 15: Loss 0.47166675329208374
Epoch 16: Loss 0.4692313075065613
Epoch 17: Loss 0.46379536390304565
Epoch 18: Loss 0.4748222529888153
Epoch 19: Loss 0.4742421507835388
Epoch 20: Loss 0.47800949215888977
Epoch 21: Loss 0.4696563482284546
Epoch 22: Loss 0.47109082341194153
Epoch 23: Loss 0.47279343008995056
Epoch 24: Loss 0.460290789604187
Epoch 25: Loss 0.4601135551929474
Epoch 26: Loss 0.46894270181655884
Epoch 27: Loss 0.453843891620636
Epoch 28: Loss 0.4518945813179016
Epoch 29: Loss 0.4566744565963745
Epoch 30: Los

In [None]:
from sklearn.metrics import roc_auc_score

def evaluate(model, data, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model(data.x, data.edge_index)  # Use the 'data' parameter here
        logits = model.decode(z, data.edge_index, neg_edge_index)
        labels = torch.cat([torch.ones(data.edge_index.size(1)), torch.zeros(neg_edge_index.size(1))], dim=0)
        predictions = torch.sigmoid(logits)
        return roc_auc_score(labels.cpu(), predictions.cpu())

# Evaluate on Test Set
test_auc_score = evaluate(model1, test_data, test_neg_edge_index)
print(f'Test AUC-ROC score: {test_auc_score}')

Test AUC-ROC score: 0.9992324546888733


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def predict_links(model, data, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model(data.x, data.edge_index)
        logits = model.decode(z, data.edge_index, neg_edge_index)
        labels = torch.cat([torch.ones(data.edge_index.size(1)), torch.zeros(neg_edge_index.size(1))], dim=0)
        predictions = torch.sigmoid(logits) > 0.5  # Threshold probabilities to make class predictions
        accuracy = accuracy_score(labels.cpu(), predictions.cpu())
        f1 = f1_score(labels.cpu(), predictions.cpu())
        return accuracy, f1

accuracy, f1 = predict_links(model1, test_data, test_neg_edge_index)
print(f'Test Accuracy: {accuracy}')
print(f'Test F1-Score: {f1}')

Test Accuracy: 0.5
Test F1-Score: 0.6666666666666666


Evaluation for Model trained with learning rate 0.1

In [None]:
from torch.optim.lr_scheduler import StepLR, ExponentialLR

optimizer = torch.optim.Adam(model2.parameters(), lr=0.1, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Training the model
for epoch in range(30):
    loss = train(model2, train_data, train_neg_edge_index, optimizer, criterion)
    print(f'Epoch {epoch+1}: Loss {loss}')

Epoch 1: Loss 0.4975561797618866
Epoch 2: Loss 0.4949691891670227
Epoch 3: Loss 0.4654850661754608
Epoch 4: Loss 0.47137463092803955
Epoch 5: Loss 0.48189088702201843
Epoch 6: Loss 0.45912256836891174
Epoch 7: Loss 0.44138336181640625
Epoch 8: Loss 0.4125173091888428
Epoch 9: Loss 0.40346458554267883
Epoch 10: Loss 0.39251941442489624
Epoch 11: Loss 0.39245346188545227
Epoch 12: Loss 0.39126673340797424
Epoch 13: Loss 0.38956138491630554
Epoch 14: Loss 0.3812798261642456
Epoch 15: Loss 0.3777826130390167
Epoch 16: Loss 0.37613847851753235
Epoch 17: Loss 0.3785461187362671
Epoch 18: Loss 0.3813540041446686
Epoch 19: Loss 0.3766584098339081
Epoch 20: Loss 0.37339168787002563
Epoch 21: Loss 0.37293559312820435
Epoch 22: Loss 0.3743266463279724
Epoch 23: Loss 0.374691367149353
Epoch 24: Loss 0.3739543557167053
Epoch 25: Loss 0.3713071048259735
Epoch 26: Loss 0.36970165371894836
Epoch 27: Loss 0.3702576458454132
Epoch 28: Loss 0.37113043665885925
Epoch 29: Loss 0.3733729124069214
Epoch 30: 

In [None]:
# Evaluate on Test Set
test_auc_score = evaluate(model2, test_data, test_neg_edge_index)
print(f'Test AUC-ROC score: {test_auc_score}')

Test AUC-ROC score: 0.9844577640835713


In [None]:
accuracy, f1 = predict_links(model2, test_data, test_neg_edge_index)
print(f'Test Accuracy: {accuracy}')
print(f'Test F1-Score: {f1}')

Test Accuracy: 0.983946870126977
Test F1-Score: 0.9837153792918379


Evaluation for Model trained with learning rate 0.01

In [None]:
from torch.optim.lr_scheduler import StepLR, ExponentialLR

optimizer = torch.optim.Adam(model3.parameters(), lr=0.01, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Training the model
for epoch in range(50):
    loss = train(model3, train_data, train_neg_edge_index, optimizer, criterion)
    print(f'Epoch {epoch+1}: Loss {loss}')

Epoch 1: Loss 0.6146199107170105
Epoch 2: Loss 0.5487397313117981
Epoch 3: Loss 0.5105056762695312
Epoch 4: Loss 0.4663807451725006
Epoch 5: Loss 0.44090044498443604
Epoch 6: Loss 0.4251956641674042
Epoch 7: Loss 0.41701608896255493
Epoch 8: Loss 0.4006879925727844
Epoch 9: Loss 0.39632636308670044
Epoch 10: Loss 0.38910380005836487
Epoch 11: Loss 0.3826361894607544
Epoch 12: Loss 0.3817039728164673
Epoch 13: Loss 0.37705716490745544
Epoch 14: Loss 0.37706509232521057
Epoch 15: Loss 0.37286147475242615
Epoch 16: Loss 0.37023013830184937
Epoch 17: Loss 0.37080004811286926
Epoch 18: Loss 0.37222349643707275
Epoch 19: Loss 0.368070125579834
Epoch 20: Loss 0.3671777844429016
Epoch 21: Loss 0.3683050274848938
Epoch 22: Loss 0.365221232175827
Epoch 23: Loss 0.36521151661872864
Epoch 24: Loss 0.36359187960624695
Epoch 25: Loss 0.3639599680900574
Epoch 26: Loss 0.36288517713546753
Epoch 27: Loss 0.36305779218673706
Epoch 28: Loss 0.36279332637786865
Epoch 29: Loss 0.3626810908317566
Epoch 30: 

In [None]:
# Evaluate on Test Set
test_auc_score = evaluate(model3, test_data, test_neg_edge_index)
print(f'Test AUC-ROC score: {test_auc_score}')

Test AUC-ROC score: 0.999472472353399


In [None]:
accuracy, f1 = predict_links(model3, test_data, test_neg_edge_index)
print(f'Test Accuracy: {accuracy}')
print(f'Test F1-Score: {f1}')

Test Accuracy: 0.9988722432613054
Test F1-Score: 0.998873513663862
