# Link prediction

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import networkx as nx
import pandas as pd
import numpy as np

In [58]:
# Mount Google Drive at /content/drive; the embedding file loaded later in this
# notebook lives under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable on Colab — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [59]:
def load_embedding(file):
    """Read a saved embedding from disk.

    Parameters
    ----------
    file : str, path-like or binary file object
        Location of the ``.npy`` file holding the embedding; passed straight
        to ``np.load``.

    Returns
    -------
    The object produced by ``np.load(file)``.
    """
    embedding = np.load(file)
    return embedding

In [60]:
def _pair_features(embedding, pairs):
    """Return the element-wise product of the endpoint embeddings of each pair."""
    count = len(pairs)
    # Collect the endpoint ids once, then gather all rows with two vectorised
    # lookups instead of building one small array per pair in a Python loop.
    src = np.fromiter((pair[0] for pair in pairs), dtype=np.intp, count=count)
    dst = np.fromiter((pair[1] for pair in pairs), dtype=np.intp, count=count)
    return embedding[src] * embedding[dst]


def link_prediction(embedding, edges, non_edges):
    """Score node pairs by embedding inner product and return the ROC-AUC.

    Every pair ``(u, v)`` is scored with ``sigmoid(sum(embedding[u] * embedding[v]))``.
    Pairs from ``edges`` are labelled 1 and pairs from ``non_edges`` are
    labelled 0. The combined set is split 50/50 with a fixed seed and the
    ROC-AUC is computed on the held-out half only; the other half is discarded
    because no model is fitted.

    Parameters
    ----------
    embedding : array-like
        Embedding matrix indexed by node id along the first axis
        (one row per node).
    edges : sequence of pairs
        Positive examples; only the first two items of each entry are used.
    non_edges : sequence of pairs
        Negative examples, same layout as ``edges``.

    Returns
    -------
    float
        ROC-AUC of the sigmoid-normalised inner products on the test half.
    """
    embedding = np.asarray(embedding)

    # Extract embeddings for the given edges and non-edges
    emb_edges = _pair_features(embedding, edges)
    emb_non_edges = _pair_features(embedding, non_edges)

    print('iteration done')

    # Label the edges as 1 and non-edges as 0
    labels = np.concatenate([np.ones(len(edges)), np.zeros(len(non_edges))])

    # Combine edge and non-edge embeddings
    X = np.concatenate([emb_edges, emb_non_edges])

    # Hold out half of the pairs for scoring; the training half is unused.
    _, X_test, _, y_test = train_test_split(X, labels, test_size=0.5, random_state=42)

    print('split done')

    # Summing the element-wise products gives the inner product of each pair.
    inner_product = np.sum(X_test, axis=1)
    # exp() overflows to inf for very negative scores; the quotient is then
    # exactly 0.0, which is the correct limit, so only the warning is silenced.
    with np.errstate(over='ignore'):
        normalized_similarity = 1 / (1 + np.exp(-inner_product))

    print('similarities calculated')

    # Compute ROC-AUC score
    roc_auc = roc_auc_score(y_test, normalized_similarity)

    return roc_auc


# Sample edges and non-edges for karate club

In [61]:
# # Generate the karate club graph
# karate_club_graph = nx.karate_club_graph()

# # Get the edges and non-edges
# edges = list(karate_club_graph.edges())
# all_possible_edges = list(nx.non_edges(karate_club_graph))

# # Sample an equal number of non-edges for a balanced dataset
# non_edges = np.random.choice(range(len(all_possible_edges)), size=len(edges), replace=False)
# non_edges = [all_possible_edges[i] for i in non_edges]

# Sample edges and non-edges for other datasets

In [3]:
# Dataset root; every input CSV lives in its "data" sub-folder
folder_path = "datasets/BlogCatalog-dataset/"

nodes_file = f"{folder_path}data/nodes.csv"
edges_file = f"{folder_path}data/edges.csv"
groups_file = f"{folder_path}data/groups.csv"
group_edges_file = f"{folder_path}data/group-edges.csv"

# Output location on the mounted Drive (no file extension)
output_file = "/content/drive/MyDrive/NetMF_implementations/output_blogcatalog_large"

# Read each CSV as header-less, supplying the column names explicitly
nodes_id = pd.read_csv(nodes_file, names=['id'], header=None)
groups_id = pd.read_csv(groups_file, names=['group'], header=None)
edges = pd.read_csv(edges_file, names=['id_1', 'id_2'], header=None)
user_group_membership = pd.read_csv(group_edges_file, names=['id', 'group'], header=None)


In [8]:
# Load the edge list (two columns, no header row)
edges_df = pd.read_csv(edges_file, header=None, names=['node1', 'node2'])

# Shift node ids down by one so they start from 0; link_prediction() uses
# these ids directly as indices into the embedding.
edges_df['node1'] -= 1
edges_df['node2'] -= 1

# Create an undirected graph from the edge list
graph = nx.Graph()
graph.add_edges_from(edges_df.values)

# Positive examples: the edges of the graph
edges = list(graph.edges())

# Candidate negatives: every node pair without an edge.
# NOTE: this materialises the full list in memory (about 52.8M pairs for
# BlogCatalog, per the length shown in the next cell).
all_possible_edges = list(nx.non_edges(graph))

# Sample as many non-edges as there are edges for a balanced dataset.
# A seeded generator keeps the negatives (and hence the reported ROC-AUC)
# reproducible across runs; passing the population size as an int avoids
# building an index array for the whole candidate list.
rng = np.random.default_rng(42)
sampled_idx = rng.choice(len(all_possible_edges), size=len(edges), replace=False)
non_edges = [all_possible_edges[i] for i in sampled_idx]

In [14]:
# Size of the negative-sampling pool: node pairs of `graph` with no edge between them
len(all_possible_edges)

52829533

In [63]:
# Path to the NetMF embedding saved as a .npy file
embedding_file = '/content/drive/MyDrive/NetMF_implementations/output_blogcatalog_large.npy'
netmf_embedding = load_embedding(embedding_file)

# `edges` holds the positive examples and `non_edges` the sampled negative
# examples; both are built in the edge/non-edge sampling cell.

# Keep the result in `roc_auc`, not `roc_auc_score`: that name is the sklearn
# function link_prediction() calls, and rebinding it to a float makes any
# re-run of this cell fail with "'float' object is not callable".
roc_auc = link_prediction(netmf_embedding, edges, non_edges)
print(f"ROC-AUC Score: {roc_auc}")

#TODO: current ROC-AUC score is too low, need to re-think method...

iteration done
split done
similarities calculated
ROC-AUC Score: 0.6652234954734094
