In [1]:
import pandas as pd
import numpy as np

Imports from utils

In [2]:
from dataprep import generate_samples, reduce_dimensions_sparse_tf_idf, feature_extractor, plot_lower_triangular_corr_matrix, prediction, rooted_pagerank, compute_save_rooted_pagerank_json


Data Import

In [3]:
train_set = pd.read_csv('train.txt', sep=' ', header=None)
test_set = pd.read_csv('test.txt', sep=' ', header=None)

In [4]:
train_set = train_set.rename(columns={0 : 'source_id', 1:'target_id', 2: 'Link'})
test_set = test_set.rename(columns={0:'source_id', 1:'target_id'})

In [5]:
node_features = pd.read_csv('node_information.csv', header=None)


In [6]:
node_features = node_features.rename(columns={0 : 'node_id'})

In [7]:
sorted_nodes = list(node_features['node_id'].unique())

In [8]:
sorted_nodes.sort()

Node Mapping to have consistent and ordered node_id (Need it for GNNs)

In [9]:
node_mapping = {node_id: i for i, node_id in enumerate(sorted_nodes)}

In [10]:
# Step 2: Apply the mapping
node_features['node_id'] = node_features['node_id'].map(node_mapping)
train_set['source_id'] = train_set['source_id'].map(node_mapping)
train_set['target_id'] = train_set['target_id'].map(node_mapping)

In [11]:
test_set['source_id'] = test_set['source_id'].map(node_mapping)
test_set['target_id'] = test_set['target_id'].map(node_mapping)

In [12]:
test_set.columns

Index(['source_id', 'target_id'], dtype='object')

Sanity check for all nodes to be present in node_info

In [13]:
set_nodes_train = set(pd.unique(pd.concat([train_set['source_id'],train_set['target_id']])))
set_nodes_test = set(pd.unique(pd.concat([test_set['source_id'],test_set['target_id']])))
all_nodes = set_nodes_train.union(set_nodes_test)
set_node_feats = set(pd.unique(node_features.node_id))

# Convert to sets for easier comparison
set_all_nodes = set(all_nodes)
set_node_ids_in_features = set_node_feats

# Find missing node IDs
missing_node_ids = set_all_nodes - set_node_ids_in_features

# Check if there are any missing node IDs
if missing_node_ids:
    print(f"Missing node IDs in features: {missing_node_ids}")
else:
    print("All nodes in the train and test datasets have corresponding features.")


All nodes in the train and test datasets have corresponding features.


Creating a Graph

In [14]:
import networkx as nx
from community import community_louvain
from sklearn.model_selection import train_test_split

# Assuming train_set and node_features are already defined and preprocessed as per your previous code.
training_values = train_set.values.tolist()
edges = [(node_pair[0], node_pair[1]) for node_pair in training_values if node_pair[2] == 1]
# Create the graph from the edge list
G = nx.Graph()
G = G.to_undirected()

G.add_nodes_from(all_nodes)
G.add_edges_from(edges)

In [15]:
residual_g, train_samples, train_labels, test_samples, test_labels = generate_samples(graph=G, train_set_ratio=0.8)

In [16]:
compute_save_rooted_pagerank_json(residual_g, train_samples + test_samples, damp=0.85, eps=1e-4, trainval=True)

In [17]:
compute_save_rooted_pagerank_json(G, [tuple(row) for row in test_set.values], damp=0.85, eps=1e-4)

In [18]:
train_samples

[(0, 439),
 (0, 1096),
 (1, 2949),
 (1, 3133),
 (2, 2114),
 (2, 3150),
 (3, 3428),
 (3, 2455),
 (4, 3335),
 (5, 2474),
 (6, 1079),
 (6, 888),
 (6, 2446),
 (6, 1252),
 (6, 1192),
 (6, 1169),
 (6, 2535),
 (6, 2196),
 (7, 2808),
 (7, 1267),
 (7, 2902),
 (7, 2843),
 (8, 487),
 (9, 1343),
 (10, 1991),
 (11, 1991),
 (12, 90),
 (13, 1647),
 (14, 1991),
 (15, 1469),
 (15, 1641),
 (15, 2979),
 (16, 1124),
 (17, 2267),
 (18, 3384),
 (18, 1309),
 (18, 2578),
 (19, 1412),
 (19, 2072),
 (19, 1614),
 (19, 1023),
 (19, 1538),
 (19, 2168),
 (20, 1993),
 (20, 2045),
 (21, 2096),
 (22, 2387),
 (22, 90),
 (22, 814),
 (22, 2554),
 (23, 1991),
 (23, 2923),
 (23, 643),
 (23, 2754),
 (24, 3487),
 (25, 1115),
 (25, 1098),
 (26, 643),
 (27, 1657),
 (27, 1991),
 (27, 1934),
 (28, 763),
 (29, 910),
 (30, 119),
 (30, 2389),
 (31, 155),
 (31, 2828),
 (31, 1991),
 (31, 2452),
 (32, 2605),
 (32, 1991),
 (32, 323),
 (33, 1991),
 (34, 1095),
 (34, 141),
 (35, 521),
 (35, 1726),
 (35, 1676),
 (35, 1965),
 (36, 1048),
 