In [3]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pickle
import pandas as pd
import os
import networkx as nx
from pandas import HDFStore,DataFrame
from pandas import read_hdf
import gc
from tqdm import tqdm

In [4]:
# Load the raw training table from disk.
train_csv_path = 'E:/GSU_Study/COURSE_SCHEDULE/Fall-24/Graph Analytics/project/train.csv'
data = pd.read_csv(train_csv_path)
   

In [5]:
# Preview the first two rows of the loaded table.
data.head(2)

Unnamed: 0,source_node,destination_node
0,1,690569
1,1,315892


In [7]:
# Print a summary of the frame (index range, column dtypes, memory usage).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9437519 entries, 0 to 9437518
Data columns (total 2 columns):
 #   Column            Dtype
---  ------            -----
 0   source_node       int64
 1   destination_node  int64
dtypes: int64(2)
memory usage: 144.0 MB


In [8]:
# Count the rows that exactly repeat an earlier row.
duplicate_mask = data.duplicated()
duplicates = duplicate_mask.sum()


In [9]:
# Display the duplicate-row count stored in `duplicates`.
duplicates

0

In [10]:
# Total number of null cells: first per column, then summed over all columns.
null_counts_per_column = data.isnull().sum()
missing_values = null_counts_per_column.sum()
missing_values

0

In [11]:
# Build a directed graph with one edge per (source_node, destination_node) row.
g = nx.from_pandas_edgelist(
    data,
    source='source_node',
    target='destination_node',
    create_using=nx.DiGraph(),
)

In [12]:

import scipy.sparse as sp  

# Keep the directed graph under the name used by the rest of the notebook,
# and derive an undirected copy from it.
G = g
G_undirected = G.to_undirected()

# Map every node ID to its position in G's node iteration order.
node_id_to_index = dict(zip(G.nodes(), range(G.number_of_nodes())))

# Adjust the PageRank function to return a dictionary
def approximate_pagerank(adj_matrix, alpha=0.85, tol=1e-6, max_iter=100, nodes=None):
    """Approximate PageRank by power iteration on a sparse adjacency matrix.

    Row ``i`` of ``adj_matrix`` holds the outgoing edge weights of node ``i``.
    Each row is normalised to sum to 1, and rank then flows from a node to the
    nodes it points at, i.e. the iteration multiplies by the *transpose* of
    the row-stochastic matrix. (Multiplying by the row-stochastic matrix
    itself leaves the uniform start vector unchanged, so every node would end
    up with the same score.)

    Parameters
    ----------
    adj_matrix : scipy sparse matrix/array or array-like, shape (n, n)
        Adjacency matrix. It is copied; the caller's object is never modified.
    alpha : float, default 0.85
        Damping factor.
    tol : float, default 1e-6
        Stop once the L1 distance between successive iterates drops below this.
    max_iter : int, default 100
        Maximum number of power-iteration steps.
    nodes : iterable, optional
        Labels for the rows of ``adj_matrix``, in row order. When omitted, the
        notebook-level graph ``G`` supplies them (``G.nodes()``), which is the
        original behaviour of this function.

    Returns
    -------
    dict
        ``{node_label: pagerank_score}``; the scores sum to 1.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of matrix rows.
    """
    n = adj_matrix.shape[0]

    if nodes is None:
        # Backward-compatible default: label rows with the global graph's nodes.
        nodes = G.nodes()
    nodes = list(nodes)
    if len(nodes) != n:
        raise ValueError(
            f"expected {n} node labels to match the adjacency matrix, got {len(nodes)}"
        )
    if n == 0:
        return {}

    # copy=True: the normalisation below edits `.data` in place and must not
    # leak into a float CSR matrix owned by the caller.
    transition = sp.csr_matrix(adj_matrix, dtype=float, copy=True)

    # Row-normalise. Rows that sum to zero are dangling nodes and stay all-zero.
    row_sums = np.asarray(transition.sum(axis=1)).ravel()
    dangling = row_sums == 0
    inv_row_sums = np.zeros(n)
    inv_row_sums[~dangling] = 1.0 / row_sums[~dangling]
    # Expand the per-row factor to one factor per stored entry using the CSR
    # row pointer; this stays aligned even when explicit zeros are stored.
    transition.data *= np.repeat(inv_row_sums, np.diff(transition.indptr))

    # Transpose once, outside the loop, so each step is a single mat-vec.
    transition_t = transition.T.tocsr()

    # Uniform start vector and uniform personalization vector.
    x = np.full(n, 1.0 / n)
    p = np.full(n, 1.0 / n)

    for _ in range(max_iter):
        x_last = x
        # Rank held by dangling nodes is redistributed uniformly over all nodes.
        dangling_mass = x_last[dangling].sum() / n
        x = alpha * (transition_t.dot(x_last) + dangling_mass) + (1 - alpha) * p
        # Check convergence
        if np.abs(x - x_last).sum() < tol:
            break

    return dict(zip(nodes, x))

# Compute PageRank centrality on the undirected graph.
# nodelist pins the matrix row order to G's node order, which is the order
# approximate_pagerank uses to label its result.
adj_matrix = nx.adjacency_matrix(G_undirected, nodelist=list(G.nodes()))

# approximate_pagerank already returns {node_id: score}. It must not be
# re-enumerated: iterating a dict yields its keys, so the previous
# "index -> id" remapping stored node IDs in place of the scores (and ran the
# whole computation a second time).
pagerank_centrality = approximate_pagerank(adj_matrix)

# Inverse of node_id_to_index (position -> node ID).
node_index_to_id = {idx: node for node, idx in node_id_to_index.items()}

# Lazy generator over node pairs that are not connected in the undirected graph.
non_edges = nx.non_edges(G_undirected)




In [15]:
import random 
import networkx as nx
from tqdm import tqdm  

# Function to compute features for a pair of nodes
def compute_features(G, G_undirected, edge, jaccard_dict, adamic_adar_dict, pref_attachment_dict, pagerank_centrality, node_id_to_index, resource_allocation_dict=None):
    """Build the link-prediction feature dictionary for one node pair.

    Parameters
    ----------
    G : networkx.DiGraph
        Directed graph; used for degree, in/out-degree and clustering.
    G_undirected : networkx.Graph
        Undirected graph; used for the neighbourhood-based scores.
    edge : tuple
        The ``(u, v)`` node pair to describe.
    jaccard_dict, adamic_adar_dict, pref_attachment_dict : dict
        Precomputed ``{(u, v): score}`` lookups. A missing pair scores 0.
    pagerank_centrality : dict
        ``{node: score}`` lookup. A missing node scores 0.
    node_id_to_index : dict
        Not used by the computation; kept so existing calls keep working.
    resource_allocation_dict : dict, optional
        Precomputed ``{(u, v): score}`` lookup for the resource-allocation
        index. When omitted, the index is computed on ``G_undirected`` for
        this pair.

    Returns
    -------
    dict
        Feature name -> value, in a fixed key order.
    """
    u, v = edge

    # Resource allocation is its own index. It used to be read from
    # adamic_adar_dict, which made the two feature columns identical.
    if resource_allocation_dict is None:
        # Single-pair call: the generator yields exactly one (u, v, score) triple.
        _, _, resource_allocation = next(
            iter(nx.resource_allocation_index(G_undirected, [(u, v)]))
        )
    else:
        resource_allocation = resource_allocation_dict.get((u, v), 0)

    features = {
        'commonneighbors': len(list(nx.common_neighbors(G_undirected, u, v))),
        'jaccard_coefficient': jaccard_dict.get((u, v), 0),
        'resource_allocation': resource_allocation,
        'adamic_adar': adamic_adar_dict.get((u, v), 0),
        'preferential': pref_attachment_dict.get((u, v), 0),
        'degree_u': G.degree(u),
        'degree_v': G.degree(v),
        'cluster_u': nx.clustering(G, u),
        'cluster_v': nx.clustering(G, v),
        # In-degree = followers, out-degree = followees.
        'nooffollowers_u': G.in_degree(u),
        'nooffollowees_u': G.out_degree(u),
        'nooffollowers_v': G.in_degree(v),
        'nooffollowees_v': G.out_degree(v),
        'pagerank_u': pagerank_centrality.get(u, 0),
        'pagerank_v': pagerank_centrality.get(v, 0),
    }
    return features

# Convert edge view to a list for sampling
edges_list = list(G.edges())

# Hold out a fixed fraction of the edges (at least one). The sampler is seeded
# so that re-running the notebook selects the same edges.
SAMPLE_FRACTION = 0.02
RANDOM_SEED = 42
sample_size = max(int(len(edges_list) * SAMPLE_FRACTION), 1)
sampled_edges = random.Random(RANDOM_SEED).sample(edges_list, sample_size)

# Training graph: the full graph minus the held-out edges.
G_train = G.copy()
G_train.remove_edges_from(sampled_edges)
G_train_undirected = G_train.to_undirected()

# Precompute pairwise metrics on the training graph. Scoring the held-out
# pairs on a graph that still contains them would leak the label into the
# features (G_train used to be built and then never used).
jaccard_dict = {(u, v): p for u, v, p in nx.jaccard_coefficient(G_train_undirected, sampled_edges)}
adamic_adar_dict = {(u, v): p for u, v, p in nx.adamic_adar_index(G_train_undirected, sampled_edges)}
pref_attachment_dict = {(u, v): p for u, v, p in nx.preferential_attachment(G_train_undirected, sampled_edges)}

# Compute features for sampled edges, again against the training graph only.
# NOTE(review): pagerank_centrality is reused as computed earlier in the
# notebook, i.e. on the full graph; recompute it on G_train_undirected if a
# strictly leak-free PageRank feature is required.
test_set_with_features = [
    compute_features(
        G_train,
        G_train_undirected,
        edge,
        jaccard_dict,
        adamic_adar_dict,
        pref_attachment_dict,
        pagerank_centrality,
        node_id_to_index,
    )
    for edge in tqdm(sampled_edges, desc='Computing Features')
]


Computing Features: 100%|█████████████████████████████████████████████████████| 188750/188750 [15:58<00:00, 196.91it/s]


In [16]:
import csv

# Write the features to a CSV file instead of printing them.
# The path is held in one variable so the completion message below always
# names the file that was actually written (it used to report
# 'test_set_with_features.csv' while writing 'set_with_features.csv').
output_path = 'E:/GSU_Study/COURSE_SCHEDULE/Fall-24/Graph Analytics/project/set_with_features.csv'

with open(output_path, 'w', newline='') as csvfile:
    # Columns: the node pair first, then the feature names in the order
    # produced for the first sampled edge.
    fieldnames = ['u', 'v'] + list(test_set_with_features[0].keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for edge, features in tqdm(zip(sampled_edges, test_set_with_features), total=len(sampled_edges), desc='Writing Features'):
        writer.writerow({'u': edge[0], 'v': edge[1], **features})

print(f"Features have been written to '{output_path}'")

Writing Features: 100%|█████████████████████████████████████████████████████| 188750/188750 [00:08<00:00, 23033.85it/s]

Features have been written to 'test_set_with_features.csv'



