# Imports

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import networkit as nk
import tqdm
import numpy as np
import glob
import time
import json

from sklearn.metrics import ndcg_score
from sklearn.preprocessing import MinMaxScaler, normalize
from scipy.stats import kendalltau

# Load data

In [2]:
AstroPh_df = pd.read_csv("../data/roadNet-CA/roadNet-CA.txt", sep="\t", on_bad_lines="skip", header=None, names=["source", "target"])[4:]

AstroPh_df["source"] = AstroPh_df["source"].astype(int)
AstroPh_df["target"] = AstroPh_df["target"].astype(int)

AstroPh_df.head()

  AstroPh_df = pd.read_csv("../data/roadNet-CA/roadNet-CA.txt", sep="\t", on_bad_lines="skip", header=None, names=["source", "target"])[4:]


Unnamed: 0,source,target
4,0,1
5,0,2
6,0,469
7,1,0
8,1,6


# Generate networkit graph

In [3]:
g = nk.Graph(directed=False)

for row in AstroPh_df[["source", "target"]].to_records(index=False).tolist():
    g.addEdge(row[0], row[1], addMissing=True)

g.removeSelfLoops()

print("Number of nodes: ", g.numberOfNodes())
print("Number of edges: ", g.numberOfEdges())  

Number of nodes:  1971281
Number of edges:  5533214


# Network metrics

In [4]:
results = {
            "dataset": "roadNet-CA",
            "directed": False,
            "nodes": g.numberOfNodes(),
            "edges": g.numberOfEdges()
        }

# Density
results["density"] = nk.graphtools.density(g)
print(f"Density: ", results["density"])

# Average clustering coefficient
results["average_clustering_coefficient"] = nk.globals.ClusteringCoefficient().avgLocal(g, 10**6) 
print(f"Average clustering coefficient: ", results["average_clustering_coefficient"])

# Diameter
diameter = nk.distance.Diameter(g, algo=nk.distance.DiameterAlgo.Exact, nSamples=10**5)
diameter.run()
results["diameter"] = diameter.getDiameter() 
print(f"Diameter: ", results["diameter"])

Density:  2.847807379255224e-06
Average clustering coefficient:  0.03758879807560322
Diameter:  (865, 0)


# Centrality measures

In [None]:
def get_degree_centrality(g):
    start_time = time.process_time()
    
    degree = nk.centrality.DegreeCentrality(g)
    degree.run()
    
    end_time = time.process_time()
    
    return degree, (end_time - start_time)

def get_closeness_centrality(g):
    start_time = time.process_time()
    
    closeness = nk.centrality.Closeness(g, True, nk.centrality.ClosenessVariant.Generalized)
    closeness.run()
    
    end_time = time.process_time()
    
    return closeness, (end_time - start_time)

def get_topk_closeness_centrality(g, first_heu=False, second_heu=False, k=5):
    start_time = time.process_time()

    topk_closeness = nk.centrality.TopCloseness(g, k=k, first_heu=first_heu, sec_heu=second_heu)
    topk_closeness.run()
    
    end_time = time.process_time()

    return topk_closeness, (end_time - start_time)

centrality = {}
results["time_elapsed"] = {}

# Degree centrality
centrality["degree"], results["time_elapsed"]["degree"] = get_degree_centrality(g)

# Closeness centrality
centrality["closeness"], results["time_elapsed"]["closeness"] = get_closeness_centrality(g)

# Topk closeness centrality
ks = [5, 10, 50, 100, int(results["nodes"]/2)]

centrality["topkcloseness_0"] = {}
centrality["topkcloseness_1"] = {}
results["time_elapsed"]["topkcloseness_0"] = {}
results["time_elapsed"]["topkcloseness_1"] = {}

for k in ks:
    centrality["topkcloseness_0"][k], results["time_elapsed"]["topkcloseness_0"][k] = get_topk_closeness_centrality(g, False, False, k)
    centrality["topkcloseness_1"][k], results["time_elapsed"]["topkcloseness_1"][k] = get_topk_closeness_centrality(g, False, True, k)

In [12]:
results

{'dataset': 'AstroPh',
 'directed': False,
 'nodes': 133280,
 'edges': 396100,
 'density': 4.4597255014071295e-05,
 'average_clustering_coefficient': 0.5590263049576198,
 'diameter': (14, 0),
 'time_elapsed': {'degree': 0.002256000000000924,
  'closeness': 61.602045999999994,
  'topkcloseness_0': {5: 5.090499999999992,
   10: 9.22933900000001,
   50: 19.150551000000007,
   100: 18.060801999999995,
   66640: 200.24156499999998},
  'topkcloseness_1': {5: 39.890799,
   10: 44.497764000000004,
   50: 50.263904,
   100: 52.99988199999996,
   66640: 90.77501600000005}}}

# Experiments

## Preprocess

In [13]:
normalised_scores = {}
nodes = {}

# Degree centrality
scaler = MinMaxScaler()
normalised_scores["degree"] = scaler.fit_transform(np.array([row[1] for row in centrality["degree"].ranking()]).reshape(-1, 1)).flatten()
nodes["degree"] = [row[0] for row in centrality["degree"].ranking()]

# Closeness centrality
scaler = MinMaxScaler()
normalised_scores["closeness"] = scaler.fit_transform(np.array([row[1] for row in centrality["closeness"].ranking()]).reshape(-1, 1)).flatten()
nodes["closeness"] = [row[0] for row in centrality["closeness"].ranking()]

# Topk closeness centrality
normalised_scores["topkcloseness_0"], normalised_scores["topkcloseness_1"] = {}, {}
nodes["topkcloseness_0"], nodes["topkcloseness_1"] = {}, {}

for k in ks:
    scaler = MinMaxScaler()
    normalised_scores["topkcloseness_0"][k] = scaler.fit_transform(np.array(centrality["topkcloseness_0"][k].topkScoresList()).reshape(-1, 1)).flatten()
    nodes["topkcloseness_0"][k] = centrality["topkcloseness_0"][k].topkNodesList()

    scaler = MinMaxScaler()
    normalised_scores["topkcloseness_1"][k] = scaler.fit_transform(np.array(centrality["topkcloseness_1"][k].topkScoresList()).reshape(-1, 1)).flatten()
    nodes["topkcloseness_1"][k] = centrality["topkcloseness_1"][k].topkNodesList()

## NDCG degree to (top-k) closeness

In [14]:
ndcg_scores = {}

for k in ks:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            y_score = np.where(np.isin(nodes[centrality_measure][k], nodes["degree"][:k]), normalised_scores[centrality_measure][k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]
            
        else:
            y_score = np.where(np.isin(nodes[centrality_measure][:k], nodes["degree"][:k]), normalised_scores[centrality_measure][:k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]

        ndcg_scores[k][centrality_measure] = ndcg_score([y_true], [y_score])
    
results["ndcg_degree_to"] = ndcg_scores

pd.DataFrame(ndcg_scores).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.99719,0.998672,0.998672
10,0.999914,0.999914,0.999914
50,0.99989,0.99989,0.99989
100,0.999659,0.999659,0.999659
66640,1.0,1.0,1.0


## NDCG (top-k) closeness to degree

In [15]:
ndcg_scores = {}

for k in ks:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            # y_score = [normalised_scores["degree"][index] if node in nodes[centrality_measure][:k] else 0 for index, node in enumerate(nodes["degree"][:k])]
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][k]

        else:
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][:k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][:k]
            
        ndcg_scores[k][centrality_measure] = ndcg_score([y_true], [y_score])
    
results["ndcg_to_degree"] = ndcg_scores

pd.DataFrame(ndcg_scores).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.999963,0.989838,0.989838
10,0.999928,0.99441,0.99441
50,0.999976,0.999311,0.999311
100,0.999892,0.997184,0.997184
66640,0.999951,0.999951,0.999951


## Kendall tau degree to (top k) closeness centrality correlation

In [16]:
kendalltau_corr = {}

for k in ks:
    kendalltau_corr[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            y_score = np.where(np.isin(nodes[centrality_measure][k], nodes["degree"][:k]), normalised_scores[centrality_measure][k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]
            
        else:
            y_score = np.where(np.isin(nodes[centrality_measure][:k], nodes["degree"][:k]), normalised_scores[centrality_measure][:k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]

        corr, _ = kendalltau([y_true], [y_score])
        kendalltau_corr[k][centrality_measure] = 0 if np.isnan(corr) else corr
    
results["kendalltau_degree_to"] = kendalltau_corr

pd.DataFrame(kendalltau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.527046,0.83666,0.83666
10,0.966092,0.966092,0.966092
50,0.928878,0.928878,0.928878
100,0.89446,0.89446,0.89446
66640,0.99696,0.99696,0.99696


## Kendall tau (top k) closeness to degree centrality correlation

In [17]:
kendalltau_corr = {}

for k in ks:
    kendalltau_corr[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            # y_score = [normalised_scores["degree"][index] if node in nodes[centrality_measure][:k] else 0 for index, node in enumerate(nodes["degree"][:k])]
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][k]

        else:
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][:k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][:k]

        corr, _ = kendalltau([y_true], [y_score])
        kendalltau_corr[k][centrality_measure] = 0 if np.isnan(corr) else corr
    
results["kendalltau_to_degree"] = kendalltau_corr

pd.DataFrame(kendalltau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.737865,0.737865,0.737865
10,0.920087,0.920087,0.920087
50,0.929476,0.929476,0.929476
100,0.819089,0.819089,0.819089
66640,0.99696,0.99696,0.99696


# Store results

In [18]:
with open(f"../results/{results['dataset']}_{results['directed']}.json", 'w') as outfile:
    json.dump(results, outfile)