# Imports

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import networkit as nk
import tqdm
import numpy as np
import glob
import time
import json

from sklearn.metrics import ndcg_score
from sklearn.preprocessing import MinMaxScaler, normalize
from scipy.stats import kendalltau

# Load data

In [14]:
Euroroads_df = pd.read_csv("../data/Euroroads/subelj_euroroad/out.subelj_euroroad_euroroad", sep=" ", on_bad_lines="skip", index_col=False, header=None, names=["source", "target"])[1:]

Euroroads_df["source"] = Euroroads_df["source"].astype(int)
Euroroads_df["target"] = Euroroads_df["target"].astype(int)

Euroroads_df.head()

  Euroroads_df = pd.read_csv("../data/Euroroads/subelj_euroroad/out.subelj_euroroad_euroroad", sep=" ", on_bad_lines="skip", index_col=False, header=None, names=["source", "target"])[1:]


Unnamed: 0,source,target
1,1,2
2,2,3
3,2,17
4,3,4
5,4,5


# Generate networkit graph

In [15]:
g = nk.Graph(directed=False)

for row in Euroroads_df[["source", "target"]].to_records(index=False).tolist():
    g.addEdge(row[0], row[1], addMissing=True)

g.removeSelfLoops()

print("Number of nodes: ", g.numberOfNodes())
print("Number of edges: ", g.numberOfEdges())  

Number of nodes:  1175
Number of edges:  1417


# Network metrics

In [16]:
results = {
            "dataset": "Euroroads",
            "directed": False,
            "nodes": g.numberOfNodes(),
            "edges": g.numberOfEdges()
        }

# Density
results["density"] = nk.graphtools.density(g)
print(f"Density: ", results["density"])

# Average clustering coefficient
results["average_clustering_coefficient"] = nk.globals.ClusteringCoefficient().avgLocal(g, 10**6) 
print(f"Average clustering coefficient: ", results["average_clustering_coefficient"])

# Diameter
diameter = nk.distance.Diameter(g, algo=nk.distance.DiameterAlgo.Exact, nSamples=10**5)
diameter.run()
results["diameter"] = diameter.getDiameter() 
print(f"Diameter: ", results["diameter"])

Density:  0.0020544419877487406
Average clustering coefficient:  0.019962253193960503
Diameter:  (62, 0)


# Centrality measures

In [17]:
def get_degree_centrality(g):
    start_time = time.process_time()
    
    degree = nk.centrality.DegreeCentrality(g)
    degree.run()
    
    end_time = time.process_time()
    
    return degree, (end_time - start_time)

def get_closeness_centrality(g):
    start_time = time.process_time()
    
    closeness = nk.centrality.Closeness(g, True, nk.centrality.ClosenessVariant.Generalized)
    closeness.run()
    
    end_time = time.process_time()
    
    return closeness, (end_time - start_time)

def get_topk_closeness_centrality(g, first_heu=False, second_heu=False, k=5):
    start_time = time.process_time()

    topk_closeness = nk.centrality.TopCloseness(g, k=k, first_heu=first_heu, sec_heu=second_heu)
    topk_closeness.run()
    
    end_time = time.process_time()

    return topk_closeness, (end_time - start_time)

centrality = {}
results["time_elapsed"] = {}

# Degree centrality
centrality["degree"], results["time_elapsed"]["degree"] = get_degree_centrality(g)

# Closeness centrality
centrality["closeness"], results["time_elapsed"]["closeness"] = get_closeness_centrality(g)

# Topk closeness centrality
ks = [5, 10, 50, 100, int(results["nodes"]/2)]

centrality["topkcloseness_0"] = {}
centrality["topkcloseness_1"] = {}
results["time_elapsed"]["topkcloseness_0"] = {}
results["time_elapsed"]["topkcloseness_1"] = {}

for k in ks:
    centrality["topkcloseness_0"][k], results["time_elapsed"]["topkcloseness_0"][k] = get_topk_closeness_centrality(g, False, False, k)
    centrality["topkcloseness_1"][k], results["time_elapsed"]["topkcloseness_1"][k] = get_topk_closeness_centrality(g, False, True, k)

In [18]:
results

{'dataset': 'Euroroads',
 'directed': False,
 'nodes': 1175,
 'edges': 1417,
 'density': 0.0020544419877487406,
 'average_clustering_coefficient': 0.019962253193960503,
 'diameter': (62, 0),
 'time_elapsed': {'degree': 0.0006770000000004828,
  'closeness': 0.029631999999999437,
  'topkcloseness_0': {5: 0.025293000000000454,
   10: 0.037513000000000574,
   50: 0.027490000000000236,
   100: 0.035954000000000264,
   587: 0.06726799999999944},
  'topkcloseness_1': {5: 0.07376199999999944,
   10: 0.038768000000000136,
   50: 0.02954299999999943,
   100: 0.041510999999999854,
   587: 0.0695440000000005}}}

# Experiments

## Preprocess

In [19]:
normalised_scores = {}
nodes = {}

# Degree centrality
scaler = MinMaxScaler()
normalised_scores["degree"] = scaler.fit_transform(np.array([row[1] for row in centrality["degree"].ranking()]).reshape(-1, 1)).flatten()
nodes["degree"] = [row[0] for row in centrality["degree"].ranking()]

# Closeness centrality
scaler = MinMaxScaler()
normalised_scores["closeness"] = scaler.fit_transform(np.array([row[1] for row in centrality["closeness"].ranking()]).reshape(-1, 1)).flatten()
nodes["closeness"] = [row[0] for row in centrality["closeness"].ranking()]

# Topk closeness centrality
normalised_scores["topkcloseness_0"], normalised_scores["topkcloseness_1"] = {}, {}
nodes["topkcloseness_0"], nodes["topkcloseness_1"] = {}, {}

for k in ks:
    scaler = MinMaxScaler()
    normalised_scores["topkcloseness_0"][k] = scaler.fit_transform(np.array(centrality["topkcloseness_0"][k].topkScoresList()).reshape(-1, 1)).flatten()
    nodes["topkcloseness_0"][k] = centrality["topkcloseness_0"][k].topkNodesList()

    scaler = MinMaxScaler()
    normalised_scores["topkcloseness_1"][k] = scaler.fit_transform(np.array(centrality["topkcloseness_1"][k].topkScoresList()).reshape(-1, 1)).flatten()
    nodes["topkcloseness_1"][k] = centrality["topkcloseness_1"][k].topkNodesList()

## NDCG degree to (top-k) closeness

In [20]:
ndcg_scores = {}

for k in ks:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            y_score = np.where(np.isin(nodes[centrality_measure][k], nodes["degree"][:k]), normalised_scores[centrality_measure][k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]
            
        else:
            y_score = np.where(np.isin(nodes[centrality_measure][:k], nodes["degree"][:k]), normalised_scores[centrality_measure][:k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]

        ndcg_scores[k][centrality_measure] = ndcg_score([y_true], [y_score])
    
results["ndcg_degree_to"] = ndcg_scores

pd.DataFrame(ndcg_scores).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.967929,0.967929,0.967929
10,0.98695,0.990945,0.990945
50,0.963015,0.963015,0.963015
100,0.965471,0.965471,0.965471
587,0.989366,0.989366,0.989366


## NDCG (top-k) closeness to degree

In [30]:
ndcg_scores = {}

for k in ks:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            # y_score = [normalised_scores["degree"][index] if node in nodes[centrality_measure][:k] else 0 for index, node in enumerate(nodes["degree"][:k])]
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][k]

        else:
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][:k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][:k]
            
        ndcg_scores[k][centrality_measure] = ndcg_score([y_true], [y_score])
    
results["ndcg_to_degree"] = ndcg_scores

pd.DataFrame(ndcg_scores).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.994882,0.78268,0.78268
10,0.995952,0.848762,0.848762
50,0.994673,0.900524,0.900524
100,0.994355,0.919283,0.919283
587,0.997014,0.982327,0.982327


## Kendall tau degree to (top k) closeness centrality correlation

In [31]:
kendalltau_corr = {}

for k in ks:
    kendalltau_corr[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            y_score = np.where(np.isin(nodes[centrality_measure][k], nodes["degree"][:k]), normalised_scores[centrality_measure][k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]
            
        else:
            y_score = np.where(np.isin(nodes[centrality_measure][:k], nodes["degree"][:k]), normalised_scores[centrality_measure][:k], np.zeros(k))
            y_true = normalised_scores["degree"][:k]

        corr, _ = kendalltau([y_true], [y_score])
        kendalltau_corr[k][centrality_measure] = 0 if np.isnan(corr) else corr
    
results["kendalltau_degree_to"] = kendalltau_corr

pd.DataFrame(kendalltau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.0,0.0,0.0
10,0.180151,0.557086,0.557086
50,0.207727,0.207727,0.207727
100,0.116883,0.116883,0.116883
587,0.498489,0.498489,0.498489


## Kendall tau (top k) closeness to degree centrality correlation

In [32]:
kendalltau_corr = {}

for k in ks:
    kendalltau_corr[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            # y_score = [normalised_scores["degree"][index] if node in nodes[centrality_measure][:k] else 0 for index, node in enumerate(nodes["degree"][:k])]
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][k]

        else:
            y_score = np.where(np.isin(nodes["degree"][:k], nodes[centrality_measure][:k]), normalised_scores["degree"][:k], np.zeros(k))
            y_true = normalised_scores[centrality_measure][:k]

        corr, _ = kendalltau([y_true], [y_score])
        kendalltau_corr[k][centrality_measure] = 0 if np.isnan(corr) else corr
    
results["kendalltau_to_degree"] = kendalltau_corr

pd.DataFrame(kendalltau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.0,0.0,0.0
10,0.036155,0.036155,0.036155
50,0.225296,0.225296,0.225296
100,0.276136,0.276136,0.276136
587,0.497815,0.497815,0.497815


# Store results

In [33]:
with open(f"../results/{results['dataset']}_{results['directed']}.json", 'w') as outfile:
    json.dump(results, outfile)