# Imports

In [11]:
import pandas as pd 
import matplotlib.pyplot as plt
import networkit as nk
import tqdm
import numpy as np
import glob
import time
import json

from sklearn.metrics import ndcg_score
from sklearn.preprocessing import MinMaxScaler, normalize, LabelBinarizer
from scipy.stats import kendalltau, spearmanr

# Load data

In [12]:
Euroroads_df = pd.read_csv("../data/Euroroads/subelj_euroroad/out.subelj_euroroad_euroroad", sep=" ", on_bad_lines="skip", index_col=False, header=None, names=["source", "target"])[1:]

Euroroads_df["source"] = Euroroads_df["source"].astype(int)
Euroroads_df["target"] = Euroroads_df["target"].astype(int)

Euroroads_df.head()

  Euroroads_df = pd.read_csv("../data/Euroroads/subelj_euroroad/out.subelj_euroroad_euroroad", sep=" ", on_bad_lines="skip", index_col=False, header=None, names=["source", "target"])[1:]


Unnamed: 0,source,target
1,1,2
2,2,3
3,2,17
4,3,4
5,4,5


# Generate networkit graph

In [13]:
g = nk.Graph(directed=False)

for row in Euroroads_df[["source", "target"]].to_records(index=False).tolist():
    g.addEdge(row[0], row[1], addMissing=True)

g.removeSelfLoops()

print("Number of nodes: ", g.numberOfNodes())
print("Number of edges: ", g.numberOfEdges())  

Number of nodes:  1175
Number of edges:  1417


# Network metrics

In [14]:
results = {
            "dataset": "Euroroads",
            "directed": False,
            "nodes": g.numberOfNodes(),
            "edges": g.numberOfEdges()
        }

# Density
results["density"] = nk.graphtools.density(g)
print(f"Density: ", results["density"])

# Average clustering coefficient
results["average_clustering_coefficient"] = nk.globals.ClusteringCoefficient().avgLocal(g, 10**6) 
print(f"Average clustering coefficient: ", results["average_clustering_coefficient"])

# Diameter
diameter = nk.distance.Diameter(g, algo=nk.distance.DiameterAlgo.Exact, nSamples=10**5)
diameter.run()
results["diameter"] = diameter.getDiameter() 
print(f"Diameter: ", results["diameter"])

Density:  0.0020544419877487406
Average clustering coefficient:  0.019962253193960503
Diameter:  (62, 0)


# Centrality measures

In [15]:
def get_degree_centrality(g):
    start_time = time.process_time()
    
    degree = nk.centrality.DegreeCentrality(g)
    degree.run()
    
    end_time = time.process_time()
    
    return degree, (end_time - start_time)

def get_closeness_centrality(g):
    start_time = time.process_time()
    
    closeness = nk.centrality.Closeness(g, True, nk.centrality.ClosenessVariant.Generalized)
    closeness.run()
    
    end_time = time.process_time()
    
    return closeness, (end_time - start_time)

def get_topk_closeness_centrality(g, first_heu=False, second_heu=False, k=5):
    start_time = time.process_time()

    topk_closeness = nk.centrality.TopCloseness(g, k=k, first_heu=first_heu, sec_heu=second_heu)
    topk_closeness.run()
    
    end_time = time.process_time()

    return topk_closeness, (end_time - start_time)

centrality = {}
results["time_elapsed"] = {}

# Degree centrality
centrality["degree"], results["time_elapsed"]["degree"] = get_degree_centrality(g)

# Closeness centrality
centrality["closeness"], results["time_elapsed"]["closeness"] = get_closeness_centrality(g)

# Topk closeness centrality
ks = [5, 10, 50, 100, int(results["nodes"]/2)]

centrality["topkcloseness_0"] = {}
centrality["topkcloseness_1"] = {}
results["time_elapsed"]["topkcloseness_0"] = {}
results["time_elapsed"]["topkcloseness_1"] = {}

for k in ks:
    centrality["topkcloseness_0"][k], results["time_elapsed"]["topkcloseness_0"][k] = get_topk_closeness_centrality(g, False, False, k)
    centrality["topkcloseness_1"][k], results["time_elapsed"]["topkcloseness_1"][k] = get_topk_closeness_centrality(g, False, True, k)

In [16]:
results

{'dataset': 'Euroroads',
 'directed': False,
 'nodes': 1175,
 'edges': 1417,
 'density': 0.0020544419877487406,
 'average_clustering_coefficient': 0.019962253193960503,
 'diameter': (62, 0),
 'time_elapsed': {'degree': 0.0009870000000002932,
  'closeness': 0.029200000000001225,
  'topkcloseness_0': {5: 0.03622499999999995,
   10: 0.011738999999998612,
   50: 0.03205499999999972,
   100: 0.028202999999999534,
   587: 0.04675399999999996},
  'topkcloseness_1': {5: 0.11434600000000117,
   10: 0.03481700000000032,
   50: 0.026867000000001084,
   100: 0.02474499999999935,
   587: 0.059626999999998986}}}

# Experiments

## Preprocess

In [17]:
normalised_scores = {}
nodes = {}

# Label binarizer for encoding of node labels
lb = LabelBinarizer()
lb.fit(np.arange(results["nodes"]))

# Degree centrality
scaler = MinMaxScaler()
normalised_scores["degree"] = scaler.fit_transform(np.array([row[1] for row in centrality["degree"].ranking()]).reshape(-1, 1)).flatten()
nodes["degree"] = [row[0] for row in centrality["degree"].ranking()]

# Closeness centrality
scaler = MinMaxScaler()
normalised_scores["closeness"] = scaler.fit_transform(np.array([row[1] for row in centrality["closeness"].ranking()]).reshape(-1, 1)).flatten()
nodes["closeness"] = [row[0] for row in centrality["closeness"].ranking()]

# Topk closeness centrality
normalised_scores["topkcloseness_0"], normalised_scores["topkcloseness_1"] = {}, {}
nodes["topkcloseness_0"], nodes["topkcloseness_1"] = {}, {}

for k in ks:
    scaler = MinMaxScaler()
    normalised_scores["topkcloseness_0"][k] = scaler.fit_transform(np.array(centrality["topkcloseness_0"][k].topkScoresList()).reshape(-1, 1)).flatten()
    nodes["topkcloseness_0"][k] = centrality["topkcloseness_0"][k].topkNodesList()

    scaler = MinMaxScaler()
    normalised_scores["topkcloseness_1"][k] = scaler.fit_transform(np.array(centrality["topkcloseness_1"][k].topkScoresList()).reshape(-1, 1)).flatten()
    nodes["topkcloseness_1"][k] = centrality["topkcloseness_1"][k].topkNodesList()

## NDCG degree to (top-k) closeness

In [18]:
ndcg_scores = {}

for k in ks:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        y_true = lb.transform(nodes["degree"][:k])
        sample_weight = normalised_scores["degree"][:k]
        
        if "topk" in centrality_measure:
            y_score = lb.transform(nodes[centrality_measure][k])
            
        else:
            y_score = lb.transform(nodes[centrality_measure][:k])

        ndcg_scores[k][centrality_measure] = ndcg_score(y_true, y_score, sample_weight=sample_weight)
    
results["ndcg_degree"] = ndcg_scores

pd.DataFrame(ndcg_scores).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.118774,0.118774,0.118774
10,0.118774,0.118774,0.118774
50,0.118774,0.118774,0.118774
100,0.118774,0.118774,0.118774
587,0.120203,0.120203,0.120203


## NDCG closeness to (top-k) closeness

In [19]:
ndcg_scores = {}

for k in ks:
    ndcg_scores[k] = {}
    
    y_score = lb.transform(nodes["closeness"][:k])
    sample_weight = normalised_scores["closeness"][:k]
    
    for centrality_measure in ["topkcloseness_0", "topkcloseness_1"]:
        y_true = lb.transform(nodes[centrality_measure][k])
            
        ndcg_scores[k][centrality_measure] = ndcg_score(y_true, y_score, sample_weight=sample_weight)
    
results["ndcg_closeness"] = ndcg_scores

pd.DataFrame(ndcg_scores).T

Unnamed: 0,topkcloseness_0,topkcloseness_1
5,0.82702,1.0
10,1.0,1.0
50,1.0,1.0
100,1.0,1.0
587,1.0,1.0


## Spearman rank correlation

In [20]:
spearmanr_corr = {}

for k in ks:
    spearmanr_corr[k] = {}
    
    y_true = pd.DataFrame({"nodes": nodes["degree"][:k], "ranking_x": normalised_scores["degree"][:k]})

    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            y_score = pd.DataFrame({"nodes": nodes[centrality_measure][k], "ranking_y": normalised_scores[centrality_measure][k]})
            
        else:
            y_score = pd.DataFrame({"nodes": nodes[centrality_measure][:k], "ranking_y": normalised_scores[centrality_measure][:k]})

        y_df = y_true.merge(y_score, how="outer", on="nodes").fillna(0)

        corr, _ = spearmanr(y_df["ranking_x"], y_df["ranking_y"])
        spearmanr_corr[k][centrality_measure] = corr
    
results["spearmanr"] = spearmanr_corr

pd.DataFrame(spearmanr_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,-0.893427,-0.754851,-0.754851
10,-0.708726,-0.76761,-0.76761
50,-0.619558,-0.605842,-0.605842
100,-0.521785,-0.515057,-0.515057
587,0.041347,0.042241,0.042241


## Kendall tau rank correlation

In [21]:
kendalltau_corr = {}

for k in ks:
    kendalltau_corr[k] = {}
    
    y_true = pd.DataFrame({"nodes": nodes["degree"][:k], "ranking_x": normalised_scores["degree"][:k]})

    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        
        if "topk" in centrality_measure:
            y_score = pd.DataFrame({"nodes": nodes[centrality_measure][k], "ranking_y": normalised_scores[centrality_measure][k]})
            
        else:
            y_score = pd.DataFrame({"nodes": nodes[centrality_measure][:k], "ranking_y": normalised_scores[centrality_measure][:k]})

        y_df = y_true.merge(y_score, how="outer", on="nodes").fillna(0)

        corr, _ = kendalltau(y_df["ranking_x"], y_df["ranking_y"])
        kendalltau_corr[k][centrality_measure] = 0 if np.isnan(corr) else corr
    
results["kendalltau"] = kendalltau_corr

pd.DataFrame(kendalltau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,-0.784706,-0.678064,-0.678064
10,-0.531158,-0.619857,-0.619857
50,-0.507625,-0.497229,-0.497229
100,-0.417802,-0.412583,-0.412583
587,0.021783,0.022562,0.022562


# Store results

In [22]:
with open(f"../results/{results['dataset']}_{results['directed']}.json", 'w') as outfile:
    json.dump(results, outfile)