# Imports

In [48]:
import pandas as pd 
import matplotlib.pyplot as plt
import networkit as nk
import tqdm
import numpy as np
import glob
import sys

from sklearn.metrics import ndcg_score
from sklearn.preprocessing import MinMaxScaler, normalize
from scipy.stats import kendalltau

# Load data

In [10]:
with open('../data/rt-retweet-crawl/rt-retweet-crawl.mtx') as f:
    next(f)
    next(f)
    
    retweets_df = pd.DataFrame([line.split("\n")[0].split(" ") for line in f.readlines()], columns=["source", "target"])
    
retweets_df["source"] = retweets_df["source"].astype(int)
retweets_df["target"] = retweets_df["target"].astype(int)

retweets_df.head()

Unnamed: 0,source,target
0,2182,1
1,8079,1
2,11313,1
3,24832,1
4,28164,1


# Generate networkit graph

In [11]:
%%time

kn = 10**4
g = nk.Graph(directed=False)

for row in retweets_df[["source", "target"]].to_records(index=False).tolist()[:kn]:
    g.addEdge(row[0], row[1], addMissing=True)

print("Number of nodes: ", g.numberOfNodes())
print("Number of edges: ", g.numberOfEdges()) 

Number of nodes:  1112623
Number of edges:  10000
CPU times: user 470 ms, sys: 145 ms, total: 615 ms
Wall time: 614 ms


# Network metrics

## Density

In [146]:
print(f"Density: ", nk.graphtools.density(g))

Density:  1.61560176607997e-08


## Average clustering coefficient

In [151]:
print(f"Average clustering coefficient: ", nk.globals.ClusteringCoefficient().exactGlobal(g))

Average clustering coefficient:  0.0


## Diameter

In [152]:
print(f"Diameter: ", nk.distance.Diameter(g).getDiameter())

Diameter:  (0, 0)


# Centrality measures

In [12]:
centrality = {}

## Degree centrality

In [13]:
%%time

centrality["degree"] = nk.centrality.DegreeCentrality(g)
centrality["degree"].run()
centrality["degree"].ranking()[:10]

CPU times: user 1.04 s, sys: 97.1 ms, total: 1.14 s
Wall time: 276 ms


[(1381, 160.0),
 (447, 134.0),
 (1231, 127.0),
 (452, 121.0),
 (1399, 111.0),
 (856, 92.0),
 (901, 86.0),
 (124, 85.0),
 (472, 85.0),
 (617, 85.0)]

## Closeness centrality

In [14]:
%%time

centrality["closeness"] = nk.centrality.Closeness(g, True, nk.centrality.ClosenessVariant.Generalized)
centrality["closeness"].run()
centrality["closeness"].ranking()[:10]

CPU times: user 2.68 s, sys: 91.6 ms, total: 2.77 s
Wall time: 548 ms


[(918, 0.0009262540522642208),
 (27351, 0.0009145962690635989),
 (390165, 0.0009013610410973633),
 (928, 0.0008675344702188405),
 (196, 0.0008672229160843938),
 (1081871, 0.0008589417297619286),
 (508794, 0.0008556640410853403),
 (856, 0.0008528049316600735),
 (416, 0.0008525733233572953),
 (795, 0.0008517405740928173)]

## Top-k closeness centrality

In [15]:
%%time

centrality["topkcloseness_0"] = nk.centrality.TopCloseness(g, k=10000, first_heu=False, sec_heu=False)
centrality["topkcloseness_0"].run()
centrality["topkcloseness_0"].topkNodesList()[:10]

CPU times: user 3.89 s, sys: 99.5 ms, total: 3.99 s
Wall time: 881 ms


[918, 27351, 390165, 928, 196, 1081871, 508794, 856, 416, 795]

In [16]:
%%time

centrality["topkcloseness_1"] = nk.centrality.TopCloseness(g, k=10000, first_heu=False, sec_heu=True)
centrality["topkcloseness_1"].run()
centrality["topkcloseness_1"].topkNodesList()[:10]

CPU times: user 3min 15s, sys: 39.8 s, total: 3min 55s
Wall time: 40.7 s


[918, 27351, 390165, 928, 196, 1081871, 508794, 856, 416, 795]

# Comparison

## Preprocess

In [133]:
normalised_scores = {}

scaler = MinMaxScaler()
normalised_scores["degree"] = scaler.fit_transform(np.array([row[1] for row in centrality["degree"].ranking()]).reshape(-1, 1)).flatten()

scaler = MinMaxScaler()
normalised_scores["closeness"] = scaler.fit_transform(np.array([row[1] for row in centrality["closeness"].ranking()]).reshape(-1, 1)).flatten()

scaler = MinMaxScaler()
normalised_scores["topkcloseness_0"] = scaler.fit_transform(np.array(centrality["topkcloseness_0"].topkScoresList()).reshape(-1, 1)).flatten()

scaler = MinMaxScaler()
normalised_scores["topkcloseness_1"] = scaler.fit_transform(np.array(centrality["topkcloseness_1"].topkScoresList()).reshape(-1, 1)).flatten()

nodes = {}
nodes["degree"] = [row[0] for row in centrality["degree"].ranking()]
nodes["closeness"] = [row[0] for row in centrality["closeness"].ranking()]
nodes["topkcloseness_0"] = centrality["topkcloseness_0"].topkNodesList()
nodes["topkcloseness_1"] = centrality["topkcloseness_1"].topkNodesList()

## NDCG degree to (top-k) closeness

In [135]:
ndcg_scores = {}

for k in [5, 10, 100, 1000, 10000]:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        y_score = [normalised_scores[centrality_measure][index] if node in nodes["degree"][:k] else 0 for index, node in enumerate(nodes[centrality_measure][:k])]
        y_true = normalised_scores["degree"][:k]

        ndcg_scores[k][centrality_measure] = ndcg_score([y_score], [y_true])
    
pd.DataFrame(ndcg_scores).T

CPU times: user 3.74 s, sys: 29.1 ms, total: 3.77 s
Wall time: 3.77 s


Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.0,0.0,0.0
10,0.301853,0.301853,0.301853
100,0.727953,0.727959,0.727959
1000,0.892102,0.892111,0.892111
10000,0.943927,0.943825,0.943825


## NDCG (top-k) closeness to degree

In [137]:
ndcg_scores = {}

for k in [5, 10, 100, 1000, 10000]:
    ndcg_scores[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        y_score = [normalised_scores["degree"][index] if node in nodes[centrality_measure][:k] else 0 for index, node in enumerate(nodes["degree"][:k])]
        y_true = normalised_scores[centrality_measure][:k]

        ndcg_scores[k][centrality_measure] = ndcg_score([y_score], [y_true])
    
pd.DataFrame(ndcg_scores).T

CPU times: user 4.28 s, sys: 31.3 ms, total: 4.32 s
Wall time: 4.32 s


Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,0.0,0.0,0.0
10,0.356207,0.356207,0.356207
100,0.516597,0.516597,0.516597
1000,0.724889,0.724889,0.724889
10000,0.994682,0.994682,0.994682


## Kendall tau degree to (top k) closeness centrality correlation

In [140]:
kendall_tau_corr = {}

for k in [5, 10, 100, 1000, 10000]:
    kendall_tau_corr[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        y_score = [normalised_scores[centrality_measure][index] if node in nodes["degree"][:k] else 0 for index, node in enumerate(nodes[centrality_measure][:k])]
        y_true = normalised_scores["degree"][:k]

        corr, _ = kendalltau([y_true], [y_score])
        kendall_tau_corr[k][centrality_measure] = corr
    
pd.DataFrame(kendall_tau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,,,
10,-0.360041,-0.360041,-0.360041
100,0.213119,0.213119,0.213119
1000,0.310668,0.310668,0.310668
10000,0.322817,0.327122,0.327122


## Kendall tau (top k) closeness to degree centrality correlation

In [141]:
kendall_tau_corr = {}

for k in [5, 10, 100, 1000, 10000]:
    kendall_tau_corr[k] = {}
    
    for centrality_measure in ["closeness", "topkcloseness_0", "topkcloseness_1"]:
        y_score = [normalised_scores["degree"][index] if node in nodes[centrality_measure][:k] else 0 for index, node in enumerate(nodes["degree"][:k])]
        y_true = normalised_scores[centrality_measure][:k]

        corr, _ = kendalltau([y_true], [y_score])
        kendall_tau_corr[k][centrality_measure] = corr
    
pd.DataFrame(kendall_tau_corr).T

Unnamed: 0,closeness,topkcloseness_0,topkcloseness_1
5,,,
10,-0.04969,-0.04969,-0.04969
100,0.068144,0.068144,0.068144
1000,0.15191,0.15191,0.15191
10000,0.205205,0.205206,0.205206
