# Imports

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import networkit as nk
import networkx as nx
import tqdm
import numpy as np
import glob

from sklearn.metrics import ndcg_score

# Load data

In [88]:
# wikipedia_clickstream_df = pd.concat([pd.read_csv(filepath, sep="\t", on_bad_lines="skip", header=None, names=["prev", "curr", "type", "n"]) for filepath in glob.glob("../data/wikipedia_clickstream/*.tsv")[:2]], ignore_index=True, axis=0)
wikipedia_clickstream_df = pd.read_csv(glob.glob("../data/wikipedia_clickstream/*.tsv")[0], sep="\t", on_bad_lines="skip", header=None, names=["prev", "curr", "type", "n"])
wikipedia_clickstream_df = wikipedia_clickstream_df.loc[wikipedia_clickstream_df["type"]!="external", :]

wikipedia_clickstream_df.head()

Unnamed: 0,prev,curr,type,n
0,Eddie_Albert,The_Dude_Goes_West,link,17
2,Gale_Storm,The_Dude_Goes_West,link,15
5,Ascoli_Calcio_1898_F.C.,Gianluca_Scamacca,link,87
7,2019–20_Coppa_Italia,Gianluca_Scamacca,link,333
8,2018_UEFA_European_Under-19_Championship,Gianluca_Scamacca,link,23


In [206]:
incoming_request_count_df = wikipedia_clickstream_df.groupby("curr")["n"].sum().reset_index().sort_values("n", ascending=False)

incoming_request_count_df.head()

Unnamed: 0,curr,n
1190357,Hyphen-minus,7535327
1661220,Main_Page,5303986
730536,Deaths_in_2020,1669864
121179,2019–20_Wuhan_coronavirus_outbreak,1519584
845221,Elizabeth_II,1106447


In [90]:
outgoing_request_count_df = wikipedia_clickstream_df.groupby("prev")["n"].sum().reset_index().sort_values("n", ascending=False)

outgoing_request_count_df.head()

Unnamed: 0,prev,n
1160018,Main_Page,34908316
994400,Kobe_Bryant,4547308
595206,Elizabeth_II,2467055
1805770,The_Witcher_(TV_series),2372085
515641,Deaths_in_2020,2117707


In [91]:
request_count_df = incoming_request_count_df.merge(outgoing_request_count_df, how="inner", left_on="curr", right_on="prev")
request_count_df["request_count"] = request_count_df["n_x"] + request_count_df["n_y"]

request_count_df.head()

Unnamed: 0,curr,n_x,prev,n_y,request_count
0,Hyphen-minus,7535327,Hyphen-minus,672,7535999
1,Main_Page,5303986,Main_Page,34908316,40212302
2,Deaths_in_2020,1669864,Deaths_in_2020,2117707,3787571
3,2019–20_Wuhan_coronavirus_outbreak,1519584,2019–20_Wuhan_coronavirus_outbreak,1057532,2577116
4,Elizabeth_II,1106447,Elizabeth_II,2467055,3573502


## Node label to id mapping 

In [92]:
%%time

id_label_mapping = {id: label for id, label in enumerate(list(set(wikipedia_clickstream_df["prev"].to_list() + wikipedia_clickstream_df["curr"].to_list())))}
label_id_mapping = {label: id for id, label in id_label_mapping.items()}

CPU times: user 10 s, sys: 1.32 s, total: 11.4 s
Wall time: 11.9 s


# Generate networkit graph

In [93]:
%%time

kn = 10**6
g = nk.Graph(directed=True)

for row in wikipedia_clickstream_df[["prev", "curr"]].to_records(index=False).tolist()[:kn]:
    g.addEdge(label_id_mapping[row[0]], label_id_mapping[row[1]], addMissing=True)

print("Number of nodes: ", g.numberOfNodes())
print("Number of edges: ", g.numberOfEdges())  

Number of nodes:  3021434
Number of edges:  1000000
CPU times: user 10.4 s, sys: 939 ms, total: 11.3 s
Wall time: 11.4 s


# Compare centrality measures

In [94]:
centrality = {}

## Request count centrality

In [95]:
centrality["request_count"] = [label_id_mapping[label] for label in request_count_df["curr"].to_list()]
[id_label_mapping[item] for item in centrality["request_count"][:10]]

['Hyphen-minus',
 'Main_Page',
 'Deaths_in_2020',
 '2019–20_Wuhan_coronavirus_outbreak',
 'Elizabeth_II',
 'Death_of_Kobe_Bryant',
 'Kobe_Bryant',
 'Charles,_Prince_of_Wales',
 'Novel_coronavirus_(2019-nCoV)',
 'George_VI']

## Degree centrality

In [97]:
%%time

centrality["degree"] = nk.centrality.DegreeCentrality(g)
centrality["degree"].run()
[id_label_mapping[item[0]] for item in centrality["degree"].ranking()[:10]]

CPU times: user 1.64 s, sys: 326 ms, total: 1.97 s
Wall time: 812 ms


['Main_Page',
 'Deaths_in_2020',
 'Wikipedia',
 'Wiki',
 'Italy',
 'United_States',
 'New_York_City',
 'Russia',
 'California',
 'Iran']

## Betweenness centrality

In [21]:
# %%time

# centrality["betweenness"] = nk.centrality.Betweenness(g)
# centrality["betweenness"].run()
# centrality["betweenness"].ranking()[:10]

## Closeness centrality

In [98]:
%%time

centrality["closeness"] = nk.centrality.Closeness(g, True, nk.centrality.ClosenessVariant.Generalized)
centrality["closeness"].run()
[id_label_mapping[item[0]] for item in centrality["closeness"].ranking()[:10]]

CPU times: user 9min 42s, sys: 4.89 s, total: 9min 47s
Wall time: 58.9 s


['Main_Page',
 'Wiki',
 'Wikipedia',
 'Deaths_in_2020',
 'English_Wikipedia',
 'United_Kingdom',
 'United_States',
 'Kobe_Bryant',
 'France',
 'Italy']

## Top k closeness centrality

In [100]:
%%time

centrality["topkcloseness"] = nk.centrality.TopCloseness(g, k=10000, first_heu=False, sec_heu=False)
centrality["topkcloseness"].run()
[id_label_mapping[item] for item in centrality["topkcloseness"].topkNodesList()[:10]]

CPU times: user 5min 59s, sys: 4.31 s, total: 6min 3s
Wall time: 36.3 s


['Main_Page',
 'Wiki',
 'Wikipedia',
 'Deaths_in_2020',
 'English_Wikipedia',
 'United_Kingdom',
 'United_States',
 'Kobe_Bryant',
 'France',
 'Italy']

## Pagerank centrality

In [101]:
%%time

centrality["pagerank"] = nk.centrality.PageRank(g, damp=0.85, tol=1e-9)
centrality["pagerank"].run()
[id_label_mapping[item[0]] for item in centrality["pagerank"].ranking()[:10]]

CPU times: user 13.7 s, sys: 357 ms, total: 14 s
Wall time: 1.85 s


['California',
 'United_States_dollar',
 'Semi-arid_climate',
 'Subtropics',
 'Sweden',
 'Hinduism',
 "Grey's_Anatomy",
 'Joseph_Stalin',
 'Michael_Jackson',
 'Gross_domestic_product']

# NDCG comparison

In [151]:
%%time
k=10

y_true = centrality["request_count"]
y_score = centrality["topkcloseness"].topkNodesList()[:k]
y_true_df = pd.DataFrame({"id": y_true, "y_true": list(range(len(y_true)))})
y_score_df = pd.DataFrame({"id": y_score, "y_score": list(range(len(y_score)))})

y_df = y_true_df.merge(y_score_df, how="left", on="id").sort_values("y_score")

# y_score = y_df["y_true"]
# y_true = list(range(k))
# print(len(y_score))

# ndcg_score([y_true], [y_score])

y_df.head(20)

CPU times: user 1.68 s, sys: 213 ms, total: 1.89 s
Wall time: 1.9 s


Unnamed: 0,id,y_true,y_score
1,151706,1,0.0
46565,577919,46565,1.0
3716,803883,3716,2.0
2,2644187,2,3.0
90755,2761182,90755,4.0
88,2449197,88,5.0
18,365704,18,6.0
6,904589,6,7.0
897,2143436,897,8.0
1378,2901869,1378,9.0


In [201]:
k = 100

# y_true = [row[0] for row in centrality["pagerank"].ranking()]
y_true = centrality["request_count"]

y_score = [y_true.index(item) if item in y_true else 10**7 for item in centrality["topkcloseness"].topkNodesList()[:k]]
# y_score = [y_true.index(item[0]) if item[0] in y_true else 10**7 for item in centrality["pagerank"].ranking()[:k]]
# y_score = [y_true.index(item[0]) if item[0] in y_true else 10**7 for item in centrality["degree"][:k]]

# print(y_true[:k])
# print(centrality["topkcloseness"].topkNodesList()[:k])
# print([item[0] for item in centrality["closeness"].ranking()[:k]])

y_true = list(range(k))

# print(y_true)
# print(y_score)

ndcg_score([y_true], [y_score])

0.8944402617678682

k=1000
- closeness/topkcloseness 0.9341286632589697
- pagerank 0.9301915444115204
- degree 0.9090587081639641

k=100
- degree 0.9031601692381107
- topkcloseness/closeness 0.8944402617678682
- pagerank 0.8914683669356246

k=10
-
- topkcloseness/closeness 0.7668082612680364
- degree 0.7643151136589202
- pagerank 0.712779381324142

In [None]:
k = 50

# y_true = [row[0] for row in centrality["eigenvector"].ranking()]
y_true = centrality["revenue"]

# y_score = [y_true.index(item) for item in centrality["topkcloseness"].topkNodesList()[:k]]
# y_score = [y_true.index(item[0]) for item in centrality["betweenness"].ranking()[:k]]
y_score = [y_true.index(item[0]) for item in centrality["degree"][:k]]

print(y_true[:k])
# print(centrality["topkcloseness"].topkNodesList()[:k])
print(centrality["pagerank"].ranking()[:k])

y_true = range(k)

print(y_true)
print(y_score)

ndcg_score([y_true], [y_score])

## w.r.t top-k closeness

In [18]:
ndcg_scores = {}

for k in [5, 10, 20, 30, 40, 50]:

    ndcg_scores[k] = {}

    y_score = [centrality["topkcloseness"].topkNodesList()[:k]]

    for centrality_measure in ["degree", "closeness", "pagerank"]:

        if centrality_measure == "outdegree":
            y_true = [[row[0] for row in centrality[centrality_measure][:k]]]

        else:
            y_true = [[row[0] for row in centrality[centrality_measure].ranking()[:k]]]
        
        ndcg_scores[k][centrality_measure] = ndcg_score(y_true, y_score, k=k)

ndcg_scores_df = pd.DataFrame(ndcg_scores).T

ndcg_scores_df

Unnamed: 0,degree,closeness,pagerank
5,1.0,1.0,0.950353
10,1.0,1.0,0.946001
20,1.0,1.0,0.814444
30,1.0,1.0,0.764063
40,1.0,1.0,0.639165
50,1.0,1.0,0.626371


# Top request count locations

In [229]:
request_count = {}

for k in [100, 1000, 5000, 10000, 20000]:

    request_count[k] = {}

    for centrality_measure in ["degree", "closeness", "topkcloseness", "pagerank"]:

        if centrality_measure == "topkcloseness":
            request_count[k][centrality_measure] = incoming_request_count_df.loc[incoming_request_count_df["curr"].isin([id_label_mapping[item] for item in centrality["topkcloseness"].topkNodesList()[:k]]), ["n"]].sum()["n"]

        else:
            request_count[k][centrality_measure] = incoming_request_count_df.loc[incoming_request_count_df["curr"].isin([id_label_mapping[item[0]] for item in centrality[centrality_measure].ranking()[:k]]), ["n"]].sum()["n"]
        
pd.DataFrame(request_count).T

Unnamed: 0,degree,closeness,topkcloseness,pagerank
100,14969167,19920576,19920576,4231931
1000,60372286,72029762,72029762,16034082
5000,176276887,170515346,170515346,39524921
10000,275039394,247091183,247091183,53275542
20000,409079290,367573679,247091183,69011896


In [207]:
incoming_request_count_df = incoming_request_count_df.sort_values("n", ascending=True)
incoming_request_count_df["n_cumsum"] = incoming_request_count_df["n"].cumsum()
incoming_request_count_df = incoming_request_count_df.sort_values("n", ascending=False)


In [228]:
incoming_request_count_df.loc[incoming_request_count_df["n_cumsum"] >= incoming_request_count_df["n_cumsum"].quantile(0.9), ["n"]].sum()

n    1712587614
dtype: int64