In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HU_edges_norm.csv")

In [3]:
i_time = time.time()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.time() - i_time}")

Computed pagerank in: 0.8145973682403564


In [4]:
# Pagerank scores list and normalized scores list creation
bet_n = []
max_v = max(bet.values())
min_v = min(bet.values())
for i in range(1, len(bet) + 1):
    bet_n.append((bet[str(i)] - min_v) / (max_v - min_v))

In [5]:
obj = pd.read_pickle("./HU.pickled")

# Pagerank scores classification

## Interval size 0.05

In [15]:
node_labels = []
for bet_v in bet_n:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [16]:
pd.Series(node_labels).value_counts()

2     15024
1     11661
3     10785
4      5871
5      2354
6       942
7       434
8       223
9       114
10       49
11       27
12       22
13       13
14        6
16        5
17        4
15        2
20        1
18        1
dtype: int64

In [17]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
norm_f1_scores = []
for res in obj:
    start_time = time.time()
     
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 1.7748687267303467
Completed iteration in 9.1596200466156
Completed iteration in 20.185736894607544
Completed iteration in 28.030609846115112
Completed iteration in 33.580870389938354
Completed iteration in 36.02917981147766
Completed iteration in 37.82589101791382
Completed iteration in 37.22967219352722
Completed iteration in 35.683276414871216
Completed iteration in 38.26102089881897
Completed iteration in 42.33114528656006
Completed iteration in 39.41176223754883
Completed iteration in 38.67051911354065
Completed iteration in 38.61998152732849
Completed iteration in 37.82751822471619
Completed iteration in 40.290876626968384
Completed iteration in 44.553669929504395
Completed iteration in 40.329017877578735
Completed iteration in 40.49765372276306
Completed iteration in 40.54937672615051
Completed iteration in 45.29426860809326


### Heterogeneous pt1

In [23]:
node_labels = []
for bet_v in bet_n:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [24]:
pd.Series(node_labels).value_counts()

16    16656
17     4197
11     3126
13     3078
12     3068
14     2932
15     2820
10     1532
9      1504
8      1459
7      1389
6      1294
5      1189
4      1009
3       956
2       811
1       518
dtype: int64

In [25]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 1.9600882530212402
Iteration completed in 10.136090517044067
Iteration completed in 21.275561332702637
Iteration completed in 27.37365961074829
Iteration completed in 33.61381387710571
Iteration completed in 39.58879542350769
Iteration completed in 40.209710121154785
Iteration completed in 41.11030721664429
Iteration completed in 40.86767220497131
Iteration completed in 39.59511375427246
Iteration completed in 40.40566897392273
Iteration completed in 39.29148864746094
Iteration completed in 39.591275215148926
Iteration completed in 38.879714012145996
Iteration completed in 39.227482318878174
Iteration completed in 39.178181886672974
Iteration completed in 40.65025234222412
Iteration completed in 40.97738075256348
Iteration completed in 40.59031558036804
Iteration completed in 35.34926247596741
Iteration completed in 37.66551756858826


### Heterogeneous pt2

In [26]:
node_labels = []
for bet_v in bet_n:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [27]:
pd.Series(node_labels).value_counts()

19    10817
18     9691
20     7536
17     6811
16     4195
15     2391
21     2341
14     1298
13      771
22      503
12      464
11      289
10      213
9        89
23       70
8        29
24       11
7        10
6         4
5         2
1         1
2         1
4         1
dtype: int64

In [28]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores3 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores3.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.326378583908081
Iteration completed in 9.791656970977783
Iteration completed in 19.065901517868042
Iteration completed in 22.826948642730713
Iteration completed in 31.638131141662598
Iteration completed in 34.222681760787964
Iteration completed in 36.211503982543945
Iteration completed in 33.04715704917908
Iteration completed in 30.694661617279053
Iteration completed in 32.75796127319336
Iteration completed in 37.432308197021484
Iteration completed in 30.04254961013794
Iteration completed in 30.124574422836304
Iteration completed in 36.018792390823364
Iteration completed in 34.3275945186615
Iteration completed in 38.22939109802246
Iteration completed in 38.42059803009033
Iteration completed in 38.035507917404175
Iteration completed in 35.89392614364624
Iteration completed in 32.172141313552856
Iteration completed in 33.731141567230225


## Results

### Normalized PageRank

In [32]:
max(norm_f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores.index(max(norm_f1_scores))]

(0.6182162389566681, 1)

In [33]:
max(norm_f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores2.index(max(norm_f1_scores2))]

(0.43605384938998737, 1)

In [34]:
max(norm_f1_scores3), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores3.index(max(norm_f1_scores3))]

(0.4587715607909129, 1)

### Graphs creation

In [35]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [37]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, norm_f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, norm_f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)
plt.plot(l, norm_f1_scores3, "o", label="F1-score intervalli [c]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [50]:
### PageRank distrib function
serie = pd.Series(bet_n)
plt.figure(figsize=(10, 6))
plt.hist(serie, 200)
plt.xlabel("Pagerank", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_norm_PageRank.png", dpi=500)

plt.show()

  plt.show()
