In [1]:
### Import useful libraries
import csv
import random
import time

import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the graph from an edge-list file. No delimiter or node type is passed, so
# networkx splits each line on whitespace and keeps node ids as strings (later
# cells look scores up with str(i)).
# NOTE(review): the file has a .csv extension; if it is really comma-separated,
# delimiter="," would be needed -- confirm the file format.
G = nx.read_edgelist("./RO_edges_norm.csv")

In [3]:
# Compute PageRank for every node of G and report how long it took.
# perf_counter() is used for the interval because it is monotonic, whereas
# time.time() follows the wall clock and can jump if the system time is adjusted.
# NOTE: despite its name, `bet` holds the PageRank scores (a mapping
# node id -> score); the name is kept because later cells read it.
i_time = time.perf_counter()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.perf_counter() - i_time}")

Computed pagerank in: 0.612062931060791


In [4]:
# Min-max normalise the PageRank scores: the smallest score maps to 0, the largest to 1.
# Scores are read in the order of the string ids "1", "2", ..., "N", so bet_n[k]
# is the normalised score of node str(k + 1); a missing id raises KeyError.
max_v = max(bet.values())
min_v = min(bet.values())
score_range = max_v - min_v
bet_n = [
    (bet[str(node_id)] - min_v) / score_range
    for node_id in range(1, len(bet) + 1)
]

In [5]:
# Load the pre-computed object that the training cells iterate over (`for res in obj`).
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load this file if it comes from a trusted source.
obj = pd.read_pickle("./RO.pickled")

# PageRank score classification

## Interval size 0.05

In [15]:
# Equal-width labelling: a score receives the smallest label k in 1..20
# such that score <= 0.05 * k.
node_labels = []
for score in bet_n:
    label = next((k for k in range(1, 21) if score <= 0.05 * k), None)
    # A score that satisfies none of the 20 comparisons gets no label at all
    # (which would shift every following label by one position).
    if label is not None:
        node_labels.append(label)

In [16]:
# Number of nodes per label, most frequent label first
label_counts = pd.Series(node_labels).value_counts()
label_counts

1     18472
2     14612
3      5594
4      1696
5       704
6       341
7       145
8        73
9        48
10       32
11       14
12       13
13       10
14        5
15        4
20        3
16        3
17        2
19        1
18        1
dtype: int64

In [17]:
### Random forest classifier (70 trees), trained and scored once per entry of `obj`
# Both the forest and the train/test split are seeded. Without a seed every run
# gives different scores, and each entry of `obj` is evaluated on a different
# random split, so split noise is mixed into the comparison between entries.
RANDOM_SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_SEED)
norm_f1_scores = []
for res in obj:
    # perf_counter() is the monotonic clock intended for measuring elapsed time
    start_time = time.perf_counter()

    # One feature row per element of res[1], each row sorted in descending order.
    # NOTE(review): assumes res[1] holds one equal-length numeric sequence per node,
    # in the same node order as node_labels -- confirm against the code that
    # produced the pickle.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # 80/20 split; the fixed seed yields the same partition for every entry of obj
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_SEED
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Micro-averaged F1 (equals accuracy for single-label multiclass predictions)
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.perf_counter() - start_time}")

Completed iteration in 1.6845498085021973
Completed iteration in 6.405450105667114
Completed iteration in 14.026698112487793
Completed iteration in 19.941468000411987
Completed iteration in 25.082165479660034
Completed iteration in 28.70567774772644
Completed iteration in 30.221662521362305
Completed iteration in 31.023065090179443
Completed iteration in 30.357406616210938
Completed iteration in 31.18939232826233
Completed iteration in 32.140002727508545
Completed iteration in 32.65641474723816
Completed iteration in 30.76059079170227
Completed iteration in 30.867648363113403
Completed iteration in 32.4676456451416
Completed iteration in 33.55215930938721
Completed iteration in 32.675471782684326
Completed iteration in 35.75158476829529
Completed iteration in 33.62837219238281
Completed iteration in 34.549468994140625
Completed iteration in 33.90553379058838


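### Heterogeneous intervals, part 1 (width 0.005 up to 0.05, width 0.01 up to 0.10, one interval up to 0.2, then a catch-all)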

In [23]:
# Heterogeneous labelling. A score receives the 1-based position of the first
# upper bound it does not exceed:
#   labels 1-10  -> bounds 0.005 * k        (k = 1..10)
#   labels 11-15 -> bounds 0.05 + 0.01 * k  (k = 1..5)
#   label 16     -> bound 0.2
#   label 17     -> every score that exceeds all bounds
upper_bounds = [0.005 * k for k in range(1, 11)]
upper_bounds += [0.05 + 0.01 * k for k in range(1, 6)]
upper_bounds.append(0.2)
catch_all_label = len(upper_bounds) + 1

node_labels = [
    next(
        (position for position, bound in enumerate(upper_bounds, start=1) if score <= bound),
        catch_all_label,
    )
    for score in bet_n
]

In [24]:
# Number of nodes per label, most frequent label first
label_counts = pd.Series(node_labels).value_counts()
label_counts

16    7290
11    3756
12    3328
13    3035
14    2429
15    2064
4     2016
6     1992
10    1980
9     1969
5     1941
8     1858
7     1853
2     1819
3     1740
17    1399
1     1304
dtype: int64

In [25]:
### Random forest classifier (70 trees), trained and scored once per entry of `obj`
# Both the forest and the train/test split are seeded. Without a seed every run
# gives different scores, and each entry of `obj` is evaluated on a different
# random split, so split noise is mixed into the comparison between entries.
RANDOM_SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_SEED)
norm_f1_scores2 = []
for res in obj:
    # perf_counter() is the monotonic clock intended for measuring elapsed time
    start_time = time.perf_counter()

    # One feature row per element of res[1], each row sorted in descending order.
    # NOTE(review): assumes res[1] holds one equal-length numeric sequence per node,
    # in the same node order as node_labels -- confirm against the code that
    # produced the pickle.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # 80/20 split; the fixed seed yields the same partition for every entry of obj
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_SEED
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Micro-averaged F1 (equals accuracy for single-label multiclass predictions)
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.perf_counter() - start_time}")

Iteration completed in 1.7782933712005615
Iteration completed in 7.044949531555176
Iteration completed in 16.580552339553833
Iteration completed in 20.913383722305298
Iteration completed in 25.97273898124695
Iteration completed in 30.266244649887085
Iteration completed in 33.568459272384644
Iteration completed in 34.36983561515808
Iteration completed in 34.535499811172485
Iteration completed in 34.872642278671265
Iteration completed in 33.99198365211487
Iteration completed in 35.45360708236694
Iteration completed in 36.010438442230225
Iteration completed in 34.506654500961304
Iteration completed in 35.47956395149231
Iteration completed in 35.802680253982544
Iteration completed in 35.63083004951477
Iteration completed in 35.38776612281799
Iteration completed in 36.49047327041626
Iteration completed in 37.310572385787964
Iteration completed in 37.46139311790466


### Heterogeneous intervals, part 2 (geometric thresholds: 0.0001, each next one 1.5 times larger)

In [20]:
# Geometric labelling: thresholds start at 0.0001 and are multiplied by 1.5 at
# every step; a score receives the 1-based index of the first threshold it does
# not exceed.
# NOTE(review): a NaN score never satisfies the comparison, so the inner loop
# would not terminate -- this relies on bet_n containing only real numbers.
node_labels = []
for score in bet_n:
    threshold, label = 0.0001, 1
    while not (score <= threshold):
        label += 1
        threshold *= 1.5
    node_labels.append(label)

In [21]:
# Number of nodes per label, most frequent label first
label_counts = pd.Series(node_labels).value_counts()
label_counts

18    8601
17    8135
19    5757
16    5562
15    3844
14    2579
20    2221
13    1314
12    1109
11     874
21     766
10     475
22     197
9      173
8       75
23      46
7       18
24      10
6        7
4        4
5        3
1        2
3        1
dtype: int64

In [22]:
### Random forest classifier (70 trees), trained and scored once per entry of `obj`
# Both the forest and the train/test split are seeded. Without a seed every run
# gives different scores, and each entry of `obj` is evaluated on a different
# random split, so split noise is mixed into the comparison between entries.
RANDOM_SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_SEED)
norm_f1_scores3 = []
for res in obj:
    # perf_counter() is the monotonic clock intended for measuring elapsed time
    start_time = time.perf_counter()

    # One feature row per element of res[1], each row sorted in descending order.
    # NOTE(review): assumes res[1] holds one equal-length numeric sequence per node,
    # in the same node order as node_labels -- confirm against the code that
    # produced the pickle.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # 80/20 split; the fixed seed yields the same partition for every entry of obj
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_SEED
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Micro-averaged F1 (equals accuracy for single-label multiclass predictions)
    norm_f1_scores3.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.perf_counter() - start_time}")

Iteration completed in 1.6730988025665283
Iteration completed in 6.778089761734009
Iteration completed in 15.121624946594238
Iteration completed in 20.475942373275757
Iteration completed in 27.3701913356781
Iteration completed in 28.647382736206055
Iteration completed in 34.40398359298706
Iteration completed in 35.45874381065369
Iteration completed in 32.5433623790741
Iteration completed in 33.40751361846924
Iteration completed in 33.90727996826172
Iteration completed in 34.198978424072266
Iteration completed in 37.26548767089844
Iteration completed in 40.50982332229614
Iteration completed in 37.189000606536865
Iteration completed in 36.91785550117493
Iteration completed in 37.281524896621704
Iteration completed in 38.44658827781677
Iteration completed in 38.72681713104248
Iteration completed in 34.723713397979736
Iteration completed in 36.6832435131073


## Results

### Normalized PageRank

In [29]:
# Best score in norm_f1_scores, paired with the value found at the same position
# in the sequence 0..9, 10, 12, ..., 30 (one value per score).
iteration_values = [*range(10), *range(10, 32, 2)]
best_score = max(norm_f1_scores)
best_score, iteration_values[norm_f1_scores.index(best_score)]

(0.7119090365050867, 1)

In [30]:
# Best score in norm_f1_scores2, paired with the value found at the same position
# in the sequence 0..9, 10, 12, ..., 30 (one value per score).
iteration_values = [*range(10), *range(10, 32, 2)]
best_score = max(norm_f1_scores2)
best_score, iteration_values[norm_f1_scores2.index(best_score)]

(0.3090365050867744, 1)

In [31]:
# Best score in norm_f1_scores3, paired with the value found at the same position
# in the sequence 0..9, 10, 12, ..., 30 (one value per score).
iteration_values = [*range(10), *range(10, 32, 2)]
best_score = max(norm_f1_scores3)
best_score, iteration_values[norm_f1_scores3.index(best_score)]

(0.3862357869539198, 1)

### Plot creation

In [56]:
# Switch matplotlib to the pgf backend (pdflatex) so figure text is typeset by
# LaTeX with a serif font. `matplotlib` itself is imported in the notebook's
# import cell rather than here, so all dependencies are declared in one place.
# NOTE(review): pgf is a non-GUI backend; the stray "plt.show()" lines in the
# output of the plotting cells are presumably matplotlib warning that it cannot
# display the figure. The figures are still written by plt.savefig.
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",  # LaTeX engine used by the pgf backend
    'font.family': 'serif',
    'text.usetex': True,          # typeset all text with LaTeX
    'pgf.rcfonts': False          # do not configure fonts from rcParams
})

In [58]:
# Scatter the micro-F1 scores of the three labelling schemes on one figure.
plt.figure(figsize=(10, 6))

# x positions shared by the three series: 0..9, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
labelled_scores = (
    ("F1-score intervalli [a]", norm_f1_scores),
    ("F1-score intervalli [b]", norm_f1_scores2),
    ("F1-score intervalli [c]", norm_f1_scores3),
)
for legend_text, scores in labelled_scores:
    plt.plot(l, scores, "o", label=legend_text, markersize=10)

axis_label_size, tick_label_size = 22, 16
plt.xlabel("Iterazione", fontsize=axis_label_size)
plt.ylabel("F1-score", fontsize=axis_label_size)

plt.xticks(fontsize=tick_label_size)
plt.yticks(fontsize=tick_label_size)

# F1 lies in [0, 1]; fix the axis so the series are compared on the full scale
plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [60]:
### Distribution of the normalised PageRank scores (histogram with 200 bins)
serie = pd.Series(bet_n)

axis_label_size, tick_label_size = 22, 16
plt.figure(figsize=(10, 6))
plt.hist(serie, bins=200)
plt.xlabel("Pagerank", fontsize=axis_label_size)
plt.ylabel("Frequenza assoluta", fontsize=axis_label_size)
plt.xticks(fontsize=tick_label_size)
plt.yticks(fontsize=tick_label_size)

# Start slightly left of 0 so the first bar is not glued to the axis
plt.xlim(-0.01, 1)

plt.savefig("Distribution_norm_PageRank.png", dpi=500)

plt.show()

  plt.show()
