In [18]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [19]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [20]:
len(list(G.nodes()))

956

In [21]:
nodes = []
l = list(nx.selfloop_edges(G))
for x, _ in l:
    if(G.degree(x) == 2): 
        G.remove_node(x)
        nodes.append(x)

In [22]:
len(list(G.nodes()))

778

In [23]:
i_time = time.time()
bet = nx.pagerank(G)
print(f"Calculated scores in: {time.time() - i_time}")

Calculated scores in: 0.6924364566802979


In [24]:
bet_n = []
for i in sorted(list(map(lambda x: int(x), bet.keys()))):
    bet_n.append((bet[str(i)] - min(bet.values())) / (max(bet.values()) - min(bet.values())))

In [25]:
obj = pd.read_pickle("./AVPRA_pred.pickled")

## Interval size 0.05

In [26]:
node_labels = []
for bet_v in bet_n:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [27]:
pd.Series(node_labels).value_counts()

1     311
2     179
3      82
4      57
5      46
6      27
7      20
8      16
9      10
10      8
12      5
11      5
14      4
13      4
19      2
15      1
20      1
dtype: int64

In [28]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
norm_f1_scores = []
for res in obj:
    start_time = time.time()
     
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 0.08991384506225586
Completed iteration in 0.10728001594543457
Completed iteration in 0.16416287422180176
Completed iteration in 0.20415496826171875
Completed iteration in 0.19837617874145508
Completed iteration in 0.1885378360748291
Completed iteration in 0.19629597663879395
Completed iteration in 0.16199588775634766
Completed iteration in 0.18377304077148438
Completed iteration in 0.17728853225708008
Completed iteration in 0.15277528762817383
Completed iteration in 0.14177513122558594
Completed iteration in 0.19394803047180176
Completed iteration in 0.20504188537597656
Completed iteration in 0.1781296730041504
Completed iteration in 0.14585256576538086
Completed iteration in 0.15897417068481445
Completed iteration in 0.1841742992401123
Completed iteration in 0.20047426223754883
Completed iteration in 0.18021035194396973
Completed iteration in 0.19020676612854004


### Heterogeneous pt1

In [29]:
node_labels = []
for bet_v in bet_n:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [30]:
pd.Series(node_labels).value_counts()

17    149
16    139
11     58
2      42
1      42
12     36
9      35
5      35
7      34
13     32
3      30
15     30
6      26
8      26
14     23
10     22
4      19
dtype: int64

In [31]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.10025525093078613
Iteration completed in 0.12464237213134766
Iteration completed in 0.15751147270202637
Iteration completed in 0.17337536811828613
Iteration completed in 0.21785473823547363
Iteration completed in 0.20355486869812012
Iteration completed in 0.1801435947418213
Iteration completed in 0.15379595756530762
Iteration completed in 0.16820454597473145
Iteration completed in 0.15874242782592773
Iteration completed in 0.1629347801208496
Iteration completed in 0.17173504829406738
Iteration completed in 0.20322108268737793
Iteration completed in 0.21280503273010254
Iteration completed in 0.17417287826538086
Iteration completed in 0.16783857345581055
Iteration completed in 0.15378904342651367
Iteration completed in 0.1814873218536377
Iteration completed in 0.20117402076721191
Iteration completed in 0.20382046699523926
Iteration completed in 0.20479631423950195


### Heterogeneous pt2

In [32]:
node_labels = []
for bet_v in bet_n:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [33]:
pd.Series(node_labels).value_counts()

17    107
18    100
16     90
20     84
19     79
21     64
15     59
22     41
13     31
14     28
12     24
23     19
11     14
9       9
10      8
1       7
8       4
24      3
7       3
6       2
5       1
4       1
dtype: int64

In [34]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores3 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores3.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.09142708778381348
Iteration completed in 0.11371183395385742
Iteration completed in 0.1513843536376953
Iteration completed in 0.16904163360595703
Iteration completed in 0.18618059158325195
Iteration completed in 0.2247021198272705
Iteration completed in 0.22622370719909668
Iteration completed in 0.17183327674865723
Iteration completed in 0.17145204544067383
Iteration completed in 0.16556358337402344
Iteration completed in 0.18062615394592285
Iteration completed in 0.15452170372009277
Iteration completed in 0.20538330078125
Iteration completed in 0.21867012977600098
Iteration completed in 0.18267560005187988
Iteration completed in 0.17493152618408203
Iteration completed in 0.15075325965881348
Iteration completed in 0.1664445400238037
Iteration completed in 0.15489792823791504
Iteration completed in 0.16481471061706543
Iteration completed in 0.155320405960083


## Results

### Normalized PageRank

In [38]:
max(norm_f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores.index(max(norm_f1_scores))]

(0.6089743589743589, 1)

In [39]:
max(norm_f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores2.index(max(norm_f1_scores2))]

(0.38461538461538464, 1)

In [40]:
max(norm_f1_scores3), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores3.index(max(norm_f1_scores3))]

(0.358974358974359, 1)

### Graphs creation

In [41]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [42]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, norm_f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, norm_f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)
plt.plot(l, norm_f1_scores3, "o", label="F1-score intervalli [c]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [43]:
### PageRank distrib function
serie = pd.Series(bet_n)
plt.figure(figsize=(10, 6))
plt.hist(serie, 200)
plt.xlabel("Pagerank", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_norm_PageRank.png", dpi=500)

plt.show()

  plt.show()
