In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)
H = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
nodes = []
for (x, y) in G.edges():
    if(x == y):
        if(H.has_node(x)):
            nodes.append(x)
            H.remove_node(x)

In [5]:
len(list(H.nodes()))

765

In [6]:
G = H
nx.is_weakly_connected(G)

True

In [7]:
i_time = time.time()
bet = nx.pagerank(G)
print(f"Calculated scores in: {time.time() - i_time}")

Calculated scores in: 0.13226914405822754


In [27]:
bet_n = []
for i in sorted(list(map(lambda x: int(x), bet.keys()))):
    bet_n.append((bet[str(i)] - min(bet.values())) / (max(bet.values()) - min(bet.values())))

In [13]:
obj = pd.read_pickle("./AVPRA_pred.pickled")

# Pagerank scores classification

## Interval size 0.05

In [45]:
node_labels = []
for bet_v in bet_n:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [46]:
pd.Series(node_labels).value_counts()

1     311
2     179
3      77
4      54
5      44
6      28
7      19
8      14
9      10
10      8
12      5
11      5
14      4
13      4
19      2
20      1
dtype: int64

In [47]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
norm_f1_scores = []
for res in obj:
    start_time = time.time()
     
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 0.11564350128173828
Completed iteration in 0.14266180992126465
Completed iteration in 0.1748969554901123
Completed iteration in 0.20197176933288574
Completed iteration in 0.18392062187194824
Completed iteration in 0.18391680717468262
Completed iteration in 0.18683409690856934
Completed iteration in 0.18981075286865234
Completed iteration in 0.18062353134155273
Completed iteration in 0.1803896427154541
Completed iteration in 0.17335939407348633
Completed iteration in 0.18363595008850098
Completed iteration in 0.17490816116333008
Completed iteration in 0.18254804611206055
Completed iteration in 0.18082952499389648
Completed iteration in 0.17921137809753418
Completed iteration in 0.18009591102600098
Completed iteration in 0.17979145050048828
Completed iteration in 0.16464996337890625
Completed iteration in 0.21290874481201172
Completed iteration in 0.16701078414916992


### Heterogeneous pt1

In [48]:
node_labels = []
for bet_v in bet_n:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [49]:
pd.Series(node_labels).value_counts()

17    144
16    131
11     58
2      42
1      42
9      35
12     35
5      34
13     32
15     31
7      31
8      29
3      28
6      28
14     23
10     22
4      20
dtype: int64

In [50]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.14362788200378418
Iteration completed in 0.14458918571472168
Iteration completed in 0.1886427402496338
Iteration completed in 0.18133902549743652
Iteration completed in 0.19334006309509277
Iteration completed in 0.18716931343078613
Iteration completed in 0.18946552276611328
Iteration completed in 0.18743276596069336
Iteration completed in 0.22008633613586426
Iteration completed in 0.18845200538635254
Iteration completed in 0.19660210609436035
Iteration completed in 0.20094704627990723
Iteration completed in 0.21674442291259766
Iteration completed in 0.18311166763305664
Iteration completed in 0.19223785400390625
Iteration completed in 0.19068217277526855
Iteration completed in 0.21754240989685059
Iteration completed in 0.21634817123413086
Iteration completed in 0.21010828018188477
Iteration completed in 0.18625903129577637
Iteration completed in 0.21389031410217285


### Heterogeneous pt2

In [51]:
node_labels = []
for bet_v in bet_n:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [52]:
pd.Series(node_labels).value_counts()

17    108
18     97
16     91
20     78
19     77
21     64
15     57
22     40
13     31
14     28
12     24
23     18
11     15
9       9
1       7
10      7
8       4
24      3
7       3
6       2
5       1
4       1
dtype: int64

In [53]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores3 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores3.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.10855674743652344
Iteration completed in 0.15168380737304688
Iteration completed in 0.18837690353393555
Iteration completed in 0.1974184513092041
Iteration completed in 0.1962146759033203
Iteration completed in 0.21288299560546875
Iteration completed in 0.20731115341186523
Iteration completed in 0.20488786697387695
Iteration completed in 0.19309639930725098
Iteration completed in 0.1912086009979248
Iteration completed in 0.19417500495910645
Iteration completed in 0.18761253356933594
Iteration completed in 0.18911242485046387
Iteration completed in 0.1867382526397705
Iteration completed in 0.18962311744689941
Iteration completed in 0.19047045707702637
Iteration completed in 0.1946113109588623
Iteration completed in 0.184891939163208
Iteration completed in 0.19367098808288574
Iteration completed in 0.18454957008361816
Iteration completed in 0.19213318824768066


## Results

### Normalized PageRank

In [57]:
max(norm_f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores.index(max(norm_f1_scores))]

(0.5751633986928104, 2)

In [58]:
max(norm_f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores2.index(max(norm_f1_scores2))]

(0.41830065359477125, 1)

In [59]:
max(norm_f1_scores3), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores3.index(max(norm_f1_scores3))]

(0.33986928104575165, 1)

### Graphs creation

In [60]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [62]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, norm_f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, norm_f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)
plt.plot(l, norm_f1_scores3, "o", label="F1-score intervalli [c]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [64]:
### PageRank distrib function
serie = pd.Series(bet_n)
plt.figure(figsize=(10, 6))
plt.hist(serie, 200)
plt.xlabel("Pagerank", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_norm_PageRank.png", dpi=500)

plt.show()

  plt.show()
