In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
# Load the graph from an edge list. No `nodetype` is passed, and a later cell
# indexes the PageRank dict with `str(i)`, so node ids are handled as strings.
# NOTE(review): the file carries a .csv extension but no `delimiter` is given —
# confirm the file is laid out the way read_edgelist parses by default.
G = nx.read_edgelist("./HR_edges_norm.csv")

In [3]:
# Run PageRank on the whole graph and report the wall-clock time it took.
# Note: the result is stored in `bet`, but it holds PageRank scores
# (a dict keyed by node id).
t_start = time.time()
bet = nx.pagerank(G)
elapsed = time.time() - t_start
print(f"Computed pagerank in: {elapsed}")

Computed pagerank in: 1.9416282176971436


In [4]:
# Min-max normalize the PageRank scores into [0, 1].
# The resulting list is ordered by node id 1..N (ids are looked up as the
# strings "1".."N"), so position k in `bet_n` belongs to node k + 1.
max_v = max(bet.values())
min_v = min(bet.values())
score_span = max_v - min_v
bet_n = [
    (bet[str(node_id)] - min_v) / score_span
    for node_id in range(1, len(bet) + 1)
]

In [5]:
# Load the precomputed data used as classifier input. Later cells iterate over
# `obj` and read `res[1]` from each item as a collection of per-node sequences.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
obj = pd.read_pickle("./HR.pickled")

# Pagerank scores classification

## Interval size 0.05

In [16]:
# Assign each node to one of 20 equal-width bins of size 0.05:
# label k means the normalized score lies in ((k - 1) * 0.05, k * 0.05]
# (label 1 also includes 0).
# A score above the last bound (0.05 * 20) receives no label at all, so the
# list would end up shorter than `bet_n` in that case.
node_labels = []
for score in bet_n:
    label = next((k for k in range(1, 21) if score <= 0.05 * k), None)
    if label is not None:
        node_labels.append(label)

In [17]:
# Class balance of the current labelling: number of nodes per bin label,
# most populated bin first.
label_counts = pd.Series(node_labels).value_counts()
label_counts

1     33183
2     16824
3      3428
4       739
5       216
6        93
7        41
8        23
9        10
10        7
11        3
12        2
20        2
13        1
15        1
dtype: int64

In [18]:
### Random forest classifier (70 trees), trained and scored once per item of `obj`
# A fixed seed makes both the train/test split and the forest deterministic,
# so the collected F1 scores can be reproduced on a fresh kernel. Using the
# same seed in every iteration also means every feature set is evaluated on
# the same held-out nodes.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
norm_f1_scores = []
y_data = node_labels  # one class label per node, from the labelling cell above
for res in obj:
    start_time = time.time()

    # res[1] supplies one sequence per node; each is sorted in descending
    # order before being used as that node's feature vector.
    X_data = [sorted(row, reverse=True) for row in res[1]]

    # 80/20 hold-out split. `stratify` is deliberately not used: the label
    # counts printed above include classes with a single member, which a
    # stratified split cannot handle.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # fit() retrains the forest from scratch on each call, so reusing `clf`
    # across iterations does not leak state between feature sets.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Micro-averaged F1 on the held-out nodes (for single-label multiclass
    # problems this coincides with accuracy).
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 1.8630142211914062
Completed iteration in 10.07777714729309
Completed iteration in 20.439276218414307
Completed iteration in 30.209795236587524
Completed iteration in 32.71586871147156
Completed iteration in 34.38589692115784
Completed iteration in 32.41056799888611
Completed iteration in 33.71754240989685
Completed iteration in 35.42132115364075
Completed iteration in 32.788265228271484
Completed iteration in 32.64391899108887
Completed iteration in 32.24184799194336
Completed iteration in 34.07928657531738
Completed iteration in 33.41324710845947
Completed iteration in 34.79125952720642
Completed iteration in 32.29947519302368
Completed iteration in 32.53772807121277
Completed iteration in 31.96491289138794
Completed iteration in 31.979134798049927
Completed iteration in 32.86782884597778
Completed iteration in 30.906912088394165


### Heterogeneous intervals, part 1 (width 0.005 up to 0.05, width 0.01 up to 0.10, then (0.10, 0.20] and (0.20, 1])

In [19]:
# Non-uniform bins, finer where the scores are dense:
#   labels 1-10  : upper bounds 0.005, 0.010, ..., 0.05   (width 0.005)
#   labels 11-15 : upper bounds 0.06, 0.07, ..., 0.10     (width 0.01)
#   label  16    : upper bound 0.2
#   label  17    : everything above 0.2
# The bounds are built with the same float expressions as the comparisons they
# replace, and are checked in order so the first matching bound wins.
upper_bounds = (
    [0.005 * k for k in range(1, 11)]
    + [0.05 + 0.01 * k for k in range(1, 6)]
    + [0.2]
)
overflow_label = len(upper_bounds) + 1  # 17

node_labels = [
    next(
        (label for label, bound in enumerate(upper_bounds, start=1) if score <= bound),
        overflow_label,
    )
    for score in bet_n
]

In [20]:
# Class balance of the current labelling: number of nodes per bin label,
# most populated bin first.
label_counts = pd.Series(node_labels).value_counts()
label_counts

11    4941
12    4221
16    4167
4     3936
5     3841
3     3718
6     3702
7     3592
13    3379
2     3265
8     3182
9     2905
10    2717
14    2513
1     2325
15    1770
17     399
dtype: int64

In [21]:
### Random forest classifier (70 trees), trained and scored once per item of `obj`
# A fixed seed makes both the train/test split and the forest deterministic,
# so the collected F1 scores can be reproduced on a fresh kernel. Using the
# same seed in every iteration also means every feature set is evaluated on
# the same held-out nodes.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
norm_f1_scores2 = []
y_data = node_labels  # one class label per node, from the labelling cell above
for res in obj:
    start_time = time.time()

    # res[1] supplies one sequence per node; each is sorted in descending
    # order before being used as that node's feature vector.
    X_data = [sorted(row, reverse=True) for row in res[1]]

    # 80/20 hold-out split, identical for every iteration thanks to the seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # fit() retrains the forest from scratch on each call, so reusing `clf`
    # across iterations does not leak state between feature sets.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Micro-averaged F1 on the held-out nodes (for single-label multiclass
    # problems this coincides with accuracy).
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 1.8900508880615234
Iteration completed in 11.135456085205078
Iteration completed in 21.731714487075806
Iteration completed in 35.84219694137573
Iteration completed in 40.106719732284546
Iteration completed in 43.95417380332947
Iteration completed in 42.817848682403564
Iteration completed in 41.02334189414978
Iteration completed in 39.19133496284485
Iteration completed in 41.52335739135742
Iteration completed in 40.12421703338623
Iteration completed in 40.92211675643921
Iteration completed in 38.6001718044281
Iteration completed in 37.67082619667053
Iteration completed in 35.78271269798279
Iteration completed in 36.111549377441406
Iteration completed in 39.48486542701721
Iteration completed in 40.33783578872681
Iteration completed in 40.44921278953552
Iteration completed in 42.15825176239014
Iteration completed in 42.246198654174805


### Heterogeneous intervals, part 2 (geometric upper bounds: 0.0001 × 1.5^k)

In [22]:
# Geometrically growing bins: label 1 covers scores up to 0.0001, and every
# following label's upper bound is 1.5 times the previous one. Each node gets
# the label of the first bound that is >= its score.
# The condition is written as `not (score <= upper)` on purpose: it keeps the
# comparison semantics of the bound check unchanged (a NaN score would never
# satisfy it, so the loop would not terminate for NaN input).
node_labels = []
for score in bet_n:
    label, upper = 1, 0.0001
    while not (score <= upper):
        label += 1
        upper *= 1.5
    node_labels.append(label)

In [23]:
# Class balance of the current labelling: number of nodes per bin label,
# most populated bin first.
label_counts = pd.Series(node_labels).value_counts()
label_counts

17    10852
16     9571
18     9154
15     7382
14     5082
19     3578
13     3124
12     1853
11     1103
20      935
10      712
9       416
8       248
21      207
7       147
6        84
22       50
5        38
4        17
23        7
1         5
3         4
2         2
24        2
dtype: int64

In [24]:
### Random forest classifier (70 trees), trained and scored once per item of `obj`
# A fixed seed makes both the train/test split and the forest deterministic,
# so the collected F1 scores can be reproduced on a fresh kernel. Using the
# same seed in every iteration also means every feature set is evaluated on
# the same held-out nodes.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
norm_f1_scores3 = []
y_data = node_labels  # one class label per node, from the labelling cell above
for res in obj:
    start_time = time.time()

    # res[1] supplies one sequence per node; each is sorted in descending
    # order before being used as that node's feature vector.
    X_data = [sorted(row, reverse=True) for row in res[1]]

    # 80/20 hold-out split, identical for every iteration thanks to the seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # fit() retrains the forest from scratch on each call, so reusing `clf`
    # across iterations does not leak state between feature sets.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Micro-averaged F1 on the held-out nodes (for single-label multiclass
    # problems this coincides with accuracy).
    norm_f1_scores3.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.0193185806274414
Iteration completed in 12.308707475662231
Iteration completed in 24.042680978775024
Iteration completed in 35.77869987487793
Iteration completed in 39.712952613830566
Iteration completed in 41.069998025894165
Iteration completed in 43.74008750915527
Iteration completed in 42.0943546295166
Iteration completed in 42.35682010650635
Iteration completed in 42.444491386413574
Iteration completed in 42.5054931640625
Iteration completed in 44.201284646987915
Iteration completed in 44.90153455734253
Iteration completed in 49.603527784347534
Iteration completed in 46.77486729621887
Iteration completed in 45.499577045440674
Iteration completed in 44.63865852355957
Iteration completed in 44.6585636138916
Iteration completed in 45.07160806655884
Iteration completed in 45.91462516784668
Iteration completed in 46.38460564613342


## Results

### Normalized PageRank

In [28]:
# Best micro-F1 for the uniform 0.05-wide bins, paired with the x-axis value
# at the position where it occurs (first occurrence if there are ties).
# The x values 0..9, 10, 12, ..., 30 are the same ones the comparison plot uses.
best_f1 = max(norm_f1_scores)
x_values = [*range(0, 10), *range(10, 32, 2)]
best_f1, x_values[norm_f1_scores.index(best_f1)]

(0.8200641319285386, 1)

In [29]:
# Best micro-F1 for the first heterogeneous binning, paired with the x-axis
# value at the position where it occurs (first occurrence if there are ties).
# The x values 0..9, 10, 12, ..., 30 are the same ones the comparison plot uses.
best_f1 = max(norm_f1_scores2)
x_values = [*range(0, 10), *range(10, 32, 2)]
best_f1, x_values[norm_f1_scores2.index(best_f1)]

(0.273843334860284, 1)

In [30]:
# Best micro-F1 for the second heterogeneous binning, paired with the x-axis
# value at the position where it occurs (first occurrence if there are ties).
# The x values 0..9, 10, 12, ..., 30 are the same ones the comparison plot uses.
best_f1 = max(norm_f1_scores3)
x_values = [*range(0, 10), *range(10, 32, 2)]
best_f1, x_values[norm_f1_scores3.index(best_f1)]

(0.4238204306000916, 1)

### Plot creation

In [31]:
# Switch matplotlib to the "pgf" backend and route text through LaTeX so the
# saved figures use serif, LaTeX-rendered labels. This affects every figure
# created after this cell.
# NOTE(review): this runs after pyplot was already imported and
# `%matplotlib inline` was enabled in the first cell. The captured output of
# the plotting cells below contains a bare `plt.show()` echo, which looks like
# a warning that the figure could not be shown with this backend — confirm.
# NOTE(review): 'text.usetex' and the pdflatex setting presumably require a
# working LaTeX installation on the machine running the notebook — verify.
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",  # LaTeX engine used by the pgf backend
    'font.family': 'serif',
    'text.usetex': True,          # render all text through LaTeX
    'pgf.rcfonts': False          # presumably leaves font choice to LaTeX rather than rcParams — verify
})

In [33]:
# Compare the micro-F1 of the three labelling schemes on one scatter plot.
# All three score lists share the same x positions: 0..9, then 10, 12, ..., 30.
plt.figure(figsize=(10, 6))
l = [*range(0, 10), *range(10, 32, 2)]

# (scores, legend entry) in plotting order; the order fixes the colour cycle.
f1_series = (
    (norm_f1_scores, "F1-score intervalli [a]"),
    (norm_f1_scores2, "F1-score intervalli [b]"),
    (norm_f1_scores3, "F1-score intervalli [c]"),
)
for scores, legend_text in f1_series:
    plt.plot(l, scores, "o", label=legend_text, markersize=10)

axis_label_style = {"fontsize": 22}
tick_style = {"fontsize": 16}

plt.xlabel("Iterazione", **axis_label_style)
plt.ylabel("F1-score", **axis_label_style)

plt.xticks(**tick_style)
plt.yticks(**tick_style)

# F1 lies in [0, 1]; pin the axis so the three series are comparable at a glance.
plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [35]:
### Distribution of the normalized PageRank scores (200-bin histogram)
serie = pd.Series(bet_n)

axis_label_style = {"fontsize": 22}
tick_style = {"fontsize": 16}

plt.figure(figsize=(10, 6))
plt.hist(serie, bins=200)
plt.xlabel("Pagerank", **axis_label_style)
plt.ylabel("Frequenza assoluta", **axis_label_style)
plt.xticks(**tick_style)
plt.yticks(**tick_style)

# Start slightly left of 0 so the first bar is not glued to the axis.
plt.xlim(-0.01, 1)

plt.savefig("Distribution_norm_PageRank.png", dpi=500)

plt.show()

  plt.show()
