In [3]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [4]:
G = nx.read_edgelist("./RO_edges_norm.csv")

In [15]:
i_time = time.time()
bet = nx.degree(G)
print(f"Computed degrees in: {time.time() - i_time}")

Computed degrees in: 0.0015604496002197266


In [16]:
### Pagerank scores normalization
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

In [10]:
obj = pd.read_pickle("./RO.pickled")

# Heterogeneous pt1

In [31]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [32]:
pd.Series(node_labels).value_counts()

17    8333
13    5632
1     5430
14    5056
18    4484
15    4404
16    3794
19    3096
20    1055
21     360
22     104
23      22
24       3
dtype: int64

In [33]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 1.8270151615142822
Iteration completed in 7.2891881465911865
Iteration completed in 13.702331781387329
Iteration completed in 19.48780345916748
Iteration completed in 25.0395405292511
Iteration completed in 27.020980834960938
Iteration completed in 31.0756413936615
Iteration completed in 31.44975757598877
Iteration completed in 32.46914482116699
Iteration completed in 34.3626184463501
Iteration completed in 32.63159799575806
Iteration completed in 34.621395111083984
Iteration completed in 43.67966628074646
Iteration completed in 49.00006604194641
Iteration completed in 49.04688310623169
Iteration completed in 48.65290284156799
Iteration completed in 47.98635268211365
Iteration completed in 43.86401104927063
Iteration completed in 40.58526563644409
Iteration completed in 46.85686755180359
Iteration completed in 42.93748116493225


# Heterogeneous pt2

In [18]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [19]:
pd.Series(node_labels).value_counts()

2     5632
1     5430
4     5056
6     4404
8     3794
10    3257
16    3181
11    2741
12    2335
15    1961
13    1868
14    1479
17     635
dtype: int64

In [20]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 1.637575626373291
Iteration completed in 5.341165542602539
Iteration completed in 12.44843077659607
Iteration completed in 17.106887817382812
Iteration completed in 22.20210838317871
Iteration completed in 25.34640908241272
Iteration completed in 28.547587394714355
Iteration completed in 28.984711408615112
Iteration completed in 28.26998996734619
Iteration completed in 27.593394994735718
Iteration completed in 28.389107704162598
Iteration completed in 31.58970022201538
Iteration completed in 34.052584171295166
Iteration completed in 34.714240312576294
Iteration completed in 35.43861794471741
Iteration completed in 38.209107637405396
Iteration completed in 39.55060887336731
Iteration completed in 42.418264389038086
Iteration completed in 49.012184381484985
Iteration completed in 36.67137575149536
Iteration completed in 42.547062158584595


## Results

In [35]:
max(norm_f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores.index(max(norm_f1_scores))]

(0.8090963494913226, 1)

In [24]:
max(norm_f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores2.index(max(norm_f1_scores2))]

(0.76229802513465, 1)

### Graphs creation

In [34]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [36]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, norm_f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, norm_f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_Degree.png", dpi=500)
plt.show()

  plt.show()


In [41]:
### Degree distrib function
serie = pd.Series(bet_l)
plt.figure(figsize=(10, 6))
plt.hist(serie, 100)
plt.xlabel("Grado", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_distrib_Degree.png", dpi=500)

plt.show()

  plt.show()
