In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)
H = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
nodes = []
l = list(nx.selfloop_edges(G))
for x, _ in l:
    if(G.degree(x) == 2): 
        G.remove_node(x)
        nodes.append(x)

In [5]:
len(list(G.nodes()))

778

In [6]:
i_time = time.time()
bet = G.in_degree()
print(f"Calculated degrees in: {time.time() - i_time}")

Calculated degrees in: 0.00013637542724609375


In [7]:
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in sorted(list(map(lambda x: int(x), list(G.nodes())))):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

In [8]:
obj = pd.read_pickle("./AVPRA_pred.pickled")

### Heterogeneous pt2

In [9]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [10]:
pd.Series(node_labels).value_counts()

19    142
20    124
18    118
21     80
17     65
22     62
16     54
15     32
23     29
13     15
14     14
12     13
11     12
1       7
9       6
24      5
dtype: int64

In [11]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.10703802108764648
Iteration completed in 0.12668442726135254
Iteration completed in 0.1705608367919922
Iteration completed in 0.18095874786376953
Iteration completed in 0.17346835136413574
Iteration completed in 0.17583703994750977
Iteration completed in 0.18499970436096191
Iteration completed in 0.1797342300415039
Iteration completed in 0.17625641822814941
Iteration completed in 0.16943836212158203
Iteration completed in 0.18207216262817383
Iteration completed in 0.17421269416809082
Iteration completed in 0.17383146286010742
Iteration completed in 0.17214012145996094
Iteration completed in 0.17267584800720215
Iteration completed in 0.16483330726623535
Iteration completed in 0.18000483512878418
Iteration completed in 0.16820549964904785
Iteration completed in 0.17044281959533691
Iteration completed in 0.1741657257080078
Iteration completed in 0.17400574684143066


### Heterogeneous intervals

In [12]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [13]:
pd.Series(node_labels).value_counts()

16    238
17    202
14     38
12     36
13     34
11     30
15     26
1      25
6      22
5      21
2      20
3      18
9      17
10     15
8      14
7      14
4       8
dtype: int64

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores2 = []
for res in obj:
    start_time = time.time()
     
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.09569883346557617
Iteration completed in 0.12926697731018066
Iteration completed in 0.16817855834960938
Iteration completed in 0.17737388610839844
Iteration completed in 0.17775177955627441
Iteration completed in 0.170914888381958
Iteration completed in 0.17280960083007812
Iteration completed in 0.18328189849853516
Iteration completed in 0.1775367259979248
Iteration completed in 0.1839005947113037
Iteration completed in 0.17738127708435059
Iteration completed in 0.17629384994506836
Iteration completed in 0.1766672134399414
Iteration completed in 0.18181419372558594
Iteration completed in 0.1723480224609375
Iteration completed in 0.17024016380310059
Iteration completed in 0.1717972755432129
Iteration completed in 0.16981029510498047
Iteration completed in 0.16955852508544922
Iteration completed in 0.1720430850982666
Iteration completed in 0.17127203941345215


In [15]:
max(f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[f1_scores.index(max(f1_scores))]

(0.6858974358974359, 1)

In [16]:
max(f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[f1_scores2.index(max(f1_scores2))]

(0.6730769230769231, 1)

### Graphs creation

In [17]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [18]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_INDegree.png", dpi=500)
plt.show()

  plt.show()


In [19]:
### Degree distrib function
serie = pd.Series(bet_l)
plt.figure(figsize=(10, 6))
plt.hist(serie, 100)
plt.xlabel("Grado", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_distrib_INDegree.png", dpi=500)

plt.show()

  plt.show()
