In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
# Build a directed graph from the edge-list file. No nodetype is given, so node
# labels stay strings; later cells rely on that when they look nodes up via str(...).
edges_path = "./EDGES_FILE.csv"
G = nx.read_edgelist(edges_path, create_using=nx.DiGraph)

In [3]:
# Number of nodes right after loading (before any node is removed)
G.number_of_nodes()

956

In [4]:
# Remove every self-looped node whose total degree is exactly 2 and keep the removed
# ids in `nodes` (later cells use this list to drop the matching feature rows).
# In a DiGraph a lone self-loop already counts as in-degree 1 + out-degree 1, so a
# degree of 2 means the self-loop is the node's only edge; removing such a node
# therefore cannot change the degree of any other node.
# NOTE: this cell mutates G in place. Re-running it resets `nodes` to an empty list
# and breaks the row filtering further down; reload G before re-running.
self_loop_nodes = [src for src, _dst in nx.selfloop_edges(G)]
nodes = [node for node in self_loop_nodes if G.degree(node) == 2]
G.remove_nodes_from(nodes)

In [5]:
# Number of nodes left after the self-loop-only nodes were removed
G.number_of_nodes()

778

In [7]:
# Fetch the degree view of G and report how long that took.
# NOTE: despite its name, `bet` holds node degrees (in + out for a DiGraph).
t0 = time.time()
bet = G.degree()
elapsed = time.time() - t0
print(f"Calculated degrees in: {elapsed}")

Calculated degrees in: 0.00011229515075683594


In [8]:
# Min-max normalise every node's degree to [0, 1] and store the values in `bet_l`,
# ordered by the numeric value of the node id.
degree_of = dict(bet)  # materialise the degree view once instead of once per extreme

# `default=0` keeps an empty graph from raising; bet_l is then simply empty.
max_v = max(degree_of.values(), default=0)
min_v = min(degree_of.values(), default=0)
degree_span = max_v - min_v

# Sort the original labels by their integer value and index with the label itself:
# converting to int and back to str would miss labels such as "01".
# When every node has the same degree the span is 0; map all nodes to 0.0 rather
# than dividing by zero.
bet_l = [
    (degree_of[node] - min_v) / degree_span if degree_span else 0.0
    for node in sorted(G.nodes(), key=int)
]

In [9]:
# Load the pickled prediction results. Later cells iterate over `obj` and read
# element [1] of each entry as the per-node feature rows.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a
# trusted source.
predictions_path = "./AVPRA_pred.pickled"
obj = pd.read_pickle(predictions_path)

### Heterogeneous 1

In [10]:
# Label scheme [a]: geometrically growing buckets.
# Label k is the first k with value <= 0.0001 * 1.5 ** (k - 1); the threshold is
# grown by repeated multiplication so the float sequence matches exactly.
node_labels = []
for degree_norm in bet_l:
    threshold = 0.0001
    label = 1
    # Written as `not (<=)` rather than `>` so the comparison semantics stay
    # exactly those of the original test.
    while not (degree_norm <= threshold):
        label += 1
        threshold *= 1.5
    node_labels.append(label)

In [11]:
# Class balance: how many nodes received each label (most frequent first)
label_counts = pd.Series(node_labels).value_counts()
label_counts

22    140
21    139
20    113
19    100
23     80
18     60
17     36
16     29
15     27
24     20
13     10
14      9
12      5
11      4
9       4
1       2
dtype: int64

In [12]:
### Random forest (70 trees): one train/test round per entry of `obj`, scored with micro-F1
# A single seeded generator drives both the splits and the forests, so a
# Restart & Run All reproduces the scores while each round still draws its own split.
SEED = 42
rng = np.random.RandomState(SEED)
clf = RandomForestClassifier(n_estimators=70, random_state=rng)

# Set membership keeps the per-row "was this node removed?" check O(1).
removed_nodes = set(nodes)
# The labels do not change between rounds, so bind them once.
y_data = node_labels

f1_scores = []
for res in obj:
    start_time = time.time()

    # Row i of res[1] is taken to belong to node id i + 1 (the same assumption the
    # filter has always made). Rows of removed nodes are skipped and every kept
    # row is sorted in descending order.
    X_data = [
        sorted(row, reverse=True)
        for row_idx, row in enumerate(res[1])
        if str(row_idx + 1) not in removed_nodes
    ]
    if len(X_data) != len(y_data):
        # Typical cause: the node-removal cell was re-run and `nodes` is now stale.
        raise ValueError(
            f"{len(X_data)} feature rows but {len(y_data)} labels; "
            "re-run the notebook from the top so `nodes` matches the graph"
        )

    # Split the data into training set (80%) and test set (20%)
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=rng
    )

    # fit() retrains the forest from scratch on every call
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Micro-averaged F1 (equal to accuracy for single-label multiclass targets)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.14876961708068848
Iteration completed in 0.1455073356628418
Iteration completed in 0.20317316055297852
Iteration completed in 0.1983191967010498
Iteration completed in 0.20823407173156738
Iteration completed in 0.20447468757629395
Iteration completed in 0.18030881881713867
Iteration completed in 0.17904067039489746
Iteration completed in 0.17850303649902344
Iteration completed in 0.18823027610778809
Iteration completed in 0.173598051071167
Iteration completed in 0.19966387748718262
Iteration completed in 0.19762802124023438
Iteration completed in 0.20221972465515137
Iteration completed in 0.16446924209594727
Iteration completed in 0.1712474822998047
Iteration completed in 0.1674344539642334
Iteration completed in 0.17160797119140625
Iteration completed in 0.16859078407287598
Iteration completed in 0.18669605255126953
Iteration completed in 0.18907546997070312


### Heterogeneous 2

In [13]:
# Label scheme [b]: 17 fixed buckets over the normalised value.
#   labels 1-10  : upper bounds 0.005, 0.010, ..., 0.050
#   labels 11-15 : upper bounds 0.06, 0.07, ..., 0.10
#   label  16    : upper bound 0.2
#   label  17    : everything above 0.2
# The bounds are built with the same float expressions as before so that values
# sitting exactly on a boundary land in the same bucket.
upper_bounds = (
    [0.005 * step for step in range(1, 11)]
    + [0.05 + 0.01 * step for step in range(1, 6)]
    + [0.2]
)
overflow_label = len(upper_bounds) + 1

# Each value gets the label of the first bound it does not exceed.
node_labels = [
    next(
        (label for label, bound in enumerate(upper_bounds, start=1) if value <= bound),
        overflow_label,
    )
    for value in bet_l
]

In [14]:
# Class balance: how many nodes received each label (most frequent first)
label_counts = pd.Series(node_labels).value_counts()
label_counts

17    409
16    179
14     28
13     20
11     20
5      19
3      13
12     13
15     12
8      12
7      11
1      10
6       8
9       8
4       6
2       5
10      5
dtype: int64

In [15]:
### Random forest (70 trees): one train/test round per entry of `obj`, scored with micro-F1
# A single seeded generator drives both the splits and the forests, so a
# Restart & Run All reproduces the scores while each round still draws its own split.
SEED = 42
rng = np.random.RandomState(SEED)
clf = RandomForestClassifier(n_estimators=70, random_state=rng)

# Set membership keeps the per-row "was this node removed?" check O(1).
removed_nodes = set(nodes)
# The labels do not change between rounds, so bind them once.
y_data = node_labels

f1_scores2 = []
for res in obj:
    start_time = time.time()

    # Row i of res[1] is taken to belong to node id i + 1 (the same assumption the
    # filter has always made). Rows of removed nodes are skipped and every kept
    # row is sorted in descending order.
    X_data = [
        sorted(row, reverse=True)
        for row_idx, row in enumerate(res[1])
        if str(row_idx + 1) not in removed_nodes
    ]
    if len(X_data) != len(y_data):
        # Typical cause: the node-removal cell was re-run and `nodes` is now stale.
        raise ValueError(
            f"{len(X_data)} feature rows but {len(y_data)} labels; "
            "re-run the notebook from the top so `nodes` matches the graph"
        )

    # Split the data into training set (80%) and test set (20%)
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=rng
    )

    # fit() retrains the forest from scratch on every call
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Micro-averaged F1 (equal to accuracy for single-label multiclass targets)
    f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.10262012481689453
Iteration completed in 0.1337568759918213
Iteration completed in 0.19056129455566406
Iteration completed in 0.18924164772033691
Iteration completed in 0.17859578132629395
Iteration completed in 0.1719827651977539
Iteration completed in 0.1826319694519043
Iteration completed in 0.18019509315490723
Iteration completed in 0.17810511589050293
Iteration completed in 0.16753911972045898
Iteration completed in 0.18167471885681152
Iteration completed in 0.19401240348815918
Iteration completed in 0.16983985900878906
Iteration completed in 0.16329264640808105
Iteration completed in 0.1760258674621582
Iteration completed in 0.16498637199401855
Iteration completed in 0.17139244079589844
Iteration completed in 0.18423962593078613
Iteration completed in 0.17844223976135254
Iteration completed in 0.16362571716308594
Iteration completed in 0.17998862266540527


In [16]:
# Best micro-F1 for scheme [a] and the x-axis value (0..9, then 10, 12, ..., 30)
# of the round that reached it; the first such round wins on ties.
iteration_axis = list(range(0, 10)) + list(range(10, 32, 2))
best_f1 = max(f1_scores)
best_f1, iteration_axis[f1_scores.index(best_f1)]

(0.2948717948717949, 10)

In [17]:
# Best micro-F1 for scheme [b] and the x-axis value (0..9, then 10, 12, ..., 30)
# of the round that reached it; the first such round wins on ties.
iteration_axis = list(range(0, 10)) + list(range(10, 32, 2))
best_f1_2 = max(f1_scores2)
best_f1_2, iteration_axis[f1_scores2.index(best_f1_2)]

(0.6089743589743589, 10)

### Plot creation (F1-score comparison and degree distribution)

In [18]:
import matplotlib

# Switch to the LaTeX-driven pgf backend for the figures saved below.
# NOTE(review): pgf is a non-GUI backend, so plt.show() in the following cells
# cannot display anything and only emits a warning; the figures are
# produced by plt.savefig alone.
matplotlib.use("pgf")

latex_settings = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(latex_settings)

In [19]:
# Scatter the per-round micro-F1 of both label schemes on one figure.
plt.figure(figsize=(10, 6))

# x-axis values attached to the 21 rounds: 0..9, then 10, 12, ..., 30
iteration_axis = list(range(0, 10)) + list(range(10, 32, 2))
labelled_scores = (
    (f1_scores, "F1-score intervalli [a]"),
    (f1_scores2, "F1-score intervalli [b]"),
)
for scores, caption in labelled_scores:
    plt.plot(iteration_axis, scores, "o", label=caption, markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

# Tick font sizes are set before the axis limits, as in the original call order.
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

# F1 lies in [0, 1]; fix the range so both schemes share the same scale.
plt.ylim(0, 1)

plt.legend(loc="upper right", prop=dict(size=16))

plt.savefig("Micro_comparison_Degree.png", dpi=500)
plt.show()

  plt.show()


In [20]:
### Histogram (100 bins) of the normalised degree values in bet_l
serie = pd.Series(bet_l)

plt.figure(figsize=(10, 6))
plt.hist(serie, bins=100)

plt.xlabel("Grado", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)

# Tick font sizes are set before the axis limits, as in the original call order.
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

# Start slightly left of 0 so the first bar is not glued to the axis.
plt.xlim(-0.01, 1)

plt.savefig("Distribution_distrib_Degree.png", dpi=500)
plt.show()

  plt.show()
