In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HU_edges_norm.csv")

In [3]:
i_time = time.time()
bet = nx.degree(G)
print(f"Computed degrees in: {time.time() - i_time}")

Computed degrees in: 0.00014662742614746094


In [4]:
### Pagerank scores normalization
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

In [5]:
obj = pd.read_pickle("./HU.pickled")

# Heterogeneous pt1

In [11]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [12]:
pd.Series(node_labels).value_counts()

17    9215
19    8755
18    7036
20    3938
14    3683
15    3655
16    3535
13    3411
1     2701
21    1265
22     288
23      49
24       7
dtype: int64

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.128838062286377
Iteration completed in 9.356529951095581
Iteration completed in 20.551236629486084
Iteration completed in 25.821082830429077
Iteration completed in 34.31259632110596
Iteration completed in 36.6932647228241
Iteration completed in 36.86256551742554
Iteration completed in 40.12534213066101
Iteration completed in 37.17947864532471
Iteration completed in 38.52386260032654
Iteration completed in 39.454545736312866
Iteration completed in 40.11328911781311
Iteration completed in 53.7935905456543
Iteration completed in 46.74693036079407
Iteration completed in 54.27530097961426
Iteration completed in 47.217692852020264
Iteration completed in 44.403599977493286
Iteration completed in 43.97941255569458
Iteration completed in 43.289087533950806
Iteration completed in 39.38067674636841
Iteration completed in 39.98015594482422


### Heterogeneous pt2

In [6]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [7]:
pd.Series(node_labels).value_counts()

16    10235
15     4038
4      3683
6      3655
8      3535
10     3422
2      3411
11     3015
12     2778
1      2701
13     2588
14     2346
17     2131
dtype: int64

In [8]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.16092586517334
Iteration completed in 9.20356559753418
Iteration completed in 19.53793239593506
Iteration completed in 27.542187690734863
Iteration completed in 33.51002788543701
Iteration completed in 36.41244292259216
Iteration completed in 40.93513345718384
Iteration completed in 42.06718587875366
Iteration completed in 39.80843901634216
Iteration completed in 37.15682077407837
Iteration completed in 38.79934597015381
Iteration completed in 38.056703329086304
Iteration completed in 37.15167236328125
Iteration completed in 37.17699098587036
Iteration completed in 36.7239944934845
Iteration completed in 36.611796855926514
Iteration completed in 36.94538640975952
Iteration completed in 39.31953191757202
Iteration completed in 38.347946643829346
Iteration completed in 38.11800980567932
Iteration completed in 38.626651763916016


## Results

In [14]:
max(norm_f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores.index(max(norm_f1_scores))]

(0.7486327303323517, 1)

In [10]:
max(norm_f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores2.index(max(norm_f1_scores2))]

(0.7344341607067733, 1)

### Graphs creation

In [15]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [16]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, norm_f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, norm_f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_Degree.png", dpi=500)
plt.show()

  plt.show()


In [17]:
### Degree distrib function
serie = pd.Series(bet_l)
plt.figure(figsize=(10, 6))
plt.hist(serie, 100)
plt.xlabel("Grado", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_distrib_Degree.png", dpi=500)

plt.show()

  plt.show()
