In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HR_edges_norm.csv")

In [3]:
i_time = time.time()
bet = nx.degree(G)
print(f"Computed degrees in: {time.time() - i_time}")

Computed degrees in: 8.177757263183594e-05


In [4]:
### Pagerank scores normalization
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

In [5]:
obj = pd.read_pickle("./HR.pickled")

# Heterogeneous pt1

In [6]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [7]:
pd.Series(node_labels).value_counts()

17    7602
16    7186
18    7146
15    6526
14    6289
13    4977
19    3084
9     2716
11    2693
12    2654
1     2330
20    1034
21     267
22      60
23       6
24       3
dtype: int64

In [8]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.3661346435546875
Iteration completed in 13.497879266738892
Iteration completed in 25.271319150924683
Iteration completed in 37.75277304649353
Iteration completed in 42.477304220199585
Iteration completed in 45.98876667022705
Iteration completed in 53.26049757003784
Iteration completed in 46.13143587112427
Iteration completed in 45.115803956985474
Iteration completed in 48.62360763549805
Iteration completed in 48.00973725318909
Iteration completed in 49.44280672073364
Iteration completed in 46.85432720184326
Iteration completed in 49.56969928741455
Iteration completed in 50.198171854019165
Iteration completed in 47.45077180862427
Iteration completed in 46.96292543411255
Iteration completed in 45.705955505371094
Iteration completed in 45.34983730316162
Iteration completed in 46.267860412597656
Iteration completed in 41.866870641708374


# Heterogeneous pt2

In [9]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [10]:
pd.Series(node_labels).value_counts()

1     7739
2     5214
3     4663
11    4171
4     4043
16    3965
5     3526
6     3000
12    2916
7     2586
8     2448
13    2422
9     2152
10    1902
14    1859
15    1478
17     489
dtype: int64

In [11]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
norm_f1_scores2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    norm_f1_scores2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.3731257915496826
Iteration completed in 12.735950231552124
Iteration completed in 23.863534450531006
Iteration completed in 35.517030239105225
Iteration completed in 41.521759271621704
Iteration completed in 48.32052755355835
Iteration completed in 50.84865093231201
Iteration completed in 51.97603368759155
Iteration completed in 51.795809507369995
Iteration completed in 52.98626375198364
Iteration completed in 52.80021834373474
Iteration completed in 51.713135719299316
Iteration completed in 52.82065486907959
Iteration completed in 52.838236808776855
Iteration completed in 52.21218991279602
Iteration completed in 51.93068075180054
Iteration completed in 48.81129312515259
Iteration completed in 47.34089255332947
Iteration completed in 46.3666775226593
Iteration completed in 46.78534507751465
Iteration completed in 45.59001326560974


## Results

In [12]:
max(norm_f1_scores), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores.index(max(norm_f1_scores))]

(0.6829134218964727, 1)

In [13]:
max(norm_f1_scores2), (list(range(0, 10)) + list(range(10, 32, 2)))[norm_f1_scores2.index(max(norm_f1_scores2))]

(0.5521759047182776, 1)

### Graphs creation

In [14]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [15]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 32, 2))
plt.plot(l, norm_f1_scores, "o", label="F1-score intervalli [a]", markersize=10)
plt.plot(l, norm_f1_scores2, "o", label="F1-score intervalli [b]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="upper right", prop={'size': 16})

plt.savefig("Micro_comparison_norm_Degree.png", dpi=500)
plt.show()

  plt.show()


In [16]:
### Degree distrib function
serie = pd.Series(bet_l)
plt.figure(figsize=(10, 6))
plt.hist(serie, 100)
plt.xlabel("Grado", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_distrib_Degree.png", dpi=500)

plt.show()

  plt.show()
