In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./edges_norm.csv")

In [3]:
i_time = time.time()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.time() - i_time}")

Computed pagerank in: 30.147184133529663


In [4]:
bet_l = []
max_v = max(bet.values())
min_v = min(bet.values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

In [5]:
l = list(range(0, 10)) + list(range(10, 30, 2))

# Heterogeneous intervals

In [6]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [7]:
pd.Series(node_labels).value_counts()

1     158355
2       6524
3       1529
4        587
5        312
6        188
7        114
8         81
16        77
9         70
11        64
10        53
12        47
13        37
17        32
15        25
14        19
dtype: int64

### AVPRA all feat

In [8]:
obj = pd.read_pickle("./All_feat/log_trial_0_LPStates.pickled") + pd.read_pickle("./All_feat/log_trial_1_LPStates.pickled")[1:] + \
    pd.read_pickle("./All_feat/log_trial_2_LPStates.pickled")[1:] + pd.read_pickle("./All_feat/log_trial_3_LPStates.pickled")[1:]

In [9]:
obj = [obj[i] for i in l]

In [10]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1_scores_all_1 = []
for res in obj:
    start_time = time.time()
   
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores_all_1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    
    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 2.6748414039611816
Completed iteration in 9.186293601989746
Completed iteration in 49.772648334503174
Completed iteration in 47.48737597465515
Completed iteration in 49.91695761680603
Completed iteration in 47.5269033908844
Completed iteration in 51.45666003227234
Completed iteration in 46.05638003349304
Completed iteration in 47.022024393081665
Completed iteration in 45.344332456588745
Completed iteration in 47.08179998397827
Completed iteration in 49.83602261543274
Completed iteration in 50.85501003265381
Completed iteration in 50.79762840270996
Completed iteration in 50.53306555747986
Completed iteration in 55.408421993255615
Completed iteration in 61.13941407203674
Completed iteration in 83.37274479866028
Completed iteration in 173.64308714866638
Completed iteration in 188.39622282981873


### AVPRA only lang

In [11]:
obj = []

In [12]:
obj = pd.read_pickle("./Only_lang/log_trial_0_LPStates.pickled") + pd.read_pickle("./Only_lang/log_trial_1_LPStates.pickled")[1:] + \
    pd.read_pickle("./Only_lang/log_trial_2_LPStates.pickled")[1:] + pd.read_pickle("./Only_lang/log_trial_3_LPStates.pickled")[1:]

In [13]:
obj = [obj[i] for i in l]

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1_scores_only_1 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores_only_1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    
    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 2.3594810962677
Completed iteration in 6.246383428573608
Completed iteration in 44.73842763900757
Completed iteration in 42.1638298034668
Completed iteration in 43.96728277206421
Completed iteration in 40.94746994972229
Completed iteration in 42.12675380706787
Completed iteration in 41.21927785873413
Completed iteration in 40.98764085769653
Completed iteration in 41.88897156715393
Completed iteration in 41.29587912559509
Completed iteration in 41.00460648536682
Completed iteration in 41.34976601600647
Completed iteration in 41.99521446228027
Completed iteration in 41.344868421554565
Completed iteration in 43.32020902633667
Completed iteration in 41.75895643234253
Completed iteration in 43.02928566932678
Completed iteration in 46.468817949295044
Completed iteration in 55.403444051742554


# Heterogeneous pt2

In [15]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [16]:
pd.Series(node_labels).value_counts()

7     23107
8     21904
6     21459
5     17279
9     17164
4     13011
1     12160
10    11682
3      9367
11     6817
2      6408
12     3680
13     1950
14      968
15      514
16      274
17      160
18       98
19       63
20       26
21       14
22        4
24        3
23        2
dtype: int64

### AVPRA all feat

In [17]:
obj = []

In [18]:
obj = pd.read_pickle("./All_feat/log_trial_0_LPStates.pickled") + pd.read_pickle("./All_feat/log_trial_1_LPStates.pickled")[1:] + \
    pd.read_pickle("./All_feat/log_trial_2_LPStates.pickled")[1:] + pd.read_pickle("./All_feat/log_trial_3_LPStates.pickled")[1:]

In [19]:
obj = [obj[i] for i in l]

In [20]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1_scores_all_2 = []
for res in obj:
    start_time = time.time()
   
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores_all_2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    
    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 3.092564105987549
Completed iteration in 13.548569440841675
Completed iteration in 65.6003668308258
Completed iteration in 70.37129282951355
Completed iteration in 66.8019630908966
Completed iteration in 70.06498265266418
Completed iteration in 67.22136425971985
Completed iteration in 71.7561867237091
Completed iteration in 68.17057418823242
Completed iteration in 72.73516726493835
Completed iteration in 67.80450344085693
Completed iteration in 70.23805594444275
Completed iteration in 69.88181352615356
Completed iteration in 72.5732524394989
Completed iteration in 76.16222953796387
Completed iteration in 78.33146500587463
Completed iteration in 82.08087158203125
Completed iteration in 100.15921425819397
Completed iteration in 162.3116054534912
Completed iteration in 167.76312136650085


### AVPRA only lang

In [21]:
obj = []

In [22]:
obj = pd.read_pickle("./Only_lang/log_trial_0_LPStates.pickled") + pd.read_pickle("./Only_lang/log_trial_1_LPStates.pickled")[1:] + \
    pd.read_pickle("./Only_lang/log_trial_2_LPStates.pickled")[1:] + pd.read_pickle("./Only_lang/log_trial_3_LPStates.pickled")[1:]

In [23]:
obj = [obj[i] for i in l]

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1_scores_only_2 = []
for res in obj:
    start_time = time.time()
    
    X_data = list(map(lambda x: sorted(x, reverse=True), res[1]))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores_only_2.append(metrics.f1_score(y_test, y_pred, average="micro"))
    
    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 2.7532527446746826
Completed iteration in 8.799601554870605
Completed iteration in 55.822981119155884
Completed iteration in 60.128320932388306
Completed iteration in 57.36158728599548
Completed iteration in 59.590903520584106
Completed iteration in 59.477296590805054
Completed iteration in 61.048754930496216
Completed iteration in 58.45052218437195
Completed iteration in 58.6615035533905
Completed iteration in 59.534000396728516
Completed iteration in 58.760990381240845
Completed iteration in 59.250404596328735
Completed iteration in 62.17939853668213
Completed iteration in 58.82445406913757
Completed iteration in 57.70843839645386
Completed iteration in 59.701101541519165
Completed iteration in 60.22072982788086
Completed iteration in 61.98526954650879
Completed iteration in 69.58278965950012


### Results

### Only lang

In [25]:
max(f1_scores_only_1), l[f1_scores_only_1.index(max(f1_scores_only_1))]

(0.9754929661243792, 1)

In [26]:
max(f1_scores_only_2), l[f1_scores_only_2.index(max(f1_scores_only_2))]

(0.4168872497992446, 1)

### All features

In [27]:
max(f1_scores_all_1), l[f1_scores_all_1.index(max(f1_scores_all_1))]

(0.9780804806233828, 1)

In [28]:
max(f1_scores_all_2), l[f1_scores_all_2.index(max(f1_scores_all_2))]

(0.5870088927222437, 1)

### Graphs

In [29]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [30]:
### PageRank distrib function
serie = pd.Series(bet_l)
plt.figure(figsize=(10, 6))
plt.hist(serie, 200)
plt.xlabel("Pagerank", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.xlim(-0.01, 1)

plt.savefig("Distribution_norm_PageRank.png", dpi=500)

plt.show()

  plt.show()


In [31]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 30, 2))
plt.plot(l, f1_scores_all_1, "o", label="F1-score intervalli [b]", markersize=10)
plt.plot(l, f1_scores_all_2, "o", label="F1-score intervalli [c]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_allF_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [32]:
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 30, 2))
plt.plot(l, f1_scores_only_1, "o", label="F1-score intervalli [b]", markersize=10)
plt.plot(l, f1_scores_only_2, "o", label="F1-score intervalli [c]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_onlyL_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [33]:
### With o and x
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 30, 2))
plt.plot(l, f1_scores_only_1, "o", label="AVPRA F1-score intervalli [b]", markersize=10)
plt.plot(l, f1_scores_only_2, "o", label="AVPRA F1-score intervalli [c]", markersize=10)

plt.plot(l, f1_scores_all_1, "x", label="AVPRA* F1-score intervalli [b]", color="blue", markersize=12)
plt.plot(l, f1_scores_all_2, "x", label="AVPRA* F1-score intervalli [c]", color="red", markersize=12)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_both_PageRank.png", dpi=500)
plt.show()

  plt.show()


In [34]:
### With o
plt.figure(figsize=(10, 6))
l = list(range(0, 10)) + list(range(10, 30, 2))
plt.plot(l, f1_scores_only_1, "o", label="AVPRA F1-score intervalli [b]", markersize=10)
plt.plot(l, f1_scores_only_2, "o", label="AVPRA F1-score intervalli [c]", markersize=10)

plt.plot(l, f1_scores_all_1, "o", label="AVPRA* F1-score intervalli [b]", markersize=10)
plt.plot(l, f1_scores_all_2, "o", label="AVPRA* F1-score intervalli [c]", markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylim(0, 1)

plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_both_O_PageRank.png", dpi=500)
plt.show()

  plt.show()
