In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
# Build the graph from the edge-list file. With no `delimiter` argument
# read_edgelist splits each line on whitespace, so the file is assumed to be
# whitespace-separated despite its .csv extension -- TODO confirm.
edge_list_path = "./edges_norm.csv"
G = nx.read_edgelist(edge_list_path)

In [3]:
# Time the degree lookup. Despite its name, `bet` holds the result of
# nx.degree(G) (node -> degree), not a betweenness measure.
started_at = time.time()
bet = nx.degree(G)
elapsed = time.time() - started_at
print(f"Computed degrees in: {elapsed}")

Computed degrees in: 8.0108642578125e-05


In [4]:
### Degrees normalization
# Min-max scale every node degree into [0, 1].
# The result is ordered by node id "1", "2", ..., "N" (ids are looked up as
# strings), so position k of `bet_l` belongs to node str(k + 1).
degree_of = dict(bet)  # materialise the degree view once instead of twice
max_v = max(degree_of.values())
min_v = min(degree_of.values())
degree_span = max_v - min_v
bet_l = [
    # When every node has the same degree the span is 0; map all nodes to 0.0
    # instead of raising ZeroDivisionError.
    (degree_of[str(i)] - min_v) / degree_span if degree_span else 0.0
    for i in range(1, len(degree_of) + 1)
]

In [5]:
# Positions of the snapshots to evaluate: every one of the first ten, then
# every second one from 10 up to 28.
l = [*range(10), *range(10, 30, 2)]

# Heterogeneous 1 (intervals [a]: geometrically growing degree bins)

In [15]:
# Label each node with the index of the first geometric bin that contains its
# normalised degree: bin 1 ends at 0.0001 and every following upper bound is
# 1.5 times the previous one.
node_labels = []
for norm_degree in bet_l:
    upper_bound = 0.0001
    bin_index = 1
    # Grow the bound step by step (repeated multiplication, not a power) until
    # the value fits under it.
    while not (norm_degree <= upper_bound):
        bin_index += 1
        upper_bound *= 1.5
    node_labels.append(bin_index)

In [16]:
# Number of nodes per label, most frequent label first.
label_counts = pd.Series(node_labels).value_counts()
label_counts

7     20942
8     19965
6     19176
9     16972
5     15650
1     15565
10    12813
4     12199
11     8830
2      7157
3      6692
12     5503
13     3085
14     1678
15      857
16      452
17      267
18      139
19       86
20       48
21       23
22        7
24        4
23        4
dtype: int64

### AVPRA all feat

In [17]:
# Reset `obj` to an empty list; the following cell assigns the loaded data.
obj = list()

In [18]:
# Concatenate the four "All_feat" trial logs into one sequence.
# Trial 0 is kept whole; every later trial is appended without its first
# element (presumably it repeats the last state of the previous trial --
# TODO confirm).
# SECURITY: unpickling runs arbitrary code stored in the file; only load
# pickles produced by a trusted source.
trial_files = (
    "./All_feat/log_trial_0_LPStates.pickled",
    "./All_feat/log_trial_1_LPStates.pickled",
    "./All_feat/log_trial_2_LPStates.pickled",
    "./All_feat/log_trial_3_LPStates.pickled",
)
merged_states = pd.read_pickle(trial_files[0])
for trial_file in trial_files[1:]:
    merged_states = merged_states + pd.read_pickle(trial_file)[1:]
obj = merged_states

In [19]:
# Keep only the snapshots at the positions listed in `l`.
# NOTE: this overwrites `obj`, so the cell is not idempotent -- running it a
# second time without reloading indexes the already reduced 20-item list with
# positions up to 28 and fails with IndexError.
picked_states = []
for position in l:
    picked_states.append(obj[position])
obj = picked_states

In [20]:
### Random forest classifier creation with 70 trees
# A fixed seed makes the forest and the train/test split reproducible across
# kernel restarts; without it every run reports different F1 values.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)

# One micro-averaged F1 score per entry of `obj`, in the same order.
f1_scores_all_2 = []
for res in obj:
    start_time = time.time()

    # Each row of res[1] is sorted in descending order, so a feature is
    # "the k-th largest value of the row" rather than a fixed position.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # Split the data into training set and test set (80% / 20%). The labels,
    # and therefore the sample count, are the same in every iteration, so the
    # fixed seed yields the same partition each time and the scores of
    # different iterations are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # The same estimator object is fitted again in every iteration.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 (coincides with accuracy for single-label multiclass)
    f1_scores_all_2.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 3.295698881149292
Completed iteration in 14.949729204177856
Completed iteration in 68.4543788433075
Completed iteration in 75.6669020652771
Completed iteration in 71.10734844207764
Completed iteration in 76.8880672454834
Completed iteration in 71.19984030723572
Completed iteration in 74.14402294158936
Completed iteration in 70.19854021072388
Completed iteration in 73.52725791931152
Completed iteration in 70.44321036338806
Completed iteration in 71.62367868423462
Completed iteration in 73.54709696769714
Completed iteration in 75.40009808540344
Completed iteration in 82.35192799568176
Completed iteration in 85.70594906806946
Completed iteration in 89.42560267448425
Completed iteration in 102.54922676086426
Completed iteration in 182.83702325820923
Completed iteration in 175.81162977218628


### AVPRA only lang

In [21]:
# Reset `obj` to an empty list; the following cell assigns the loaded data.
obj = list()

In [22]:
# Concatenate the four "Only_lang" trial logs into one sequence.
# Trial 0 is kept whole; every later trial is appended without its first
# element (presumably it repeats the last state of the previous trial --
# TODO confirm).
# SECURITY: unpickling runs arbitrary code stored in the file; only load
# pickles produced by a trusted source.
trial_files = (
    "./Only_lang/log_trial_0_LPStates.pickled",
    "./Only_lang/log_trial_1_LPStates.pickled",
    "./Only_lang/log_trial_2_LPStates.pickled",
    "./Only_lang/log_trial_3_LPStates.pickled",
)
merged_states = pd.read_pickle(trial_files[0])
for trial_file in trial_files[1:]:
    merged_states = merged_states + pd.read_pickle(trial_file)[1:]
obj = merged_states

In [23]:
# Keep only the snapshots at the positions listed in `l`.
# NOTE: this overwrites `obj`, so the cell is not idempotent -- running it a
# second time without reloading indexes the already reduced 20-item list with
# positions up to 28 and fails with IndexError.
picked_states = []
for position in l:
    picked_states.append(obj[position])
obj = picked_states

In [24]:
### Random forest classifier creation with 70 trees
# A fixed seed makes the forest and the train/test split reproducible across
# kernel restarts; without it every run reports different F1 values.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)

# One micro-averaged F1 score per entry of `obj`, in the same order.
f1_scores_only_2 = []
for res in obj:
    start_time = time.time()

    # Each row of res[1] is sorted in descending order, so a feature is
    # "the k-th largest value of the row" rather than a fixed position.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # Split the data into training set and test set (80% / 20%). The labels,
    # and therefore the sample count, are the same in every iteration, so the
    # fixed seed yields the same partition each time and the scores of
    # different iterations are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # The same estimator object is fitted again in every iteration.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 (coincides with accuracy for single-label multiclass)
    f1_scores_only_2.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 2.7884480953216553
Completed iteration in 7.378140687942505
Completed iteration in 64.6985182762146
Completed iteration in 66.50079536437988
Completed iteration in 64.4956910610199
Completed iteration in 91.67194628715515
Completed iteration in 97.7742965221405
Completed iteration in 91.77427816390991
Completed iteration in 97.6442494392395
Completed iteration in 88.96565294265747
Completed iteration in 91.07397770881653
Completed iteration in 91.5280511379242
Completed iteration in 74.62460565567017
Completed iteration in 63.65137267112732
Completed iteration in 63.599340200424194
Completed iteration in 63.8297221660614
Completed iteration in 64.31491041183472
Completed iteration in 64.24934840202332
Completed iteration in 70.38204574584961
Completed iteration in 72.73244547843933


# Heterogeneous 2 (intervals [b]: piecewise fixed-width degree bins)

In [6]:
# Label each node with the 1-based index of the first upper bound that its
# normalised degree does not exceed. The bounds are checked in this order:
#   labels  1-10: 0.005, 0.010, ..., 0.050  (steps of 0.005)
#   labels 11-15: 0.06, 0.07, ..., 0.10     (steps of 0.01)
#   label     16: 0.2
# Anything above the last bound gets label 17.
upper_bounds = (
    [0.005 * step for step in range(1, 11)]
    + [0.05 + 0.01 * step for step in range(1, 6)]
    + [0.2]
)
overflow_label = len(upper_bounds) + 1

node_labels = []
for norm_degree in bet_l:
    for label, bound in enumerate(upper_bounds, start=1):
        if norm_degree <= bound:
            node_labels.append(label)
            break
    else:
        # No bound was large enough.
        node_labels.append(overflow_label)

In [7]:
# Number of nodes per label, most frequent label first.
label_counts = pd.Series(node_labels).value_counts()
label_counts

1     153285
2       9471
3       2516
4       1036
5        530
6        288
7        207
8        133
16       127
9        104
11       104
12        83
10        75
13        47
17        45
14        38
15        25
dtype: int64

### AVPRA all feat

In [8]:
# Concatenate the four "All_feat" trial logs into one sequence.
# Trial 0 is kept whole; every later trial is appended without its first
# element (presumably it repeats the last state of the previous trial --
# TODO confirm).
# SECURITY: unpickling runs arbitrary code stored in the file; only load
# pickles produced by a trusted source.
trial_files = (
    "./All_feat/log_trial_0_LPStates.pickled",
    "./All_feat/log_trial_1_LPStates.pickled",
    "./All_feat/log_trial_2_LPStates.pickled",
    "./All_feat/log_trial_3_LPStates.pickled",
)
merged_states = pd.read_pickle(trial_files[0])
for trial_file in trial_files[1:]:
    merged_states = merged_states + pd.read_pickle(trial_file)[1:]
obj = merged_states

In [9]:
# Keep only the snapshots at the positions listed in `l`.
# NOTE: this overwrites `obj`, so the cell is not idempotent -- running it a
# second time without reloading indexes the already reduced 20-item list with
# positions up to 28 and fails with IndexError.
picked_states = []
for position in l:
    picked_states.append(obj[position])
obj = picked_states

In [10]:
### Random forest classifier creation with 70 trees
# A fixed seed makes the forest and the train/test split reproducible across
# kernel restarts; without it every run reports different F1 values.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)

# One micro-averaged F1 score per entry of `obj`, in the same order.
f1_scores_all_1 = []
for res in obj:
    start_time = time.time()

    # Each row of res[1] is sorted in descending order, so a feature is
    # "the k-th largest value of the row" rather than a fixed position.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # Split the data into training set and test set (80% / 20%). The labels,
    # and therefore the sample count, are the same in every iteration, so the
    # fixed seed yields the same partition each time and the scores of
    # different iterations are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # The same estimator object is fitted again in every iteration.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 (coincides with accuracy for single-label multiclass)
    f1_scores_all_1.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 2.9221246242523193
Completed iteration in 10.6644606590271
Completed iteration in 56.74378800392151
Completed iteration in 52.653939723968506
Completed iteration in 57.65579438209534
Completed iteration in 53.43042469024658
Completed iteration in 56.43837070465088
Completed iteration in 50.85349988937378
Completed iteration in 60.2674765586853
Completed iteration in 57.20895314216614
Completed iteration in 59.12400770187378
Completed iteration in 56.3473482131958
Completed iteration in 60.291446685791016
Completed iteration in 59.01368713378906
Completed iteration in 58.61095714569092
Completed iteration in 63.479398250579834
Completed iteration in 66.34580540657043
Completed iteration in 85.44818353652954
Completed iteration in 177.91800451278687
Completed iteration in 177.24749970436096


### AVPRA only lang

In [11]:
# Reset `obj` to an empty list; the following cell assigns the loaded data.
obj = list()

In [12]:
# Concatenate the four "Only_lang" trial logs into one sequence.
# Trial 0 is kept whole; every later trial is appended without its first
# element (presumably it repeats the last state of the previous trial --
# TODO confirm).
# SECURITY: unpickling runs arbitrary code stored in the file; only load
# pickles produced by a trusted source.
trial_files = (
    "./Only_lang/log_trial_0_LPStates.pickled",
    "./Only_lang/log_trial_1_LPStates.pickled",
    "./Only_lang/log_trial_2_LPStates.pickled",
    "./Only_lang/log_trial_3_LPStates.pickled",
)
merged_states = pd.read_pickle(trial_files[0])
for trial_file in trial_files[1:]:
    merged_states = merged_states + pd.read_pickle(trial_file)[1:]
obj = merged_states

In [13]:
# Keep only the snapshots at the positions listed in `l`.
# NOTE: this overwrites `obj`, so the cell is not idempotent -- running it a
# second time without reloading indexes the already reduced 20-item list with
# positions up to 28 and fails with IndexError.
picked_states = []
for position in l:
    picked_states.append(obj[position])
obj = picked_states

In [14]:
### Random forest classifier creation with 70 trees
# A fixed seed makes the forest and the train/test split reproducible across
# kernel restarts; without it every run reports different F1 values.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)

# One micro-averaged F1 score per entry of `obj`, in the same order.
f1_scores_only_1 = []
for res in obj:
    start_time = time.time()

    # Each row of res[1] is sorted in descending order, so a feature is
    # "the k-th largest value of the row" rather than a fixed position.
    X_data = [sorted(row, reverse=True) for row in res[1]]
    y_data = node_labels

    # Split the data into training set and test set (80% / 20%). The labels,
    # and therefore the sample count, are the same in every iteration, so the
    # fixed seed yields the same partition each time and the scores of
    # different iterations are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # The same estimator object is fitted again in every iteration.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 (coincides with accuracy for single-label multiclass)
    f1_scores_only_1.append(metrics.f1_score(y_test, y_pred, average="micro"))

    print(f"Completed iteration in {time.time() - start_time}")

Completed iteration in 2.346672773361206
Completed iteration in 5.972807884216309
Completed iteration in 44.83026480674744
Completed iteration in 42.62898278236389
Completed iteration in 46.28037476539612
Completed iteration in 43.704256534576416
Completed iteration in 47.120314598083496
Completed iteration in 44.27061653137207
Completed iteration in 46.55402874946594
Completed iteration in 44.44453239440918
Completed iteration in 45.05700993537903
Completed iteration in 45.38632321357727
Completed iteration in 46.87142825126648
Completed iteration in 45.50168442726135
Completed iteration in 45.62984037399292
Completed iteration in 47.036736249923706
Completed iteration in 46.750805139541626
Completed iteration in 46.6671416759491
Completed iteration in 50.35727334022522
Completed iteration in 55.68504071235657


### Results

### Only lang

In [29]:
### Intervals b
# Best micro-F1 of the "only lang" run and the snapshot position (from `l`)
# at which it first occurs.
best_only_1 = max(f1_scores_only_1)
best_only_1, l[f1_scores_only_1.index(best_only_1)]

(0.9915831424917467, 1)

In [30]:
### Intervals a
# Best micro-F1 of the "only lang" run and the snapshot position (from `l`)
# at which it first occurs.
best_only_2 = max(f1_scores_only_2)
best_only_2, l[f1_scores_only_2.index(best_only_2)]

(0.6125866222526247, 1)

### All features

In [31]:
### Intervals b
# Best micro-F1 of the "all features" run and the snapshot position (from `l`)
# at which it first occurs.
best_all_1 = max(f1_scores_all_1)
best_all_1, l[f1_scores_all_1.index(best_all_1)]

(0.9916128840377123, 1)

In [32]:
### Intervals a
# Best micro-F1 of the "all features" run and the snapshot position (from `l`)
# at which it first occurs.
best_all_2 = max(f1_scores_all_2)
best_all_2, l[f1_scores_all_2.index(best_all_2)]

(0.8716057460666805, 1)

### Graphs

In [33]:
import matplotlib

# Switch to the pgf backend so figures are typeset through pdflatex.
# NOTE(review): pgf is a non-interactive backend, so later plt.show() calls
# presumably only emit a warning instead of displaying a figure -- confirm.
matplotlib.use("pgf")

pgf_rc = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
for rc_key, rc_value in pgf_rc.items():
    matplotlib.rcParams[rc_key] = rc_value

In [26]:
### Degree distrib function
# Histogram (200 bins) of the normalised degrees, saved as a PNG.
serie = pd.Series(bet_l)

plt.figure(figsize=(10, 6))
plt.hist(serie, bins=200)
plt.xlim(-0.01, 1)

plt.xlabel("Grado", fontsize=22)
plt.ylabel("Frequenza assoluta", fontsize=22)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

plt.savefig("Distribution_norm_Degree.png", dpi=500)

plt.show()

  plt.show()


In [35]:
# Micro-F1 per snapshot position for the "all features" run, one point series
# per interval scheme.
plt.figure(figsize=(10, 6))
l = [*range(10), *range(10, 30, 2)]

series_specs = (
    (f1_scores_all_2, "F1-score intervalli [a]"),
    (f1_scores_all_1, "F1-score intervalli [b]"),
)
for scores, legend_text in series_specs:
    plt.plot(l, scores, "o", label=legend_text, markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

plt.ylim(0, 1)
plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_allF_Degree.png", dpi=500)
plt.show()

  plt.show()


In [36]:
# Micro-F1 per snapshot position for the "only lang" run, one point series
# per interval scheme.
plt.figure(figsize=(10, 6))
l = [*range(10), *range(10, 30, 2)]

series_specs = (
    (f1_scores_only_2, "F1-score intervalli [a]"),
    (f1_scores_only_1, "F1-score intervalli [b]"),
)
for scores, legend_text in series_specs:
    plt.plot(l, scores, "o", label=legend_text, markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

plt.ylim(0, 1)
plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_onlyL_Degree.png", dpi=500)
plt.show()

  plt.show()


In [34]:
### With o and x
# All four score series in one figure: the "only lang" runs (legend prefix
# AVPRA) as circles, the "all features" runs (legend prefix AVPRA*) as crosses
# with explicit colours.
plt.figure(figsize=(10, 6))
l = [*range(10), *range(10, 30, 2)]

series_specs = (
    (f1_scores_only_2, "o", {"label": "AVPRA F1-score intervalli [a]", "markersize": 10}),
    (f1_scores_only_1, "o", {"label": "AVPRA F1-score intervalli [b]", "markersize": 10}),
    (f1_scores_all_2, "x", {"label": "AVPRA* F1-score intervalli [a]", "color": "blue", "markersize": 12}),
    (f1_scores_all_1, "x", {"label": "AVPRA* F1-score intervalli [b]", "color": "red", "markersize": 12}),
)
for scores, marker, line_style in series_specs:
    plt.plot(l, scores, marker, **line_style)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

plt.ylim(0, 1)
plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_both_Degree.png", dpi=500)
plt.show()

  plt.show()


In [37]:
### With o
# All four score series in one figure, every series drawn with circle markers
# and the default colour cycle.
plt.figure(figsize=(10, 6))
l = [*range(10), *range(10, 30, 2)]

series_specs = (
    (f1_scores_only_2, "AVPRA F1-score intervalli [a]"),
    (f1_scores_only_1, "AVPRA F1-score intervalli [b]"),
    (f1_scores_all_2, "AVPRA* F1-score intervalli [a]"),
    (f1_scores_all_1, "AVPRA* F1-score intervalli [b]"),
)
for scores, legend_text in series_specs:
    plt.plot(l, scores, "o", label=legend_text, markersize=10)

plt.xlabel("Iterazione", fontsize=22)
plt.ylabel("F1-score", fontsize=22)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=16)

plt.ylim(0, 1)
plt.legend(loc="right", prop={'size': 16})

plt.savefig("Micro_comparison_both_O_Degree.png", dpi=500)
plt.show()

  plt.show()
