In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from sklearn.utils.random import sample_without_replacement
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HR_edges.csv", delimiter=",")

In [3]:
### Generate random links
def sample_comb3(dims, nsamp):
    idx = sample_without_replacement(np.prod(dims), nsamp)
    return np.vstack(np.unravel_index(idx, dims)).T

l = sample_comb3((54573, 54573), 100000)

In [4]:
### Check how many of the edges are non unique
links_gen = list(map(lambda x: (x[0], x[1]), l))

non_unique = 0
for i in range(len(links_gen)):
    if (links_gen[i][1], links_gen[i][0]) in links_gen[i:]:      
        non_unique += 1
    if i % 10000 == 0:
        print(f"Processed {i} links")

print(f"Unique links: {len(pd.Series(list(map(lambda x: (x[0], x[1]), links_gen))).unique()) - non_unique}")

Processed 0 links
Processed 10000 links
Processed 20000 links
Processed 30000 links
Processed 40000 links
Processed 50000 links
Processed 60000 links
Processed 70000 links
Processed 80000 links
Processed 90000 links
Unique links: 99996


In [5]:
### Creating final links list
links = list(G.edges())
random.shuffle(links)

links += links_gen

In [6]:
### Check how many of the links are in the graphs and not in the graph
t = 0
f = 0
for x, y in links:
    if G.has_edge(str(x), str(y)): t += 1
    else: f += 1
print(f"In/Not in: {t, f}")

In/Not in: (498234, 99968)


In [7]:
### Creating labels
y_data = []
for x, y in links:
    if G.has_edge(str(x), str(y)): y_data.append(1)
    else: y_data.append(0)

### N2V

In [8]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HR_n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [9]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./HR_n2v/HR_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 134.30253267288208
Classification completed in 124.10565638542175
Classification completed in 128.06311130523682
Classification completed in 107.5228865146637
Classification completed in 129.19291067123413
Classification completed in 121.41244125366211
Classification completed in 103.25939869880676
Classification completed in 87.85652446746826
Classification completed in 99.13149523735046
Classification completed in 92.97686743736267
Classification completed in 117.42691993713379
Classification completed in 116.42022395133972
Classification completed in 134.2146337032318
Classification completed in 121.65085792541504
Classification completed in 122.39996361732483
Classification completed in 109.4820077419281
Classification completed in 102.73042011260986
Classification completed in 97.35949039459229
Classification completed in 90.6203145980835
Classification completed in 96.26015973091125
Classification completed in 111.9254698753357
Classification completed

In [10]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
20,0.949959,228.784479,2.0,1.0,40.0,5.0
12,0.950636,224.094112,1.0,0.5,40.0,5.0
8,0.952274,213.255738,0.5,0.5,40.0,5.0
0,0.953177,181.323636,1.0,1.0,40.0,5.0
4,0.954012,227.120782,0.5,1.0,40.0,5.0
16,0.9554,234.924095,1.0,2.0,40.0,5.0
13,0.96049,346.081429,1.0,0.5,80.0,5.0
21,0.961443,349.057279,2.0,1.0,80.0,5.0
9,0.962438,336.869814,0.5,0.5,80.0,5.0
1,0.962496,289.630887,1.0,1.0,80.0,5.0


In [11]:
pd.Series(f1).describe()

count    24.000000
mean      0.967087
std       0.011819
min       0.949959
25%       0.959218
50%       0.966504
75%       0.973815
max       0.984788
dtype: float64

### DW

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HR_dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./HR_dw/HR_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 83.43693709373474
Classification completed in 87.55218720436096
Classification completed in 81.11609482765198
Classification completed in 81.96799349784851
Classification completed in 82.96507930755615
Classification completed in 85.57638645172119
Classification completed in 114.01271605491638
Classification completed in 95.8836920261383
Classification completed in 85.98043942451477
Classification completed in 82.2931067943573
Classification completed in 91.4009690284729
Classification completed in 88.75083637237549
Classification completed in 96.13046193122864
Classification completed in 93.60912442207336
Classification completed in 97.11507415771484


In [15]:
pd.DataFrame(list(zip(f1, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.936878,38.241227,10.0,5.0
1,0.957966,72.052979,20.0,5.0
4,0.962504,90.918856,10.0,10.0
2,0.965213,142.735676,40.0,5.0
3,0.968247,288.967294,80.0,5.0
5,0.968247,171.306192,20.0,10.0
8,0.968656,188.168018,10.0,20.0
11,0.969459,1482.992684,80.0,20.0
14,0.969617,1403.005183,40.0,30.0
13,0.969943,699.778587,20.0,30.0


In [16]:
pd.Series(f1).describe()

count    15.000000
mean      0.965927
std       0.008790
min       0.936878
25%       0.966730
50%       0.969459
75%       0.970278
max       0.970604
dtype: float64

### MNMF

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HR_mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./HR_mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 100.22495293617249
Classification completed in 87.0292694568634
Classification completed in 78.22877287864685
Classification completed in 82.12731695175171
Classification completed in 101.03593635559082
Classification completed in 101.60131931304932
Classification completed in 85.31836557388306


In [19]:
pd.DataFrame(list(zip(f1, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.883485,1231.952486,8.0,200.0
0,0.884546,423.98246,8.0,100.0
2,0.903787,1873.740525,16.0,100.0
3,0.904757,3089.716942,16.0,200.0
5,0.92495,11782.491975,32.0,200.0
4,0.924967,6102.296241,32.0,100.0
6,0.946181,18979.924733,64.0,100.0


In [20]:
pd.Series(f1).describe()

count    7.000000
mean     0.910382
std      0.022995
min      0.883485
25%      0.894167
50%      0.904757
75%      0.924959
max      0.946181
dtype: float64

### DANMF

In [21]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HR_danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [22]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./HR_danmf/HR_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 87.44727635383606
Classification completed in 95.52147006988525
Classification completed in 104.67900705337524
Classification completed in 103.48978972434998
Classification completed in 98.29066824913025
Classification completed in 87.33339786529541
Classification completed in 93.07516574859619
Classification completed in 87.96330785751343
Classification completed in 109.66120433807373
Classification completed in 112.89096570014954
Classification completed in 110.00688767433167
Classification completed in 108.31985139846802
Classification completed in 103.08507990837097
Classification completed in 100.50685048103333
Classification completed in 110.11692571640015


In [23]:
pd.DataFrame(list(zip(f1, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
2,0.839236,268.152193,"[32, 8]",50.0,100.0
0,0.840456,151.062392,"[32, 8]",50.0,50.0
12,0.840958,560.116846,"[32, 8]",50.0,50.0
3,0.841626,278.286608,"[32, 8]",100.0,100.0
1,0.847318,164.564006,"[32, 8]",100.0,50.0
13,0.882457,1321.520216,"[64, 16]",100.0,50.0
6,0.886343,608.053479,"[64, 16]",50.0,100.0
4,0.888575,336.803176,"[64, 16]",50.0,50.0
5,0.889127,382.492228,"[64, 16]",100.0,50.0
7,0.88972,653.606651,"[64, 16]",100.0,100.0


In [24]:
pd.Series(f1).describe()

count    15.000000
mean      0.877983
std       0.027611
min       0.839236
25%       0.844472
50%       0.888575
75%       0.901409
max       0.911418
dtype: float64

### AVPRA

In [25]:
obj = pd.read_pickle("./HR.pickled")

In [26]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for res in obj:
    data = res[1]
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 39.27347016334534
Classification completed in 111.2534191608429
Classification completed in 85.71443343162537
Classification completed in 60.44496464729309
Classification completed in 57.102601528167725
Classification completed in 46.012123584747314
Classification completed in 43.63479208946228
Classification completed in 43.316386699676514
Classification completed in 44.544116497039795
Classification completed in 38.009697914123535
Classification completed in 39.72118353843689
Classification completed in 29.210760354995728
Classification completed in 29.98549771308899
Classification completed in 24.891069173812866
Classification completed in 28.19650888442993
Classification completed in 24.76248526573181
Classification completed in 27.06900644302368
Classification completed in 23.2750403881073
Classification completed in 29.646493434906006
Classification completed in 31.708303451538086
Classification completed in 30.893842935562134


In [30]:
max(f1), f1.index(max(f1))

(0.9060104813567256, 11)

In [28]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [29]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1, "o", label="AVPRA F1-score", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA.png", dpi=500)
plt.show()

  plt.show()
