In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from sklearn.utils.random import sample_without_replacement
import random
import csv
%matplotlib inline

In [3]:
# Load the graph from a comma-separated edge list.
# Later cells query it with G.has_edge(str(x), str(y)), i.e. node labels are handled as strings.
G = nx.read_edgelist("./RO_edges.csv", delimiter=",")

In [5]:
### Generate random links
def sample_comb3(dims, nsamp, random_state=None):
    """Draw `nsamp` distinct index tuples from a grid of shape `dims`.

    Flat positions are sampled without replacement from
    ``range(prod(dims))`` and then unravelled into one coordinate per
    dimension, so no coordinate tuple is returned twice.

    Parameters
    ----------
    dims : tuple of int
        Size of each dimension of the grid.
    nsamp : int
        Number of tuples to draw.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``sample_without_replacement``; pass an int to make the
        sample reproducible.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(nsamp, len(dims))``, one sampled tuple per row.
    """
    # Force a 64-bit product: with the platform default integer the product
    # can silently overflow for large grids where that default is 32 bits.
    n_population = int(np.prod(dims, dtype=np.int64))
    idx = sample_without_replacement(n_population, nsamp, random_state=random_state)
    return np.vstack(np.unravel_index(idx, dims)).T

l = sample_comb3((41773, 41773), 100000)

In [8]:
### Check how many of the edges are non unique
# Turn every sampled row into a plain (source, target) tuple.
links_gen = [(x[0], x[1]) for x in l]

# Last position at which each directed pair occurs. With it, the question
# "does the reversed pair appear at or after position i?" becomes a dictionary
# lookup instead of a scan over a freshly copied list slice, which made this
# cell quadratic in the number of sampled links.
last_seen = {pair: idx for idx, pair in enumerate(links_gen)}

non_unique = 0
for i, (src, dst) in enumerate(links_gen):
    # Same criterion as scanning links_gen[i:]: position i itself is included,
    # so a self-loop (a, a) counts as its own reverse.
    if last_seen.get((dst, src), -1) >= i:
        non_unique += 1
    if i % 10000 == 0:
        print(f"Processed {i} links")

print(f"Unique links: {len(set(links_gen)) - non_unique}")

Processed 0 links
Processed 10000 links
Processed 20000 links
Processed 30000 links
Processed 40000 links
Processed 50000 links
Processed 60000 links
Processed 70000 links
Processed 80000 links
Processed 90000 links
Unique links: 99997


In [9]:
### Creating final links list
# Edges of G in random order first, followed by the randomly generated pairs.
links = list(G.edges())
random.shuffle(links)

links.extend(links_gen)

In [10]:
### Check how many of the links are in the graphs and not in the graph
# t counts links that are edges of G, f counts the ones that are not.
t = f = 0
for x, y in links:
    if not G.has_edge(str(x), str(y)):
        f += 1
    else:
        t += 1
print(f"In/Not in: {t, f}")

In/Not in: (125846, 99980)


In [15]:
### Creating labels
# One label per entry of `links`: 1 when the pair is an edge of G, 0 otherwise.
y_data = []
for x, y in links:
    y_data.append(1 if G.has_edge(str(x), str(y)) else 0)

### N2V

In [24]:
### Getting total tests number
# Scan the N2V info file: count the lines containing "Test" and, for every
# line containing one of the markers below, store the float found at the
# marker's fixed character offsets.
exec_time, p, q = [], [], []
walk_len, walk_num = [], []
tests_num = 0

# (marker, destination list, slice of the line holding the number)
parsers = (
    ("Embedding", exec_time, slice(31, -2)),
    ("p:", p, slice(3, None)),
    ("q:", q, slice(3, None)),
    ("walk_length:", walk_len, slice(13, -1)),
    ("num_walk:", walk_num, slice(9, -1)),
)

with open("./RO_n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    # The markers are checked independently, so one line may feed several lists.
    for marker, target, span in parsers:
        if marker in line:
            target.append(float(line[span]))

In [28]:
### Random forest classifier creation with 70 trees
# Fixed seed: makes the forest and the train/test split reproducible and scores
# every embedding file on the same split.
SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)

f1 = []

# Embedding row of each link endpoint. These do not depend on the embedding
# file, so they are converted once instead of once per test.
src_rows = np.array([int(x) for x, _ in links], dtype=np.intp)
dst_rows = np.array([int(y) for _, y in links], dtype=np.intp)

for i in range(tests_num):
    data = pd.read_csv("./RO_n2v/RO_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.astype(float)
    start_time = time.time()

    # Input: a single feature per link, the cosine similarity between the
    # embedding rows of its two endpoints, computed for all links at once.
    norms = np.linalg.norm(data, axis=1)
    dots = np.sum(data[src_rows] * data[dst_rows], axis=1)
    X_data = (dots / (norms[src_rows] * norms[dst_rows])).reshape(-1, 1)

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 24.853151082992554
Classification completed in 23.581544160842896
Classification completed in 23.431453704833984
Classification completed in 20.31038546562195
Classification completed in 22.527344226837158
Classification completed in 23.117784023284912
Classification completed in 21.572678327560425
Classification completed in 20.0987651348114
Classification completed in 23.90615940093994
Classification completed in 23.96958899497986
Classification completed in 21.865646600723267
Classification completed in 19.62451720237732
Classification completed in 22.897985696792603
Classification completed in 22.65594244003296
Classification completed in 22.4233136177063
Classification completed in 20.202679872512817
Classification completed in 20.87794780731201
Classification completed in 19.301513195037842
Classification completed in 19.09657907485962
Classification completed in 17.996344327926636
Classification completed in 21.161555290222168
Classification completed

In [29]:
# One row per test (F1 score next to the parameters parsed from the info file),
# sorted by ascending F1.
pd.DataFrame(
    data=list(zip(f1, exec_time, p, q, walk_num, walk_len)),
    columns=["F1-micro", "Exec time", "p", "q", "walk_num", "walk_len"],
).sort_values("F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
20,0.981446,97.455682,2.0,1.0,40.0,5.0
12,0.982155,90.638427,1.0,0.5,40.0,5.0
0,0.983107,76.404218,1.0,1.0,40.0,5.0
8,0.983195,86.482981,0.5,0.5,40.0,5.0
16,0.98583,89.106015,1.0,2.0,40.0,5.0
13,0.98583,159.525569,1.0,0.5,80.0,5.0
4,0.986051,90.649346,0.5,1.0,40.0,5.0
21,0.986694,169.450061,2.0,1.0,80.0,5.0
9,0.987048,152.856183,0.5,0.5,80.0,5.0
1,0.987336,131.534873,1.0,1.0,80.0,5.0


In [30]:
# Summary statistics of the F1 scores currently stored in `f1`.
pd.Series(f1).describe()

count    24.000000
mean      0.988606
std       0.004002
min       0.981446
25%       0.985996
50%       0.988697
75%       0.990911
max       0.994886
dtype: float64

### DW

In [33]:
### Getting total tests number
# Scan the DW info file: count the lines containing "Test" and, for every line
# containing one of the markers below, store the float found at the marker's
# fixed character offsets.
exec_time, walk_num, walk_len = [], [], []
tests_num = 0

# (marker, destination list, slice of the line holding the number)
parsers = (
    ("walk_length:", walk_len, slice(13, -1)),
    ("num_walk:", walk_num, slice(9, -1)),
    ("Embedding", exec_time, slice(31, -2)),
)

with open("./RO_dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    # The markers are checked independently, so one line may feed several lists.
    for marker, target, span in parsers:
        if marker in line:
            target.append(float(line[span]))

In [34]:
### Random forest classifier creation with 70 trees
# Fixed seed: makes the forest and the train/test split reproducible and scores
# every embedding file on the same split.
SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)

f1 = []

# Embedding row of each link endpoint. These do not depend on the embedding
# file, so they are converted once instead of once per test.
src_rows = np.array([int(x) for x, _ in links], dtype=np.intp)
dst_rows = np.array([int(y) for _, y in links], dtype=np.intp)

for i in range(tests_num):
    data = pd.read_csv("./RO_dw/RO_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.astype(float)
    start_time = time.time()

    # Input: a single feature per link, the cosine similarity between the
    # embedding rows of its two endpoints, computed for all links at once.
    norms = np.linalg.norm(data, axis=1)
    dots = np.sum(data[src_rows] * data[dst_rows], axis=1)
    X_data = (dots / (norms[src_rows] * norms[dst_rows])).reshape(-1, 1)

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 19.954283714294434
Classification completed in 21.695998907089233
Classification completed in 20.651201009750366
Classification completed in 19.27675700187683
Classification completed in 19.079931259155273
Classification completed in 25.020188808441162
Classification completed in 24.55047917366028
Classification completed in 22.257368087768555
Classification completed in 21.35203766822815
Classification completed in 21.239486694335938
Classification completed in 21.933610916137695
Classification completed in 20.109210968017578
Classification completed in 19.697892665863037
Classification completed in 19.77295160293579
Classification completed in 20.387173414230347


In [35]:
# One row per test (F1 score next to the parameters parsed from the info file),
# sorted by ascending F1.
pd.DataFrame(
    data=list(zip(f1, exec_time, walk_num, walk_len)),
    columns=["F1-micro", "Exec time", "walk_num", "walk_len"],
).sort_values("F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.968826,25.814546,10.0,5.0
1,0.981003,49.297842,20.0,5.0
4,0.984236,59.892339,10.0,10.0
2,0.984391,94.867974,40.0,5.0
5,0.986118,119.407379,20.0,10.0
3,0.986716,178.922319,80.0,5.0
8,0.986915,130.357413,10.0,20.0
6,0.987092,232.0977,40.0,10.0
12,0.987136,201.832897,10.0,30.0
7,0.988177,446.217201,80.0,10.0


In [36]:
# Summary statistics of the F1 scores currently stored in `f1`.
pd.Series(f1).describe()

count    15.000000
mean      0.985615
std       0.005137
min       0.968826
25%       0.985254
50%       0.987092
75%       0.988465
max       0.989284
dtype: float64

### MNMF

In [37]:
### Getting total tests number
# Scan the MNMF info file: count the lines containing "Test" and, for every
# line containing one of the markers below, store the float found at the
# marker's fixed character offsets.
exec_time, dim, it = [], [], []
tests_num = 0

# (marker, destination list, slice of the line holding the number)
parsers = (
    ("dimensions", dim, slice(12, None)),
    ("iterations:", it, slice(12, None)),
    ("Embedding", exec_time, slice(31, -2)),
)

with open("./RO_mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    # The markers are checked independently, so one line may feed several lists.
    for marker, target, span in parsers:
        if marker in line:
            target.append(float(line[span]))

In [38]:
### Random forest classifier creation with 70 trees
# Fixed seed: makes the forest and the train/test split reproducible and scores
# every embedding file on the same split.
SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)

f1 = []

# Embedding row of each link endpoint. These do not depend on the embedding
# file, so they are converted once instead of once per test.
src_rows = np.array([int(x) for x, _ in links], dtype=np.intp)
dst_rows = np.array([int(y) for _, y in links], dtype=np.intp)

for i in range(tests_num):
    data = pd.read_csv("./RO_mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.astype(float)
    start_time = time.time()

    # Input: a single feature per link, the cosine similarity between the
    # embedding rows of its two endpoints, computed for all links at once.
    norms = np.linalg.norm(data, axis=1)
    dots = np.sum(data[src_rows] * data[dst_rows], axis=1)
    X_data = (dots / (norms[src_rows] * norms[dst_rows])).reshape(-1, 1)

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 29.608680248260498
Classification completed in 33.62402534484863
Classification completed in 30.55707836151123
Classification completed in 27.772759199142456
Classification completed in 27.636369466781616
Classification completed in 27.939178228378296
Classification completed in 28.57694721221924
Classification completed in 27.786049365997314


In [39]:
# One row per test (F1 score next to the parameters parsed from the info file),
# sorted by ascending F1.
pd.DataFrame(
    data=list(zip(f1, exec_time, dim, it)),
    columns=["F1-micro", "Exec time", "Dimensions", "Iterations"],
).sort_values("F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.835651,47.664274,8.0,100.0
1,0.836758,144.387542,8.0,200.0
3,0.886087,336.308679,16.0,200.0
2,0.886973,207.895045,16.0,100.0
4,0.92036,487.69828,32.0,100.0
5,0.92087,772.527251,32.0,200.0
6,0.949586,1141.203295,64.0,100.0
7,0.951069,1883.460323,64.0,200.0


In [40]:
# Summary statistics of the F1 scores currently stored in `f1`.
pd.Series(f1).describe()

count    8.000000
mean     0.898419
std      0.045357
min      0.835651
25%      0.873755
50%      0.903666
75%      0.928049
max      0.951069
dtype: float64

### DANMF

In [41]:
### Getting total tests number
# Scan the DANMF info file: count the lines containing "Test" and pull the
# layers text plus the pre-iteration, iteration and "Embedding" numbers out of
# the matching lines, using fixed character offsets.
lay, pre_it, it = [], [], []
exec_time = []
tests_num = 0

with open("./RO_danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    if "layers:" in line:
        # Kept as text (not converted to a number), minus the trailing character.
        lay.append(line[8:-1])
    if "pre_iterations:" in line:
        pre_it.append(float(line[16:-1]))
    # "iterations:" is a substring of "pre_iterations:", so lines mentioning
    # pre_iterations are excluded from the plain iterations list.
    if not ("pre_iterations" in line or "iterations:" not in line):
        it.append(float(line[11:-1]))
    if "Embedding" in line:
        exec_time.append(float(line[31:-2]))

In [42]:
### Random forest classifier creation with 70 trees
# Fixed seed: makes the forest and the train/test split reproducible and scores
# every embedding file on the same split.
SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)

f1 = []

# Embedding row of each link endpoint. These do not depend on the embedding
# file, so they are converted once instead of once per test.
src_rows = np.array([int(x) for x, _ in links], dtype=np.intp)
dst_rows = np.array([int(y) for _, y in links], dtype=np.intp)

for i in range(tests_num):
    data = pd.read_csv("./RO_danmf/RO_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.astype(float)
    start_time = time.time()

    # Input: a single feature per link, the cosine similarity between the
    # embedding rows of its two endpoints, computed for all links at once.
    norms = np.linalg.norm(data, axis=1)
    dots = np.sum(data[src_rows] * data[dst_rows], axis=1)
    X_data = (dots / (norms[src_rows] * norms[dst_rows])).reshape(-1, 1)

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 41.84124565124512
Classification completed in 39.10538291931152
Classification completed in 37.55946636199951
Classification completed in 36.43886041641235
Classification completed in 34.446943283081055
Classification completed in 36.185367584228516
Classification completed in 36.187437295913696
Classification completed in 37.186312437057495
Classification completed in 34.514596462249756
Classification completed in 34.83430600166321
Classification completed in 32.81893730163574
Classification completed in 34.90584182739258
Classification completed in 37.02150082588196
Classification completed in 33.24601435661316
Classification completed in 33.181846618652344


In [44]:
# One row per test (F1 score next to the parameters parsed from the info file),
# sorted by ascending F1. The column label previously read 'Pre-terations'.
pd.DataFrame(list(zip(f1, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-iterations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
12,0.728225,136.182096,"[32, 8]",200.0,200.0
1,0.731568,50.047976,"[32, 8]",100.0,50.0
3,0.731767,77.28074,"[32, 8]",100.0,100.0
2,0.740513,68.263189,"[32, 8]",50.0,100.0
0,0.746336,39.073625,"[32, 8]",50.0,50.0
6,0.787185,139.50002,"[64, 16]",50.0,100.0
7,0.796174,169.919596,"[64, 16]",100.0,100.0
4,0.797325,81.475821,"[64, 16]",50.0,50.0
13,0.801266,313.978146,"[64, 16]",200.0,200.0
5,0.802307,116.574285,"[64, 16]",100.0,50.0


In [13]:
# Summary statistics of the F1 scores currently stored in `f1`.
pd.Series(f1).describe()

count    15.000000
mean      0.801681
std       0.058171
min       0.728225
25%       0.743424
50%       0.797325
75%       0.870113
max       0.876323
dtype: float64

### AVPRA

In [47]:
# Load the pickled results; the evaluation cell iterates over `obj` and reads element [1] of each entry.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
obj = pd.read_pickle("./RO.pickled")

In [48]:
### Random forest classifier creation with 70 trees
# Fixed seed: makes the forest and the train/test split reproducible and scores
# every result on the same split.
SEED = 42
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)

f1 = []

# Integer keys of both endpoints of every link. They are the same for every
# entry of `obj`, so they are converted once instead of once per entry.
link_rows = [(int(x), int(y)) for x, y in links]

for res in obj:
    data = res[1]
    start_time = time.time()

    # Input: a single feature per link, the cosine similarity between the
    # vectors stored for its two endpoints. The per-link lookup is kept because
    # only data[key] access is known to work on this container.
    X_data = []
    for src, dst in link_rows:
        u = data[src]
        v = data[dst]
        X_data.append([np.dot(u, v)/(np.linalg.norm(u)*np.linalg.norm(v))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 18.646276235580444
Classification completed in 47.166786193847656
Classification completed in 40.25831365585327
Classification completed in 38.20852756500244
Classification completed in 35.386948585510254
Classification completed in 33.04598832130432
Classification completed in 33.31483769416809
Classification completed in 31.053358554840088
Classification completed in 29.61680293083191
Classification completed in 30.088135480880737
Classification completed in 26.877888441085815
Classification completed in 26.09949040412903
Classification completed in 23.14518976211548
Classification completed in 22.786272048950195
Classification completed in 20.70828366279602
Classification completed in 19.146453857421875
Classification completed in 18.19884729385376
Classification completed in 17.908112049102783
Classification completed in 16.0206081867218
Classification completed in 17.210569620132446
Classification completed in 14.503860473632812


In [49]:
# Highest F1 score among the values currently stored in `f1`.
max(f1)

0.869193641234557

In [50]:
import matplotlib

# Switch matplotlib to the pgf backend, then set the LaTeX-related options one by one.
matplotlib.use("pgf")
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"
matplotlib.rcParams["font.family"] = "serif"
matplotlib.rcParams["text.usetex"] = True
matplotlib.rcParams["pgf.rcfonts"] = False

In [51]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
# X positions of the points: 0..9 in steps of one, then 10..30 in steps of two.
# Stored under its own name so the sampled-links array `l` created earlier in
# the notebook is not overwritten.
iterations = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(iterations, f1, "o", label="AVPRA F1-score", markersize=10)

# Vertical reference line at x=18, labelled "Diametro" in the legend.
plt.axvline(x=18, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

# The figure is only written to disk: the pgf backend selected earlier is
# non-GUI, so plt.show() displayed nothing and just emitted a warning.
plt.savefig("F1_RO_AVPRA.png", dpi=500)

  plt.show()
