In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from sklearn.utils.random import sample_without_replacement
import random
import csv
%matplotlib inline

In [2]:
# Build the graph from the comma-separated edge list. Later cells look nodes up
# as strings (G.has_edge(str(x), str(y))), so node labels are kept as read from the file.
G = nx.read_edgelist("./HU_edges.csv", delimiter=",")

In [3]:
### Generate random links
def sample_comb3(dims, nsamp, random_state=None):
    """Draw `nsamp` distinct index tuples from a grid of shape `dims`.

    Parameters
    ----------
    dims : tuple of int
        Size of each axis of the index grid.
    nsamp : int
        Number of distinct index tuples to draw.
    random_state : int, RandomState instance or None, optional
        Forwarded to `sample_without_replacement`; pass an int to get a
        repeatable draw. The default (None) keeps the original unseeded behaviour.

    Returns
    -------
    numpy.ndarray of shape (nsamp, len(dims))
        One sampled index tuple per row.
    """
    # Compute the population size in 64-bit: with the platform default integer
    # (32-bit on some builds) a product such as 47538 * 47538 would overflow.
    population = int(np.prod(dims, dtype=np.int64))
    idx = sample_without_replacement(population, nsamp, random_state=random_state)
    # Map every flat index back to its coordinates and stack them one tuple per row.
    return np.vstack(np.unravel_index(idx, dims)).T

# Draw 100000 distinct (i, j) index pairs with both indices in [0, 47538);
# 47538 is presumably the number of nodes in G — TODO confirm.
l = sample_comb3((47538, 47538), 100000)

In [4]:
### Check how many of the edges are non unique
# Turn every sampled (i, j) row into a plain tuple so it can be hashed and compared.
links_gen = [(pair[0], pair[1]) for pair in l]

# Last position at which each link occurs. Asking "does the reversed link appear
# at position i or later?" then costs O(1) instead of copying and scanning a slice.
last_seen = {link: pos for pos, link in enumerate(links_gen)}

non_unique = 0
for i, (src, dst) in enumerate(links_gen):
    # Same condition as `(dst, src) in links_gen[i:]`; a self-loop (src == dst)
    # matches itself at position i and is therefore counted as well.
    if last_seen.get((dst, src), -1) >= i:
        non_unique += 1
    if i % 10000 == 0:
        print(f"Processed {i} links")

# Distinct ordered pairs minus those whose reversed twin (or self-loop) was found.
print(f"Unique links: {len(set(links_gen)) - non_unique}")

Processed 0 links
Processed 10000 links
Processed 20000 links
Processed 30000 links
Processed 40000 links
Processed 50000 links
Processed 60000 links
Processed 70000 links
Processed 80000 links
Processed 90000 links
Unique links: 99995


In [5]:
### Creating final links list
# Existing edges of G in random order, followed by the randomly generated pairs.
links = list(G.edges())
random.shuffle(links)
links.extend(links_gen)

In [6]:
### Check how many of the links are in the graphs and not in the graph
# t = links that are edges of G, f = links that are not.
t = sum(1 for x, y in links if G.has_edge(str(x), str(y)))
f = len(links) - t
print(f"In/Not in: {t, f}")

In/Not in: (222904, 99983)


In [7]:
### Creating labels
# 1 when the link is an edge of G, 0 otherwise; same order as `links`.
y_data = [1 if G.has_edge(str(x), str(y)) else 0 for x, y in links]

### N2V

In [8]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []

# (marker, destination list, slice of the line holding the number).
# Every marker is tested against every line, in this order.
n2v_fields = (
    ("Embedding", exec_time, slice(31, -2)),
    ("p:", p, slice(3, None)),
    ("q:", q, slice(3, None)),
    ("walk_length:", walk_len, slice(13, -1)),
    ("num_walk:", walk_num, slice(9, -1)),
)

with open("./HU_n2v/HU_n2v_info.txt", "r") as info_file:
    for line in info_file:
        # Each line mentioning "Test" counts as one test.
        if "Test" in line:
            tests_num += 1
        for marker, values, span in n2v_fields:
            if marker in line:
                values.append(float(line[span]))

In [9]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

# The endpoints of the candidate links are the same for every embedding, so
# turn them into integer row indices once, outside the per-test loop.
link_src = np.fromiter((int(u) for u, v in links), dtype=np.intp, count=len(links))
link_dst = np.fromiter((int(v) for u, v in links), dtype=np.intp, count=len(links))

for i in range(tests_num):
    # Row k of the CSV is used as the embedding vector of node k.
    data = pd.read_csv("./HU_n2v/HU_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values
    start_time = time.time()

    # Input: one feature per link, the cosine similarity of its endpoint embeddings.
    # Norms are computed once per node and all dot products in one vectorised call,
    # instead of rebuilding arrays and norms for every single link.
    norms = np.linalg.norm(data, axis=1)
    dots = np.einsum("ij,ij->i", data[link_src], data[link_dst])
    X_data = (dots / (norms[link_src] * norms[link_dst])).reshape(-1, 1)

    # Split the data into training set and test set
    # NOTE(review): neither the split nor the forest gets a random_state, so scores vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 43.064637184143066
Classification completed in 44.7082953453064
Classification completed in 49.00970792770386
Classification completed in 42.24790000915527
Classification completed in 50.31100559234619
Classification completed in 44.74914073944092
Classification completed in 49.968311071395874
Classification completed in 43.17291522026062
Classification completed in 48.120619773864746
Classification completed in 44.808950901031494
Classification completed in 45.743791818618774
Classification completed in 44.166064739227295
Classification completed in 50.81423497200012
Classification completed in 46.59412455558777
Classification completed in 45.10785150527954
Classification completed in 40.934879779815674
Classification completed in 46.694703340530396
Classification completed in 42.79775881767273
Classification completed in 33.82598066329956
Classification completed in 30.6907856464386
Classification completed in 38.36501169204712
Classification completed in 

In [10]:
# One row per test: F1 score, the time parsed from the info file and the
# node2vec parameters, listed from the lowest to the highest score.
n2v_columns = ['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']
n2v_rows = list(zip(f1, exec_time, p, q, walk_num, walk_len))
pd.DataFrame(n2v_rows, columns=n2v_columns).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
20,0.968658,123.80008,2.0,1.0,40.0,5.0
12,0.969216,126.675527,1.0,0.5,40.0,5.0
8,0.97095,120.73153,0.5,0.5,40.0,5.0
0,0.971322,106.86908,1.0,1.0,40.0,5.0
4,0.974899,125.554072,0.5,1.0,40.0,5.0
16,0.975317,124.044357,1.0,2.0,40.0,5.0
21,0.980024,217.007686,2.0,1.0,80.0,5.0
13,0.980504,214.618617,1.0,0.5,80.0,5.0
9,0.981325,214.085608,0.5,0.5,80.0,5.0
1,0.981325,192.255094,1.0,1.0,80.0,5.0


In [11]:
# Summary statistics of the F1 scores collected by the classification loop above.
pd.Series(f1).describe()

count    24.000000
mean      0.982955
std       0.007977
min       0.968658
25%       0.978847
50%       0.983601
75%       0.988522
max       0.992985
dtype: float64

### DW

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []

# (marker, destination list, slice of the line holding the number).
# Every marker is tested against every line, in this order.
dw_fields = (
    ("walk_length:", walk_len, slice(13, -1)),
    ("num_walk:", walk_num, slice(9, -1)),
    ("Embedding", exec_time, slice(31, -2)),
)

with open("./HU_dw/HU_dw_info.txt", "r") as info_file:
    for line in info_file:
        # Each line mentioning "Test" counts as one test.
        if "Test" in line:
            tests_num += 1
        for marker, values, span in dw_fields:
            if marker in line:
                values.append(float(line[span]))

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

# The endpoints of the candidate links are the same for every embedding, so
# turn them into integer row indices once, outside the per-test loop.
link_src = np.fromiter((int(u) for u, v in links), dtype=np.intp, count=len(links))
link_dst = np.fromiter((int(v) for u, v in links), dtype=np.intp, count=len(links))

for i in range(tests_num):
    # Row k of the CSV is used as the embedding vector of node k.
    data = pd.read_csv("./HU_dw/HU_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values
    start_time = time.time()

    # Input: one feature per link, the cosine similarity of its endpoint embeddings.
    # Norms are computed once per node and all dot products in one vectorised call,
    # instead of rebuilding arrays and norms for every single link.
    norms = np.linalg.norm(data, axis=1)
    dots = np.einsum("ij,ij->i", data[link_src], data[link_dst])
    X_data = (dots / (norms[link_src] * norms[link_dst])).reshape(-1, 1)

    # Split the data into training set and test set
    # NOTE(review): neither the split nor the forest gets a random_state, so scores vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 48.107747077941895
Classification completed in 41.88802218437195
Classification completed in 41.903908491134644
Classification completed in 39.92848777770996
Classification completed in 39.94625210762024
Classification completed in 39.649558782577515
Classification completed in 39.40564775466919
Classification completed in 37.91938281059265
Classification completed in 39.16060757637024
Classification completed in 38.35513758659363
Classification completed in 38.42648005485535
Classification completed in 38.421956300735474
Classification completed in 40.40866708755493
Classification completed in 38.67958903312683
Classification completed in 30.0804762840271


In [14]:
# One row per test: F1 score, the time parsed from the info file and the
# walk parameters, listed from the lowest to the highest score.
dw_columns = ['F1-micro', 'Exec time', 'walk_num', 'walk_len']
dw_rows = list(zip(f1, exec_time, walk_num, walk_len))
pd.DataFrame(dw_rows, columns=dw_columns).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.940785,36.506493,10.0,5.0
1,0.963904,68.210582,20.0,5.0
4,0.970501,87.543224,10.0,10.0
2,0.972622,133.75116,40.0,5.0
5,0.975224,170.240475,20.0,10.0
3,0.97541,266.070136,80.0,5.0
8,0.977562,185.26939,10.0,20.0
9,0.978832,369.003128,20.0,20.0
6,0.978878,334.773742,40.0,10.0
7,0.979033,681.768971,80.0,10.0


In [15]:
# Summary statistics of the F1 scores collected by the classification loop above.
pd.Series(f1).describe()

count    15.000000
mean      0.974138
std       0.010284
min       0.940785
25%       0.973923
50%       0.978832
75%       0.979335
max       0.980442
dtype: float64

### MNMF

In [16]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []

# (marker, destination list, slice of the line holding the number).
# Every marker is tested against every line, in this order.
mnmf_fields = (
    ("dimensions", dim, slice(12, None)),
    ("iterations:", it, slice(12, None)),
    ("Embedding", exec_time, slice(31, -2)),
)

with open("./HU_mnmf/HU_mnmf_info.txt", "r") as info_file:
    for line in info_file:
        # Each line mentioning "Test" counts as one test.
        if "Test" in line:
            tests_num += 1
        for marker, values, span in mnmf_fields:
            if marker in line:
                values.append(float(line[span]))

In [17]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

# The endpoints of the candidate links are the same for every embedding, so
# turn them into integer row indices once, outside the per-test loop.
link_src = np.fromiter((int(u) for u, v in links), dtype=np.intp, count=len(links))
link_dst = np.fromiter((int(v) for u, v in links), dtype=np.intp, count=len(links))

for i in range(tests_num):
    # Row k of the CSV is used as the embedding vector of node k.
    data = pd.read_csv("./HU_mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values
    start_time = time.time()

    # Input: one feature per link, the cosine similarity of its endpoint embeddings.
    # Norms are computed once per node and all dot products in one vectorised call,
    # instead of rebuilding arrays and norms for every single link.
    norms = np.linalg.norm(data, axis=1)
    dots = np.einsum("ij,ij->i", data[link_src], data[link_dst])
    X_data = (dots / (norms[link_src] * norms[link_dst])).reshape(-1, 1)

    # Split the data into training set and test set
    # NOTE(review): neither the split nor the forest gets a random_state, so scores vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 41.36748504638672
Classification completed in 43.10439372062683
Classification completed in 39.19413781166077
Classification completed in 39.298360109329224
Classification completed in 38.719468116760254
Classification completed in 38.5053014755249
Classification completed in 49.99622654914856
Classification completed in 55.79487442970276


In [18]:
# One row per test: F1 score, the time parsed from the info file and the
# dimension/iteration settings, listed from the lowest to the highest score.
mnmf_columns = ['F1-micro', 'Exec time', 'Dimensions', 'Iterations']
mnmf_rows = list(zip(f1, exec_time, dim, it))
pd.DataFrame(mnmf_rows, columns=mnmf_columns).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.838412,95.28386,8.0,100.0
1,0.843755,285.491338,8.0,200.0
2,0.876785,421.509037,16.0,100.0
3,0.87824,701.013073,16.0,200.0
4,0.908374,1211.796356,32.0,100.0
5,0.911735,2139.761206,32.0,200.0
6,0.941342,3373.030116,64.0,100.0
7,0.941575,5875.845202,64.0,200.0


In [19]:
# Summary statistics of the F1 scores collected by the classification loop above.
pd.Series(f1).describe()

count    8.000000
mean     0.892527
std      0.039942
min      0.838412
25%      0.868527
50%      0.893307
75%      0.919137
max      0.941575
dtype: float64

### DANMF

In [20]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HU_danmf/HU_danmf_info.txt", "r") as info_file:
    for line in info_file:
        # Each line mentioning "Test" counts as one test.
        if "Test" in line:
            tests_num += 1
        # The layer description is kept as text, not converted to a number.
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        elif "pre_iterations" not in line and "iterations:" in line:
            # Plain "iterations:" lines only; "pre_iterations" lines also contain that text.
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [21]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

# The endpoints of the candidate links are the same for every embedding, so
# turn them into integer row indices once, outside the per-test loop.
link_src = np.fromiter((int(u) for u, v in links), dtype=np.intp, count=len(links))
link_dst = np.fromiter((int(v) for u, v in links), dtype=np.intp, count=len(links))

for i in range(tests_num):
    # Row k of the CSV is used as the embedding vector of node k.
    data = pd.read_csv("./HU_danmf/HU_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values
    start_time = time.time()

    # Input: one feature per link, the cosine similarity of its endpoint embeddings.
    # Norms are computed once per node and all dot products in one vectorised call,
    # instead of rebuilding arrays and norms for every single link.
    norms = np.linalg.norm(data, axis=1)
    dots = np.einsum("ij,ij->i", data[link_src], data[link_dst])
    X_data = (dots / (norms[link_src] * norms[link_dst])).reshape(-1, 1)

    # Split the data into training set and test set
    # NOTE(review): neither the split nor the forest gets a random_state, so scores vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 70.884281873703
Classification completed in 61.86175346374512
Classification completed in 62.15903162956238
Classification completed in 65.96036314964294
Classification completed in 59.36114692687988
Classification completed in 57.82621169090271
Classification completed in 59.52793574333191
Classification completed in 57.555994749069214
Classification completed in 57.66989493370056
Classification completed in 55.99737286567688
Classification completed in 56.6594135761261
Classification completed in 56.372917890548706
Classification completed in 60.02294206619263
Classification completed in 56.549235343933105
Classification completed in 56.5062997341156


In [23]:
# One row per test: F1 score, the time parsed from the info file and the
# layer/iteration settings, listed from the lowest to the highest score.
# The header previously read 'Pre-terations' (typo); it now reads 'Pre-iterations'.
danmf_columns = ['F1-micro', 'Exec time', 'Layers', 'Pre-iterations', 'Iterations']
danmf_rows = list(zip(f1, exec_time, lay, pre_it, it))
pd.DataFrame(danmf_rows, columns=danmf_columns).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
2,0.678281,108.151318,"[32, 8]",50.0,100.0
0,0.691629,59.408877,"[32, 8]",50.0,50.0
3,0.693239,115.569006,"[32, 8]",100.0,100.0
12,0.724999,216.932015,"[32, 8]",50.0,50.0
1,0.732076,71.049077,"[32, 8]",100.0,50.0
6,0.792437,220.051627,"[64, 16]",50.0,100.0
4,0.793087,128.863256,"[64, 16]",50.0,50.0
7,0.798987,254.67847,"[64, 16]",100.0,100.0
13,0.801604,516.619871,"[64, 16]",100.0,50.0
5,0.805491,174.576857,"[64, 16]",100.0,50.0


In [24]:
# Summary statistics of the F1 scores collected by the classification loop above.
pd.Series(f1).describe()

count    15.000000
mean      0.785481
std       0.065379
min       0.678281
25%       0.728538
50%       0.798987
75%       0.850746
max       0.858125
dtype: float64

### AVPRA

In [25]:
# Load the stored AVPRA results; the loop below iterates over `obj` and reads
# element [1] of every entry as the per-node vectors.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load this pickle if it comes from a trusted source.
obj = pd.read_pickle("./HU.pickled")

In [26]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for res in obj:
    # Element 1 of each result holds the vectors, indexed by integer node id.
    data = res[1]
    start_time = time.time()

    # Input: one single-feature row per link, the cosine similarity of its endpoints
    X_data = []
    for src, dst in links:
        vec_src = data[int(src)]
        vec_dst = data[int(dst)]
        cosine = np.dot(vec_src, vec_dst)/(np.linalg.norm(vec_src)*np.linalg.norm(vec_dst))
        X_data.append([cosine])

    # Hold out 20% of the links for testing, train on the remainder
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    score = metrics.f1_score(y_test, y_pred, average="micro")
    f1.append(score)
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 23.033822059631348
Classification completed in 74.72668647766113
Classification completed in 66.11690974235535
Classification completed in 57.63560390472412
Classification completed in 50.45566391944885
Classification completed in 45.19944357872009
Classification completed in 40.03955292701721
Classification completed in 34.90533113479614
Classification completed in 32.69029474258423
Classification completed in 30.815114498138428
Classification completed in 30.482471704483032
Classification completed in 24.39063549041748
Classification completed in 21.442750692367554
Classification completed in 20.396126747131348
Classification completed in 18.053455352783203
Classification completed in 18.693721532821655
Classification completed in 19.638908863067627
Classification completed in 18.215312480926514
Classification completed in 17.821442127227783
Classification completed in 17.328871250152588
Classification completed in 17.74183487892151


In [27]:
# Highest F1 score among the AVPRA results evaluated above.
max(f1)

0.8358728978909226

In [28]:
import matplotlib

# Settings for the pgf backend: pdflatex as TeX system, serif font family,
# LaTeX text rendering, and no font setup taken from the rc parameters.
pgf_settings = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}

# Switch the backend first, then apply the settings (same order as before).
matplotlib.use("pgf")
matplotlib.rcParams.update(pgf_settings)

In [29]:
# Plot accuracy graph
fig, ax = plt.subplots(figsize=(10, 6))

# X positions of the F1 scores: 0..9 in steps of one, then 10..30 in steps of two
# (presumably the AVPRA iteration of each entry in `obj` — TODO confirm).
# Stored as `iterations` instead of `l`, so the sampled link array `l` created
# earlier in the notebook is no longer overwritten by this cell.
iterations = list(range(0, 10)) + list(range(10, 32, 2))
ax.plot(iterations, f1, "o", label="AVPRA F1-score", markersize=10)

# Dashed vertical marker at x = 14, labelled "Diametro" in the legend.
ax.axvline(x=14, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("F1-Score", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_HU_AVPRA.png", dpi=500)
# NOTE(review): while the non-GUI "pgf" backend selected in the previous cell is
# active, plt.show() cannot display the figure and only emits a warning.
plt.show()

  plt.show()
