In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
from itertools import permutations
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)
H = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
### Delete nodes without links
nodes = []
for (x, y) in G.edges():
    if(x == y):
        if(H.has_node(x)):
            nodes.append(x)
            H.remove_node(x)
G = H

In [5]:
len(list(H.nodes()))

765

In [6]:
### Generating random links
perm = permutations(G.nodes(), 2)
l = list(perm)
random.shuffle(l)
l = l[:55000]

In [7]:
### Checking how many of the links are already in the graph
t = 0
f = 0
for x, y in l:
    if G.has_edge(str(x), str(y)): t += 1
    else: f += 1
t, f

(4766, 50234)

In [8]:
### Creating the final list of links
l2 = list(G.edges())
random.shuffle(l2)
l3 = l2
z = list(map(lambda x: tuple(x), l))
for x, y in z:
    l3.append((str(x), str(y)))
l3 = list(pd.Series(l3).unique())

In [9]:
### Checking the proportion of links in the graph and not in the graph
t = 0
f = 0
for x, y in l3:
    if G.has_edge(str(x), str(y)): t += 1
    else: f += 1
print(f"Edges in: {t}, not in: {f}")

Edges in: 50294, not in: 50234


In [10]:
### Creating labels for the classification
y_data = []
for x, y in l3:
    if G.has_edge(str(x), str(y)): y_data.append(1)
    else: y_data.append(0)

# Using cosine similarity

### N2V

In [11]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_len:" in line:
            walk_len.append(float(line[10:-1]))
        if "num_walks:" in line:
            walk_num.append(float(line[10:-1]))

In [12]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()

    start_time2 = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        v1 = data[int(x) - 1]
        v2 = data[int(y) - 1]
        X_data.append([np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    end_time2 = time.time()
    print(f"Classification completed in {end_time2 - start_time2}")

Classification completed in 13.423301219940186
Classification completed in 13.510839700698853
Classification completed in 13.970709800720215
Classification completed in 14.558967351913452
Classification completed in 13.61346983909607
Classification completed in 13.658937931060791
Classification completed in 13.713175535202026
Classification completed in 14.086867094039917
Classification completed in 13.551257371902466
Classification completed in 13.576369285583496
Classification completed in 14.107378721237183
Classification completed in 13.644821882247925
Classification completed in 13.748415231704712
Classification completed in 14.161105155944824
Classification completed in 13.464900016784668
Classification completed in 13.899577617645264
Classification completed in 13.825636863708496
Classification completed in 13.499151945114136
Classification completed in 15.161000967025757
Classification completed in 13.53865385055542
Classification completed in 13.295662879943848
Classification 

In [13]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
23,0.710186,14.734928,2.0,1.0,80.0,10.0
19,0.715309,16.335857,1.0,2.0,80.0,10.0
22,0.719039,11.564198,2.0,1.0,40.0,10.0
1,0.719338,7.993166,1.0,1.0,80.0,5.0
15,0.719934,16.266213,1.0,0.5,80.0,10.0
11,0.719934,17.349393,0.5,0.5,80.0,10.0
7,0.720631,13.87576,0.5,1.0,80.0,10.0
9,0.720879,11.893926,0.5,0.5,80.0,5.0
3,0.720929,10.022111,1.0,1.0,80.0,10.0
13,0.721824,13.568689,1.0,0.5,80.0,5.0


In [14]:
pd.Series(f1).describe()

count    24.000000
mean      0.725492
std       0.008097
min       0.710186
25%       0.720457
50%       0.723665
75%       0.729247
max       0.741769
dtype: float64

### DW

In [15]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_len:" in line:
            walk_len.append(float(line[10:-1]))
        if "num_walks:" in line:
            walk_num.append(float(line[10:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [16]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()

    start_time2 = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        v1 = data[int(x) - 1]
        v2 = data[int(y) - 1]
        X_data.append([np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    end_time2 = time.time()
    print(f"Classification completed in {end_time2 - start_time2}")

Classification completed in 12.285937786102295
Classification completed in 11.980792760848999
Classification completed in 11.911150217056274
Classification completed in 11.753988265991211
Classification completed in 12.649070024490356
Classification completed in 12.155914545059204
Classification completed in 11.91851258277893
Classification completed in 11.33264422416687
Classification completed in 11.915221929550171
Classification completed in 12.573204040527344
Classification completed in 11.193545579910278
Classification completed in 10.882594108581543
Classification completed in 12.241767406463623
Classification completed in 11.821019411087036
Classification completed in 11.393658638000488


In [17]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
0,0.721327,0.402893,1.0,1.0,10.0,5.0
12,0.723316,2.839129,1.0,0.5,10.0,30.0
4,0.73013,1.075852,0.5,1.0,10.0,10.0
9,0.734308,3.54515,0.5,0.5,20.0,20.0
3,0.735949,2.473622,1.0,1.0,80.0,5.0
8,0.736248,1.912276,0.5,0.5,10.0,20.0
5,0.736745,1.798483,0.5,1.0,20.0,10.0
2,0.737839,1.407827,1.0,1.0,40.0,5.0
1,0.738138,0.685927,1.0,1.0,20.0,5.0
6,0.738685,3.092509,0.5,1.0,40.0,10.0


In [18]:
pd.Series(f1).describe()

count    15.000000
mean      0.739431
std       0.010507
min       0.721327
25%       0.735129
50%       0.737839
75%       0.748160
max       0.756093
dtype: float64

### MNMF

In [19]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [20]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()

    start_time2 = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        v1 = data[int(x) - 1]
        v2 = data[int(y) - 1]
        X_data.append([np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    end_time2 = time.time()
    print(f"Classification completed in {end_time2 - start_time2}")

Classification completed in 8.775446653366089
Classification completed in 8.54050064086914
Classification completed in 9.66746973991394
Classification completed in 9.588709354400635
Classification completed in 11.42306900024414
Classification completed in 10.733274459838867
Classification completed in 12.800683736801147
Classification completed in 11.84658145904541
Classification completed in 15.036087989807129
Classification completed in 11.666430234909058


In [21]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
6,0.740774,13.537538,0.5,1.0,40.0,10.0
8,0.747886,24.153916,0.5,0.5,10.0,20.0
4,0.74898,7.710691,0.5,1.0,10.0,10.0
7,0.750075,39.003183,0.5,1.0,80.0,10.0
5,0.752412,18.921276,0.5,1.0,20.0,10.0
2,0.752611,1.368036,1.0,1.0,40.0,5.0
0,0.762558,0.995836,1.0,1.0,10.0,5.0
3,0.763951,2.778379,1.0,1.0,80.0,5.0
9,0.7642,55.908975,0.5,0.5,20.0,20.0
1,0.765045,1.749397,1.0,1.0,20.0,5.0


In [22]:
pd.Series(f1).describe()

count    10.000000
mean      0.754849
std       0.008488
min       0.740774
25%       0.749254
50%       0.752512
75%       0.763603
max       0.765045
dtype: float64

### AVPRA

In [23]:
obj = pd.read_pickle("./AVPRA_pred.pickled")

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for res in obj[1:]:
    data = res[1]
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        x = data[int(x) - 1]
        y = data[int(y) - 1]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 9.854811429977417
Classification completed in 11.936106204986572
Classification completed in 10.841482400894165
Classification completed in 11.571316480636597
Classification completed in 10.927136182785034
Classification completed in 10.533207654953003
Classification completed in 10.371700525283813
Classification completed in 10.976463317871094
Classification completed in 10.248457431793213
Classification completed in 10.554164171218872
Classification completed in 9.855355739593506
Classification completed in 8.965590238571167
Classification completed in 9.005253553390503
Classification completed in 8.366992950439453
Classification completed in 7.105306386947632
Classification completed in 6.910343408584595
Classification completed in 6.341107368469238
Classification completed in 5.444311141967773
Classification completed in 5.1020307540893555
Classification completed in 4.50076150894165


In [25]:
max(f1), f1.index(max(f1))

(0.7794688152790212, 0)

In [26]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [28]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(1,10)) + list(range(10, 32, 2))
plt.plot(l, f1, "o", label="AVPRA F1-score", markersize=10)

plt.axvline(x=5, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_PI_AVPRA.png", dpi=500)
plt.show()

  plt.show()
