In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
from itertools import permutations
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
nodes = []
l = list(nx.selfloop_edges(G))
for x, _ in l:
    if(G.degree(x) == 2): 
        G.remove_node(x)
        nodes.append(x)

In [5]:
len(list(G.nodes()))

778

In [6]:
### Generating random links
perm = permutations(G.nodes(), 2)
l = list(perm)
random.shuffle(l)
l = l[:55000]

In [7]:
### Checking how many of the links are already in the graph
t = 0
f = 0
for x, y in l:
    if G.has_edge(str(x), str(y)): t += 1
    else: f += 1
t, f

(4642, 50358)

In [8]:
### Creating the final list of links
l2 = list(G.edges())
random.shuffle(l2)
l3 = l2
z = list(map(lambda x: tuple(x), l))
for x, y in z:
    l3.append((str(x), str(y)))
l3 = list(pd.Series(l3).unique())

In [9]:
### Checking the proportion of links in the graph and not in the graph
t = 0
f = 0
for x, y in l3:
    if G.has_edge(str(x), str(y)): t += 1
    else: f += 1
print(f"Edges in: {t}, not in: {f}")

Edges in: 50492, not in: 50358


In [10]:
### Creating labels for the classification
y_data = []
for x, y in l3:
    if G.has_edge(str(x), str(y)): y_data.append(1)
    else: y_data.append(0)

# Using cosine similarity

### N2V

In [11]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_len:" in line:
            walk_len.append(float(line[10:-1]))
        if "num_walks:" in line:
            walk_num.append(float(line[10:-1]))

In [12]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()

    start_time2 = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        v1 = data[int(x) - 1]
        v2 = data[int(y) - 1]
        X_data.append([np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    end_time2 = time.time()
    print(f"Classification completed in {end_time2 - start_time2}")

Classification completed in 17.30066752433777
Classification completed in 18.232832431793213
Classification completed in 16.078786611557007
Classification completed in 17.20681381225586
Classification completed in 18.320362329483032
Classification completed in 17.69763207435608
Classification completed in 16.707271099090576
Classification completed in 16.79551124572754
Classification completed in 16.2435884475708
Classification completed in 16.252206325531006
Classification completed in 17.919450283050537
Classification completed in 17.169945001602173
Classification completed in 17.127931594848633
Classification completed in 16.53184413909912
Classification completed in 17.41111183166504
Classification completed in 16.67901635169983
Classification completed in 16.093984603881836
Classification completed in 17.200079917907715
Classification completed in 16.586543560028076
Classification completed in 14.65467619895935
Classification completed in 13.94481348991394
Classification completed

In [13]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
19,0.715568,16.335857,1.0,2.0,80.0,10.0
23,0.721269,14.734928,2.0,1.0,80.0,10.0
11,0.722062,17.349393,0.5,0.5,80.0,10.0
7,0.722757,13.87576,0.5,1.0,80.0,10.0
9,0.723451,11.893926,0.5,0.5,80.0,5.0
17,0.72469,12.942001,1.0,2.0,80.0,5.0
2,0.725136,7.874896,1.0,1.0,40.0,10.0
14,0.725632,13.128179,1.0,0.5,40.0,10.0
22,0.727219,11.564198,2.0,1.0,40.0,10.0
3,0.72826,10.022111,1.0,1.0,80.0,10.0


In [14]:
pd.Series(f1).describe()

count    24.000000
mean      0.730390
std       0.007120
min       0.715568
25%       0.725025
50%       0.729623
75%       0.736230
max       0.741596
dtype: float64

### DW

In [15]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_len:" in line:
            walk_len.append(float(line[10:-1]))
        if "num_walks:" in line:
            walk_num.append(float(line[10:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [16]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()

    start_time2 = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        v1 = data[int(x) - 1]
        v2 = data[int(y) - 1]
        X_data.append([np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    end_time2 = time.time()
    print(f"Classification completed in {end_time2 - start_time2}")

Classification completed in 12.75719141960144
Classification completed in 12.554570198059082
Classification completed in 12.553688049316406
Classification completed in 12.234835386276245
Classification completed in 13.570468187332153
Classification completed in 13.273082733154297
Classification completed in 13.495827913284302
Classification completed in 14.684361696243286
Classification completed in 15.545490503311157
Classification completed in 14.203025341033936
Classification completed in 13.857622146606445
Classification completed in 12.24808931350708
Classification completed in 13.814459562301636
Classification completed in 13.138517141342163
Classification completed in 12.694257259368896


In [17]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
0,0.724294,0.402893,1.0,1.0,10.0,5.0
3,0.734804,2.473622,1.0,1.0,80.0,5.0
8,0.735449,1.912276,0.5,0.5,10.0,20.0
12,0.735449,2.839129,1.0,0.5,10.0,30.0
1,0.735944,0.685927,1.0,1.0,20.0,5.0
4,0.735994,1.075852,0.5,1.0,10.0,10.0
9,0.736787,3.54515,0.5,0.5,20.0,20.0
5,0.739564,1.798483,0.5,1.0,20.0,10.0
6,0.743728,3.092509,0.5,1.0,40.0,10.0
2,0.744026,1.407827,1.0,1.0,40.0,5.0


In [18]:
pd.Series(f1).describe()

count    15.000000
mean      0.743368
std       0.011619
min       0.724294
25%       0.735697
50%       0.739564
75%       0.749851
max       0.766237
dtype: float64

### MNMF

In [19]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [20]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()

    start_time2 = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        v1 = data[int(x) - 1]
        v2 = data[int(y) - 1]
        X_data.append([np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    end_time2 = time.time()
    print(f"Classification completed in {end_time2 - start_time2}")

Classification completed in 9.70695185661316
Classification completed in 10.237005949020386
Classification completed in 11.721257209777832
Classification completed in 10.068088293075562
Classification completed in 12.587967157363892
Classification completed in 11.102945804595947
Classification completed in 13.621671199798584
Classification completed in 12.422095537185669
Classification completed in 14.179595947265625
Classification completed in 13.59572982788086


In [21]:
pd.DataFrame(list(zip(f1, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
8,0.740605,24.153916,0.5,0.5,10.0,20.0
6,0.742985,13.537538,0.5,1.0,40.0,10.0
4,0.748537,7.710691,0.5,1.0,10.0,10.0
7,0.751165,39.003183,0.5,1.0,80.0,10.0
9,0.759296,55.908975,0.5,0.5,20.0,20.0
3,0.75999,2.778379,1.0,1.0,80.0,5.0
2,0.760288,1.368036,1.0,1.0,40.0,5.0
5,0.760833,18.921276,0.5,1.0,20.0,10.0
0,0.764303,0.995836,1.0,1.0,10.0,5.0
1,0.767328,1.749397,1.0,1.0,20.0,5.0


In [22]:
pd.Series(f1).describe()

count    10.000000
mean      0.755533
std       0.009123
min       0.740605
25%       0.749194
50%       0.759643
75%       0.760697
max       0.767328
dtype: float64

### AVPRA

In [23]:
obj = pd.read_pickle("./AVPRA_pred.pickled")

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for res in obj[1:]:
    data = res[1]
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in l3:
        x = data[int(x) - 1]
        y = data[int(y) - 1]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 13.3321373462677
Classification completed in 13.732078790664673
Classification completed in 13.601150751113892
Classification completed in 13.00435185432434
Classification completed in 13.143681526184082
Classification completed in 12.857078790664673
Classification completed in 12.82499647140503
Classification completed in 12.73482608795166
Classification completed in 13.290181636810303
Classification completed in 13.721590757369995
Classification completed in 13.566513061523438
Classification completed in 12.369187593460083
Classification completed in 10.73883318901062
Classification completed in 10.832464456558228
Classification completed in 10.022976636886597
Classification completed in 8.871507406234741
Classification completed in 8.221402168273926
Classification completed in 7.712164402008057
Classification completed in 7.099161863327026
Classification completed in 6.5682761669158936


In [25]:
max(f1), f1.index(max(f1))

(0.7795736241943481, 0)

In [28]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [29]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(1,10)) + list(range(10, 32, 2))
plt.plot(l, f1, "o", label="AVPRA F1-score", markersize=10)

plt.axvline(x=5, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_POPULITE_cosine_AVPRA.png", dpi=500)
plt.show()

  plt.show()
