In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./large_twitch_edges.csv", delimiter=",")

In [3]:
from sklearn.utils.random import sample_without_replacement

def sample_comb3(dims, nsamp):
    idx = sample_without_replacement(np.prod(dims), nsamp)
    return np.vstack(np.unravel_index(idx, dims)).T

l = sample_comb3((168114, 168114), 100000)

In [4]:
### Check how many of the edges are non unique
links_gen = list(map(lambda x: (x[0], x[1]), l))

non_unique = 0
for i in range(len(links_gen)):
    if (links_gen[i][1], links_gen[i][0]) in links_gen[i:]:      
        non_unique += 1
    if i % 10000 == 0:
        print(f"Processed {i} links")

print(f"Unique links: {len(pd.Series(list(map(lambda x: (x[0], x[1]), links_gen))).unique()) - non_unique}")

Processed 0 links
Processed 10000 links
Processed 20000 links
Processed 30000 links
Processed 40000 links
Processed 50000 links
Processed 60000 links
Processed 70000 links
Processed 80000 links
Processed 90000 links
Unique links: 99998


In [5]:
### Creating final links list
links = list(G.edges())
random.shuffle(links)
links = links[:200000]

links += links_gen

In [7]:
### Check how many of the links are in the graphs and not in the graph
t = 0
f = 0
for x, y in links:
    if G.has_edge(str(x), str(y)): t += 1
    else: f += 1
print(f"In/Not in: {t, f}")

In/Not in: (200054, 99946)


In [8]:
### Creating labels
y_data = []
for x, y in links:
    if G.has_edge(str(x), str(y)): y_data.append(1)
    else: y_data.append(0)

### DW

In [9]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./Twitch_dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [10]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for i in range(tests_num):
    data = pd.read_csv("./Twitch_dw/dw_emb_vectors" + str(i) + ".csv", header=None, delimiter=";").values.tolist()
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 58.10507941246033
Classification completed in 54.93438196182251
Classification completed in 49.04243302345276
Classification completed in 53.112956523895264
Classification completed in 48.99031639099121
Classification completed in 48.75883483886719


In [11]:
pd.DataFrame(list(zip(f1, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.687883,430.843513,10.0,5.0
1,0.750383,852.987108,20.0,5.0
3,0.761383,1034.079826,10.0,10.0
2,0.793133,1684.428942,40.0,5.0
4,0.812317,2063.098696,20.0,10.0
5,0.826283,4032.707988,40.0,10.0


In [12]:
pd.Series(f1).describe()

count    6.000000
mean     0.771897
std      0.050347
min      0.687883
25%      0.753133
50%      0.777258
75%      0.807521
max      0.826283
dtype: float64

### AVPRA all features

In [29]:
obj = pd.read_pickle("All_feat/log_trial_0_LPStates.pickled") + \
            pd.read_pickle("All_feat/log_trial_1_LPStates.pickled")[1:] + \
                pd.read_pickle("All_feat/log_trial_2_LPStates.pickled")[1:] + \
                    pd.read_pickle("All_feat/log_trial_3_LPStates.pickled")[1:]

In [30]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f1 = []

for res in obj:
    data = res[1]
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 7.6893391609191895
Classification completed in 46.20007681846619
Classification completed in 47.75898623466492
Classification completed in 43.1504967212677
Classification completed in 37.41994595527649
Classification completed in 26.50743079185486
Classification completed in 24.09148144721985
Classification completed in 18.169604301452637
Classification completed in 16.93538522720337
Classification completed in 19.18341827392578
Classification completed in 14.585596561431885
Classification completed in 14.228188514709473
Classification completed in 13.6672043800354
Classification completed in 13.086324214935303
Classification completed in 12.58159852027893
Classification completed in 16.279738664627075
Classification completed in 12.253262758255005
Classification completed in 12.021146059036255
Classification completed in 11.508776903152466
Classification completed in 10.234082460403442
Classification completed in 13.103567361831665
Classification completed 

In [31]:
max(f1), f1.index(max(f1))

(0.74195, 18)

In [32]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [10]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [f1[i] for i in l], "o", label="AVPRA F1-score", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("LP_allf.png", dpi=500)
plt.show()

  plt.show()


### AVPRA only lang

In [34]:
obj = []

In [35]:
obj = pd.read_pickle("./Only_lang/log_trial_0_LPStates.pickled") + \
            pd.read_pickle("./Only_lang/log_trial_1_LPStates.pickled")[1:] + \
                pd.read_pickle("./Only_lang/log_trial_2_LPStates.pickled")[1:] + \
                    pd.read_pickle("./Only_lang/log_trial_3_LPStates.pickled")[1:]

In [36]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)

f12 = []

for res in obj:
    data = res[1]
    start_time = time.time()
    
    # Input
    X_data = []
    for x, y in links:
        x = data[int(x)]
        y = data[int(y)]
        X_data.append([np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))])

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f12.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Classification completed in {time.time() - start_time}")

Classification completed in 6.6968584060668945
Classification completed in 24.835124731063843
Classification completed in 20.561519384384155
Classification completed in 19.721925020217896
Classification completed in 20.803082704544067
Classification completed in 19.061986923217773
Classification completed in 17.213862419128418
Classification completed in 17.828810214996338
Classification completed in 16.25322461128235
Classification completed in 19.191070795059204
Classification completed in 15.560381412506104
Classification completed in 15.241949558258057
Classification completed in 14.81319260597229
Classification completed in 13.816583395004272
Classification completed in 16.240943431854248
Classification completed in 13.160102605819702
Classification completed in 12.46449589729309
Classification completed in 12.117457628250122
Classification completed in 14.67012333869934
Classification completed in 11.328020572662354
Classification completed in 10.073093891143799
Classification co

In [37]:
max(f12), f12.index(max(f12))

(0.7952000000000001, 1)

In [4]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

In [12]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [f12[i] for i in l], "o", label="AVPRA F1-score", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("LP_onlyl.png", dpi=500)
plt.show()

  plt.show()


In [11]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [f12[i] for i in l], "o", label="AVPRA F1-score", markersize=10)
plt.plot(l, [f1[i] for i in l], "o", label="AVPRA* F1-score", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("LP_comparison.png", dpi=500)
plt.show()

  plt.show()
