In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
# Load the graph from the edge-list file. With no `nodetype` argument networkx
# keeps node ids as strings, which is why later cells look nodes up via str(i).
G = nx.read_edgelist("./edges_norm.csv")

In [3]:
# Time the degree lookup.
# NOTE: despite its name, `bet` holds node degrees (the result of nx.degree).
started_at = time.time()
bet = nx.degree(G)
elapsed = time.time() - started_at
print(f"Computed degrees in: {elapsed}")

Computed degrees in: 0.00010633468627929688


In [4]:
### Degree scores min-max normalization
# NOTE: `bet` holds node degrees (nx.degree), not PageRank or betweenness values.
degree_by_node = dict(bet)  # materialise the degree view once, not once per min/max call
max_v = max(degree_by_node.values(), default=0)
min_v = min(degree_by_node.values(), default=0)
value_range = max_v - min_v

# Node ids are expected to be the strings "1".."N" (a missing id raises KeyError).
# Walking them in numeric order keeps bet_l[k] tied to node k + 1.
# When every node has the same degree the range is 0; map everything to 0.0
# instead of dividing by zero.
bet_l = [
    (degree_by_node[str(i)] - min_v) / value_range if value_range else 0.0
    for i in range(1, len(degree_by_node) + 1)
]

# Heterogeneous

In [13]:
# Label every node with a geometrically growing bucket: bucket 1 covers normalised
# scores up to 0.0001, and each following bucket's upper bound is 1.5x the previous one.
node_labels = []
for score in bet_l:
    upper_bound = 0.0001
    bucket = 1
    # Keep widening the bucket until the score fits under its upper bound.
    while not (score <= upper_bound):
        bucket += 1
        upper_bound *= 1.5
    node_labels.append(bucket)

In [14]:
# Number of nodes per label, most frequent label first (shows the class balance)
pd.Series(node_labels).value_counts()

7     20942
8     19965
6     19176
9     16972
5     15650
1     15565
10    12813
4     12199
11     8830
2      7157
3      6692
12     5503
13     3085
14     1678
15      857
16      452
17      267
18      139
19       86
20       48
21       23
22        7
24        4
23        4
dtype: int64

### DW

In [15]:
### Collect the number of tests plus per-test settings and timings from the DW info log
tests_num = 0
exec_time, walk_num, walk_len = [], [], []
with open("./dw/twitch_dw_info.txt", "r") as info_file:
    # The checks are independent on purpose: one line may satisfy several of them.
    for record in info_file:
        if "Test" in record:
            tests_num += 1  # every line mentioning "Test" counts as one run
        if "walk_length:" in record:
            # fixed-offset slice: value starts at column 13, last character dropped
            # (presumably the newline)
            walk_len.append(float(record[13:-1]))
        if "num_walk:" in record:
            # value starts at column 9, last character dropped
            walk_num.append(float(record[9:-1]))
        if "Embedding" in record:
            # value starts at column 31, last two characters dropped
            # (presumably a unit suffix and the newline -- verify against the log)
            exec_time.append(float(record[31:-2]))

In [16]:
### Random forest classifier (70 trees) evaluated on every DW embedding file
SEED = 42  # fixed seed so both the split and the forest are reproducible across runs
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv(f"./dw/dw_emb_vectors{i}.csv", header=None, sep=";").values.tolist()

    # NOTE(review): every row is sorted in descending order on its own, so column j
    # no longer holds the same embedding dimension for all nodes -- confirm intended.
    X_data = [sorted(row, reverse=True) for row in data]
    # Labels are paired positionally: row k of the CSV goes with node_labels[k].
    y_data = node_labels

    # Split the data into training set (80%) and test set (20%); the shared seed
    # gives every embedding file the same train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=SEED
    )

    # fit() rebuilds the forest from scratch (warm_start is off), so reusing clf is safe
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### F1 metrics (micro- and macro-averaged)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 139.52080130577087
Iteration completed in 156.20618224143982
Iteration completed in 218.52163577079773
Iteration completed in 165.68988370895386
Iteration completed in 169.88926672935486
Iteration completed in 157.60085678100586


In [17]:
# Summary statistics of the per-test F1 scores, micro and macro side by side
micro_stats = pd.Series(f1_scores).describe()
macro_stats = pd.Series(f1_scores_macro).describe()
df = pd.DataFrame(columns=["Micro", "Macro"]).assign(Micro=micro_stats, Macro=macro_stats)
df

Unnamed: 0,Micro,Macro
count,6.0,6.0
mean,0.157833,0.06573
std,0.032486,0.019761
min,0.118252,0.041292
25%,0.140179,0.050352
50%,0.148693,0.066651
75%,0.178843,0.079256
max,0.205127,0.091411


In [18]:
# One row per test: micro-F1 next to the embedding run's settings, lowest score first
micro_rows = list(zip(f1_scores, exec_time, walk_num, walk_len))
micro_columns = ['F1-micro', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(micro_rows, columns=micro_columns).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
2,0.118252,1684.428942,40.0,5.0
4,0.138268,2063.098696,20.0,10.0
5,0.145912,4032.707988,40.0,10.0
1,0.151474,852.987108,20.0,5.0
3,0.187967,1034.079826,10.0,10.0
0,0.205127,430.843513,10.0,5.0


In [19]:
# One row per test: macro-F1 next to the embedding run's settings, lowest score first
macro_rows = list(zip(f1_scores_macro, exec_time, walk_num, walk_len))
macro_columns = ['F1-macro', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(macro_rows, columns=macro_columns).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
2,0.041292,1684.428942,40.0,5.0
4,0.045823,2063.098696,20.0,10.0
1,0.063939,852.987108,20.0,5.0
3,0.069363,1034.079826,10.0,10.0
5,0.082553,4032.707988,40.0,10.0
0,0.091411,430.843513,10.0,5.0


# Heterogeneous pt2

In [6]:
# Hand-picked, ordered bin edges on the normalised score. A node gets the label of
# the first edge its score does not exceed; scores above every edge get the last label.
bin_upper_bounds = (
    [0.005 * step for step in range(1, 11)]          # labels 1..10  (fine bins)
    + [0.05 + 0.01 * step for step in range(1, 6)]   # labels 11..15 (medium bins)
    + [0.2]                                          # label 16      (coarse bin)
)
catch_all_label = len(bin_upper_bounds) + 1          # label 17: everything above 0.2

node_labels = [
    next(
        (label for label, bound in enumerate(bin_upper_bounds, start=1) if bet_v <= bound),
        catch_all_label,
    )
    for bet_v in bet_l
]

In [7]:
# Number of nodes per label, most frequent label first (shows the class balance)
pd.Series(node_labels).value_counts()

1     153285
2       9471
3       2516
4       1036
5        530
6        288
7        207
8        133
16       127
9        104
11       104
12        83
10        75
13        47
17        45
14        38
15        25
dtype: int64

### DW

In [8]:
### Collect the number of tests plus per-test settings and timings from the DW info log
tests_num = 0
exec_time, walk_num, walk_len = [], [], []
with open("./dw/twitch_dw_info.txt", "r") as info_file:
    # The checks are independent on purpose: one line may satisfy several of them.
    for record in info_file:
        if "Test" in record:
            tests_num += 1  # every line mentioning "Test" counts as one run
        if "walk_length:" in record:
            # fixed-offset slice: value starts at column 13, last character dropped
            # (presumably the newline)
            walk_len.append(float(record[13:-1]))
        if "num_walk:" in record:
            # value starts at column 9, last character dropped
            walk_num.append(float(record[9:-1]))
        if "Embedding" in record:
            # value starts at column 31, last two characters dropped
            # (presumably a unit suffix and the newline -- verify against the log)
            exec_time.append(float(record[31:-2]))

In [9]:
### Random forest classifier (70 trees) evaluated on every DW embedding file
SEED = 42  # fixed seed so both the split and the forest are reproducible across runs
clf = RandomForestClassifier(n_estimators=70, random_state=SEED)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv(f"./dw/dw_emb_vectors{i}.csv", header=None, sep=";").values.tolist()

    # NOTE(review): every row is sorted in descending order on its own, so column j
    # no longer holds the same embedding dimension for all nodes -- confirm intended.
    X_data = [sorted(row, reverse=True) for row in data]
    # Labels are paired positionally: row k of the CSV goes with node_labels[k].
    y_data = node_labels

    # Split the data into training set (80%) and test set (20%); the shared seed
    # gives every embedding file the same train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=SEED
    )

    # fit() rebuilds the forest from scratch (warm_start is off), so reusing clf is safe
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### F1 metrics (micro- and macro-averaged)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 150.05650877952576
Iteration completed in 185.55354070663452
Iteration completed in 221.35382270812988
Iteration completed in 179.26041793823242
Iteration completed in 216.4745922088623
Iteration completed in 176.34801864624023


In [10]:
# Summary statistics of the per-test F1 scores, micro and macro side by side
micro_stats = pd.Series(f1_scores).describe()
macro_stats = pd.Series(f1_scores_macro).describe()
df = pd.DataFrame(columns=["Micro", "Macro"]).assign(Micro=micro_stats, Macro=macro_stats)
df

Unnamed: 0,Micro,Macro
count,6.0,6.0
mean,0.911633,0.063265
std,0.001194,0.009178
min,0.909378,0.056105
25%,0.911616,0.056749
50%,0.911787,0.058593
75%,0.912426,0.070575
max,0.912649,0.075597


In [11]:
# One row per test: micro-F1 next to the embedding run's settings, lowest score first
micro_rows = list(zip(f1_scores, exec_time, walk_num, walk_len))
micro_columns = ['F1-micro', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(micro_rows, columns=micro_columns).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
4,0.909378,2063.098696,20.0,10.0
5,0.911608,4032.707988,40.0,10.0
3,0.911638,1034.079826,10.0,10.0
0,0.911935,430.843513,10.0,5.0
1,0.91259,852.987108,20.0,5.0
2,0.912649,1684.428942,40.0,5.0


In [12]:
# One row per test: macro-F1 next to the embedding run's settings, lowest score first
macro_rows = list(zip(f1_scores_macro, exec_time, walk_num, walk_len))
macro_columns = ['F1-macro', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(macro_rows, columns=macro_columns).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
3,0.056105,1034.079826,10.0,10.0
2,0.056265,1684.428942,40.0,5.0
0,0.058199,430.843513,10.0,5.0
1,0.058987,852.987108,20.0,5.0
4,0.074437,2063.098696,20.0,10.0
5,0.075597,4032.707988,40.0,10.0
