In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./edges_norm.csv")

In [3]:
i_time = time.time()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.time() - i_time}")

Computed pagerank in: 26.84650182723999


In [5]:
bet_l = []
max_v = max(bet.values())
min_v = min(bet.values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# Heterogeneous intervals

In [6]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [7]:
pd.Series(node_labels).value_counts()

1     158355
2       6524
3       1529
4        587
5        312
6        188
7        114
8         81
16        77
9         70
11        64
10        53
12        47
13        37
17        32
15        25
14        19
dtype: int64

### DW

In [9]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [10]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 129.32689833641052
Iteration completed in 160.40623307228088
Iteration completed in 194.29215049743652
Iteration completed in 159.33435344696045
Iteration completed in 183.89550375938416
Iteration completed in 152.81885981559753


In [11]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,6.0,6.0
mean,0.941741,0.073723
std,0.001292,0.018461
min,0.940279,0.057013
25%,0.940844,0.057701
50%,0.941513,0.069822
75%,0.942472,0.086989
max,0.943729,0.09931


In [12]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
3,0.940279,1034.079826,10.0,10.0
4,0.940695,2063.098696,20.0,10.0
2,0.94129,1684.428942,40.0,5.0
1,0.941736,852.987108,20.0,5.0
5,0.942718,4032.707988,40.0,10.0
0,0.943729,430.843513,10.0,5.0


In [13]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
3,0.057013,1034.079826,10.0,10.0
1,0.057238,852.987108,20.0,5.0
0,0.059088,430.843513,10.0,5.0
4,0.080556,2063.098696,20.0,10.0
2,0.089133,1684.428942,40.0,5.0
5,0.09931,4032.707988,40.0,10.0


# Heterogeneous pt2

In [14]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [15]:
pd.Series(node_labels).value_counts()

7     23107
8     21904
6     21459
5     17279
9     17164
4     13011
1     12160
10    11682
3      9367
11     6817
2      6408
12     3680
13     1950
14      968
15      514
16      274
17      160
18       98
19       63
20       26
21       14
22        4
24        3
23        2
dtype: int64

### DW

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 118.4878785610199
Iteration completed in 127.39502239227295
Iteration completed in 169.31393003463745
Iteration completed in 140.5721137523651
Iteration completed in 171.35189247131348
Iteration completed in 163.5417559146881


In [19]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,6.0,6.0
mean,0.166761,0.072565
std,0.035125,0.018433
min,0.127978,0.040066
25%,0.14609,0.067026
50%,0.154284,0.076747
75%,0.189402,0.084501
max,0.219552,0.090957


In [20]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
2,0.127978,1684.428942,40.0,5.0
5,0.145704,4032.707988,40.0,10.0
4,0.14725,2063.098696,20.0,10.0
1,0.161318,852.987108,20.0,5.0
3,0.198763,1034.079826,10.0,10.0
0,0.219552,430.843513,10.0,5.0


In [21]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
2,0.040066,1684.428942,40.0,5.0
1,0.065607,852.987108,20.0,5.0
4,0.071283,2063.098696,20.0,10.0
3,0.082211,1034.079826,10.0,10.0
5,0.085264,4032.707988,40.0,10.0
0,0.090957,430.843513,10.0,5.0
