In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [2]:
data = pd.read_csv("./twitch_comms.csv", sep=" ", header=None)
comms_dict = {}
for row in data.iterrows():
    comms_dict[str(row[1][0])] = row[1][1]

### DW

In [3]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./Twitch_dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [4]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./Twitch_dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 105.49885535240173
Completed iteration in: 101.1717836856842
Completed iteration in: 100.9141776561737
Completed iteration in: 100.70342254638672
Completed iteration in: 102.60185956954956
Completed iteration in: 101.40108752250671


In [5]:
df = pd.DataFrame(columns = ['Micro', 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,6.0,6.0,6.0
mean,0.845428,0.847821,0.842361
std,0.010695,0.022981,0.010777
min,0.823692,0.815944,0.820442
25%,0.848385,0.842596,0.845612
50%,0.849761,0.8443,0.846498
75%,0.850288,0.851043,0.847103
max,0.850965,0.887066,0.848108


In [6]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.823692,430.843513,10.0,5.0
3,0.848021,1034.079826,10.0,10.0
2,0.849478,1684.428942,40.0,5.0
4,0.850043,2063.098696,20.0,10.0
5,0.85037,4032.707988,40.0,10.0
1,0.850965,852.987108,20.0,5.0


In [7]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.815944,430.843513,10.0,5.0
3,0.842242,1034.079826,10.0,10.0
4,0.843657,2063.098696,20.0,10.0
1,0.844944,852.987108,20.0,5.0
5,0.853076,4032.707988,40.0,10.0
2,0.887066,1684.428942,40.0,5.0


In [8]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.820442,430.843513,10.0,5.0
3,0.845459,1034.079826,10.0,10.0
2,0.846068,1684.428942,40.0,5.0
5,0.846927,4032.707988,40.0,10.0
4,0.847162,2063.098696,20.0,10.0
1,0.848108,852.987108,20.0,5.0
