In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [2]:
### Load the community assignments from a space-separated, header-less file:
### column 0 is used as the node id, column 1 as that node's community label.
data = pd.read_csv("./RO_comms.txt", sep=" ", header=None)

# Build the lookup column-wise rather than with iterrows(): iterrows() casts
# each row to one common dtype, so an integer node id would become a float
# (key "1.0" instead of "1") whenever the label column is not an integer,
# and every later lookup by str(node_id) would miss.
comms_dict = dict(zip(data[0].astype(str), data[1]))

### N2V

In [3]:
### Parse the node2vec run log: count the tests and collect each test's
### embedding time, p, q, walk length and number of walks.
tests_num = 0
exec_time, p, q = [], [], []
walk_len, walk_num = [], []

# (marker, slice holding the number, destination list). Values are read from
# fixed character offsets, so this relies on the log's exact line layout.
n2v_fields = (
    ("Embedding", slice(31, -2), exec_time),
    ("p:", slice(3, None), p),
    ("q:", slice(3, None), q),
    ("walk_length:", slice(13, -1), walk_len),
    ("num_walk:", slice(9, -1), walk_num),
)

with open("./RO_n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    # Markers are checked independently: one line may feed more than one list.
    for marker, span, values in n2v_fields:
        if marker in line:
            values.append(float(line[span]))

In [4]:
### Evaluate every node2vec embedding: train a 70-tree random forest to predict
### a node's community from its embedding vector and record three F1 averages.
RANDOM_STATE = 42  # fixed seed: same split and same forest on every re-run

f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Target labels: each node's community from comms_dict (noted by the author as
# the Louvain partition). They do not depend on the test, so build them once.
# Labels are ordered by node id 1..N and paired with embedding rows by
# position -- assumes the embedding files are written in that order (TODO confirm).
# Plain indexing raises KeyError on a missing id instead of inserting None.
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
    s_t = time.time()

    # Input: one embedding vector per row, ';'-separated, no header
    X_data = pd.read_csv(f"./RO_n2v/RO_emb_vectors{i}.csv", header=None, sep=";").values

    # Hold out 20% of the nodes for testing; the seed keeps the split identical
    # for every embedding so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### F1 score under micro, macro and weighted averaging
    for average, scores in (
        ("micro", f1_scores),
        ("macro", f1_scores_macro),
        ("weighted", f1_scores_weigh),
    ):
        scores.append(metrics.f1_score(y_test, y_pred, average=average))

    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 41.81510043144226
Completed iteration in: 45.139074087142944
Completed iteration in: 46.60807538032532
Completed iteration in: 48.8662383556366
Completed iteration in: 46.313652753829956
Completed iteration in: 46.68782186508179
Completed iteration in: 46.10769462585449
Completed iteration in: 46.58239722251892
Completed iteration in: 47.263917684555054
Completed iteration in: 45.97070908546448
Completed iteration in: 46.52992916107178
Completed iteration in: 46.0703763961792
Completed iteration in: 45.76181435585022
Completed iteration in: 46.24411082267761
Completed iteration in: 46.17183589935303
Completed iteration in: 46.0146746635437
Completed iteration in: 46.90791082382202
Completed iteration in: 45.51440215110779
Completed iteration in: 44.991690158843994
Completed iteration in: 45.86496639251709
Completed iteration in: 46.691136837005615
Completed iteration in: 46.124799728393555
Completed iteration in: 45.70538377761841
Completed iteration in: 45.6618

In [5]:
# Summary statistics of the per-test F1 scores, one column per averaging mode.
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,24.0,24.0,24.0
mean,0.847492,0.835084,0.849307
std,0.011397,0.029129,0.0112
min,0.823938,0.763707,0.825632
25%,0.839647,0.817056,0.841406
50%,0.846379,0.838514,0.848223
75%,0.854428,0.862419,0.856779
max,0.870257,0.881686,0.871331


In [6]:
# Per-test F1-micro next to the node2vec settings that produced it, lowest
# score first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-micro": f1_scores,
    "Exec time": exec_time,
    "p": p,
    "q": q,
    "walk_num": walk_num,
    "walk_len": walk_len,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
7,0.823938,334.074217,0.5,1.0,80.0,10.0
11,0.832914,358.826658,0.5,0.5,80.0,10.0
19,0.834949,350.874268,1.0,2.0,80.0,10.0
10,0.835787,188.911285,0.5,0.5,40.0,10.0
5,0.838181,154.161536,0.5,1.0,80.0,5.0
15,0.8383,367.378012,1.0,0.5,80.0,10.0
3,0.840096,317.039041,1.0,1.0,80.0,10.0
6,0.840335,179.560204,0.5,1.0,40.0,10.0
13,0.843567,159.525569,1.0,0.5,80.0,5.0
17,0.844524,154.053255,1.0,2.0,80.0,5.0


In [7]:
# Per-test F1-macro next to the node2vec settings that produced it, lowest
# score first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-macro": f1_scores_macro,
    "Exec time": exec_time,
    "p": p,
    "q": q,
    "walk_num": walk_num,
    "walk_len": walk_len,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.763707,367.378012,1.0,0.5,80.0,10.0
5,0.794188,154.161536,0.5,1.0,80.0,5.0
3,0.798595,317.039041,1.0,1.0,80.0,10.0
10,0.804034,188.911285,0.5,0.5,40.0,10.0
9,0.807302,152.856183,0.5,0.5,80.0,5.0
6,0.812289,179.560204,0.5,1.0,40.0,10.0
11,0.818644,358.826658,0.5,0.5,80.0,10.0
14,0.819087,186.44177,1.0,0.5,40.0,10.0
7,0.81961,334.074217,0.5,1.0,80.0,10.0
19,0.832992,350.874268,1.0,2.0,80.0,10.0


In [8]:
# Per-test weighted F1 next to the node2vec settings that produced it, lowest
# score first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-weigh": f1_scores_weigh,
    "Exec time": exec_time,
    "p": p,
    "q": q,
    "walk_num": walk_num,
    "walk_len": walk_len,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,p,q,walk_num,walk_len
7,0.825632,334.074217,0.5,1.0,80.0,10.0
11,0.833956,358.826658,0.5,0.5,80.0,10.0
19,0.837859,350.874268,1.0,2.0,80.0,10.0
10,0.838524,188.911285,0.5,0.5,40.0,10.0
15,0.840029,367.378012,1.0,0.5,80.0,10.0
5,0.840764,154.161536,0.5,1.0,80.0,5.0
3,0.84162,317.039041,1.0,1.0,80.0,10.0
6,0.842396,179.560204,0.5,1.0,40.0,10.0
13,0.844866,159.525569,1.0,0.5,80.0,5.0
23,0.846489,396.724931,2.0,1.0,80.0,10.0


### DW

In [9]:
### Parse the DW run log: count the tests and collect each test's walk length,
### number of walks and embedding time.
tests_num = 0
exec_time = []
walk_num, walk_len = [], []

# (marker, slice holding the number, destination list). Values are read from
# fixed character offsets, so this relies on the log's exact line layout.
dw_fields = (
    ("walk_length:", slice(13, -1), walk_len),
    ("num_walk:", slice(9, -1), walk_num),
    ("Embedding", slice(31, -2), exec_time),
)

with open("./RO_dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    # Markers are checked independently: one line may feed more than one list.
    for marker, span, values in dw_fields:
        if marker in line:
            values.append(float(line[span]))

In [10]:
### Evaluate every DW embedding: train a 70-tree random forest to predict
### a node's community from its embedding vector and record three F1 averages.
RANDOM_STATE = 42  # fixed seed: same split and same forest on every re-run

f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Target labels: each node's community from comms_dict (noted by the author as
# the Louvain partition). They do not depend on the test, so build them once.
# Labels are ordered by node id 1..N and paired with embedding rows by
# position -- assumes the embedding files are written in that order (TODO confirm).
# Plain indexing raises KeyError on a missing id instead of inserting None.
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
    s_t = time.time()

    # Input: one embedding vector per row, ';'-separated, no header
    X_data = pd.read_csv(f"./RO_dw/RO_emb_vectors{i}.csv", header=None, sep=";").values

    # Hold out 20% of the nodes for testing; the seed keeps the split identical
    # for every embedding so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### F1 score under micro, macro and weighted averaging
    for average, scores in (
        ("micro", f1_scores),
        ("macro", f1_scores_macro),
        ("weighted", f1_scores_weigh),
    ):
        scores.append(metrics.f1_score(y_test, y_pred, average=average))

    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 34.973692893981934
Completed iteration in: 34.88617920875549
Completed iteration in: 33.764947175979614
Completed iteration in: 33.64773964881897
Completed iteration in: 33.81943893432617
Completed iteration in: 33.830902099609375
Completed iteration in: 33.789947748184204
Completed iteration in: 33.46132755279541
Completed iteration in: 34.070077657699585
Completed iteration in: 34.794790744781494
Completed iteration in: 34.24219489097595
Completed iteration in: 33.99081635475159
Completed iteration in: 35.602662086486816
Completed iteration in: 30.801609992980957
Completed iteration in: 30.17422580718994


In [11]:
# Summary statistics of the per-test F1 scores, one column per averaging mode.
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.818863,0.81953,0.822088
std,0.053692,0.051357,0.050824
min,0.668462,0.70051,0.68396
25%,0.808737,0.804425,0.81157
50%,0.845003,0.842297,0.847105
75%,0.846858,0.85117,0.849169
max,0.859246,0.87761,0.861324


In [12]:
# Per-test F1-micro next to the DW settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-micro": f1_scores,
    "Exec time": exec_time,
    "walk_num": walk_num,
    "walk_len": walk_len,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.668462,25.814546,10.0,5.0
1,0.743148,49.297842,20.0,5.0
2,0.772591,94.867974,40.0,5.0
3,0.789348,178.922319,80.0,5.0
4,0.828127,59.892339,10.0,10.0
5,0.838779,119.407379,20.0,10.0
6,0.843088,232.0977,40.0,10.0
7,0.845003,446.217201,80.0,10.0
8,0.845961,130.357413,10.0,20.0
14,0.84608,833.834561,40.0,30.0


In [13]:
# Per-test F1-macro next to the DW settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-macro": f1_scores_macro,
    "Exec time": exec_time,
    "walk_num": walk_num,
    "walk_len": walk_len,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.70051,25.814546,10.0,5.0
1,0.722779,49.297842,20.0,5.0
2,0.779363,94.867974,40.0,5.0
3,0.787281,178.922319,80.0,5.0
4,0.821569,59.892339,10.0,10.0
7,0.822466,446.217201,80.0,10.0
14,0.824261,833.834561,40.0,30.0
9,0.842297,256.848043,20.0,20.0
8,0.844075,130.357413,10.0,20.0
13,0.847089,398.917009,20.0,30.0


In [14]:
# Per-test weighted F1 next to the DW settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-weigh": f1_scores_weigh,
    "Exec time": exec_time,
    "walk_num": walk_num,
    "walk_len": walk_len,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.68396,25.814546,10.0,5.0
1,0.745598,49.297842,20.0,5.0
2,0.776974,94.867974,40.0,5.0
3,0.792131,178.922319,80.0,5.0
4,0.83101,59.892339,10.0,10.0
5,0.841232,119.407379,20.0,10.0
6,0.844777,232.0977,40.0,10.0
7,0.847105,446.217201,80.0,10.0
8,0.847816,130.357413,10.0,20.0
14,0.848047,833.834561,40.0,30.0


### MNMF

In [15]:
### Parse the MNMF run log: count the tests and collect each test's
### dimensions, iterations and embedding time.
tests_num = 0
exec_time = []
dim, it = [], []

# (marker, slice holding the number, destination list). Values are read from
# fixed character offsets, so this relies on the log's exact line layout.
mnmf_fields = (
    ("dimensions", slice(12, None), dim),
    ("iterations:", slice(12, None), it),
    ("Embedding", slice(31, -2), exec_time),
)

with open("./RO_mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    if "Test" in line:
        tests_num += 1
    # Markers are checked independently: one line may feed more than one list.
    for marker, span, values in mnmf_fields:
        if marker in line:
            values.append(float(line[span]))

In [16]:
### Evaluate every MNMF embedding: train a 70-tree random forest to predict
### a node's community from its embedding vector and record three F1 averages.
RANDOM_STATE = 42  # fixed seed: same split and same forest on every re-run

f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Target labels: each node's community from comms_dict (noted by the author as
# the Louvain partition). They do not depend on the test, so build them once.
# Labels are ordered by node id 1..N and paired with embedding rows by
# position -- assumes the embedding files are written in that order (TODO confirm).
# Plain indexing raises KeyError on a missing id instead of inserting None.
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
    s_t = time.time()

    # Input: one embedding vector per row, ';'-separated, no header
    X_data = pd.read_csv(f"./RO_mnmf/RO_emb_vectors{i}.csv", header=None, sep=";").values

    # Hold out 20% of the nodes for testing; the seed keeps the split identical
    # for every embedding so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### F1 score under micro, macro and weighted averaging
    for average, scores in (
        ("micro", f1_scores),
        ("macro", f1_scores_macro),
        ("weighted", f1_scores_weigh),
    ):
        scores.append(metrics.f1_score(y_test, y_pred, average=average))

    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 6.202125072479248
Completed iteration in: 5.5665998458862305
Completed iteration in: 10.825352191925049
Completed iteration in: 9.326264381408691
Completed iteration in: 15.2455894947052
Completed iteration in: 12.337901592254639
Completed iteration in: 25.742332935333252
Completed iteration in: 19.775558710098267


In [17]:
# Summary statistics of the per-test F1 scores, one column per averaging mode.
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,8.0,8.0,8.0
mean,0.71336,0.552591,0.706217
std,0.141634,0.158456,0.147898
min,0.487133,0.307589,0.470738
25%,0.640934,0.458096,0.630651
50%,0.758468,0.59324,0.752576
75%,0.823399,0.683324,0.82148
max,0.839138,0.705488,0.837621


In [19]:
# Per-test F1-micro next to the MNMF settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-micro": f1_scores,
    "Exec time": exec_time,
    "Dimensions": dim,
    "Iterations": it,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.487133,144.387542,8.0,200.0
0,0.528187,47.664274,8.0,100.0
3,0.678516,336.308679,16.0,200.0
2,0.698743,207.895045,16.0,100.0
4,0.818193,487.69828,32.0,100.0
5,0.818312,772.527251,32.0,200.0
6,0.838659,1141.203295,64.0,100.0
7,0.839138,1883.460323,64.0,200.0


In [20]:
# Per-test F1-macro next to the MNMF settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-macro": f1_scores_macro,
    "Exec time": exec_time,
    "Dimensions": dim,
    "Iterations": it,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.307589,144.387542,8.0,200.0
0,0.347981,47.664274,8.0,100.0
3,0.494801,336.308679,16.0,200.0
2,0.537352,207.895045,16.0,100.0
5,0.649127,772.527251,32.0,200.0
4,0.677452,487.69828,32.0,100.0
7,0.700941,1883.460323,64.0,200.0
6,0.705488,1141.203295,64.0,100.0


In [21]:
# Per-test weighted F1 next to the MNMF settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
table_columns = {
    "F1-weigh": f1_scores_weigh,
    "Exec time": exec_time,
    "Dimensions": dim,
    "Iterations": it,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Dimensions,Iterations
1,0.470738,144.387542,8.0,200.0
0,0.512471,47.664274,8.0,100.0
3,0.670045,336.308679,16.0,200.0
2,0.689653,207.895045,16.0,100.0
5,0.815499,772.527251,32.0,200.0
4,0.816108,487.69828,32.0,100.0
7,0.837599,1883.460323,64.0,200.0
6,0.837621,1141.203295,64.0,100.0


### DANMF

In [22]:
### Parse the DANMF run log: count the tests and collect each test's layers,
### pre_iterations, iterations and embedding time.
tests_num = 0
exec_time, lay = [], []
it, pre_it = [], []

with open("./RO_danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()

# Values are read from fixed character offsets, so this relies on the log's
# exact line layout. Each marker is checked independently of the others.
for line in lines:
    tests_num += 1 if "Test" in line else 0
    if "layers:" in line:
        # Stored as the raw text of the line, not converted to numbers.
        lay.append(line[8:-1])
    if "pre_iterations:" in line:
        pre_it.append(float(line[16:-1]))
    # "iterations:" is also a substring of "pre_iterations:", so those lines
    # must not be counted a second time here.
    is_pre_line = "pre_iterations" in line
    if not is_pre_line and "iterations:" in line:
        it.append(float(line[11:-1]))
    if "Embedding" in line:
        exec_time.append(float(line[31:-2]))

In [23]:
### Evaluate every DANMF embedding: train a 70-tree random forest to predict
### a node's community from its embedding vector and record three F1 averages.
RANDOM_STATE = 42  # fixed seed: same split and same forest on every re-run

f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Target labels: each node's community from comms_dict (noted by the author as
# the Louvain partition). They do not depend on the test, so build them once.
# Labels are ordered by node id 1..N and paired with embedding rows by
# position -- assumes the embedding files are written in that order (TODO confirm).
# Plain indexing raises KeyError on a missing id instead of inserting None.
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
    s_t = time.time()

    # Input: one embedding vector per row, ';'-separated, no header
    X_data = pd.read_csv(f"./RO_danmf/RO_emb_vectors{i}.csv", header=None, sep=";").values

    # Hold out 20% of the nodes for testing; the seed keeps the split identical
    # for every embedding so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### F1 score under micro, macro and weighted averaging
    for average, scores in (
        ("micro", f1_scores),
        ("macro", f1_scores_macro),
        ("weighted", f1_scores_weigh),
    ):
        scores.append(metrics.f1_score(y_test, y_pred, average=average))

    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 10.400036334991455
Completed iteration in: 10.533987045288086
Completed iteration in: 10.410372495651245
Completed iteration in: 11.593565702438354
Completed iteration in: 15.645893335342407
Completed iteration in: 16.774592638015747
Completed iteration in: 15.536533832550049
Completed iteration in: 15.97313928604126
Completed iteration in: 26.677085161209106
Completed iteration in: 28.25245189666748
Completed iteration in: 25.88269853591919
Completed iteration in: 27.04776668548584
Completed iteration in: 11.54161548614502
Completed iteration in: 15.103352785110474
Completed iteration in: 25.859432220458984


In [24]:
# Summary statistics of the per-test F1 scores, one column per averaging mode.
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.635452,0.51551,0.624193
std,0.081827,0.091495,0.085177
min,0.516098,0.387715,0.498365
25%,0.537882,0.403027,0.522922
50%,0.660563,0.546296,0.649394
75%,0.714841,0.587246,0.707178
max,0.721125,0.625396,0.714577


In [25]:
# Per-test F1-micro next to the DANMF settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
# Column label spelled "Pre-iterations" (was the typo "Pre-terations").
table_columns = {
    "F1-micro": f1_scores,
    "Exec time": exec_time,
    "Layers": lay,
    "Pre-iterations": pre_it,
    "Iterations": it,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
12,0.516098,136.182096,"[32, 8]",200.0,200.0
3,0.527708,77.28074,"[32, 8]",100.0,100.0
2,0.528665,68.263189,"[32, 8]",50.0,100.0
0,0.530461,39.073625,"[32, 8]",50.0,50.0
1,0.545302,50.047976,"[32, 8]",100.0,50.0
6,0.647157,139.50002,"[64, 16]",50.0,100.0
13,0.652543,313.978146,"[64, 16]",200.0,200.0
7,0.660563,169.919596,"[64, 16]",100.0,100.0
5,0.663435,116.574285,"[64, 16]",100.0,50.0
4,0.667983,81.475821,"[64, 16]",50.0,50.0


In [26]:
# Per-test F1-macro next to the DANMF settings that produced it, lowest score
# first. zip() stops at the shortest list, so extra entries are dropped.
# Column label spelled "Pre-iterations" (was the typo "Pre-terations").
table_columns = {
    "F1-macro": f1_scores_macro,
    "Exec time": exec_time,
    "Layers": lay,
    "Pre-iterations": pre_it,
    "Iterations": it,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
12,0.387715,136.182096,"[32, 8]",200.0,200.0
3,0.39107,77.28074,"[32, 8]",100.0,100.0
0,0.392728,39.073625,"[32, 8]",50.0,50.0
2,0.394151,68.263189,"[32, 8]",50.0,100.0
1,0.411903,50.047976,"[32, 8]",100.0,50.0
6,0.532571,139.50002,"[64, 16]",50.0,100.0
7,0.543901,169.919596,"[64, 16]",100.0,100.0
13,0.546296,313.978146,"[64, 16]",200.0,200.0
5,0.556472,116.574285,"[64, 16]",100.0,50.0
4,0.567886,81.475821,"[64, 16]",50.0,50.0


In [27]:
# Per-test weighted F1 next to the DANMF settings that produced it, lowest
# score first. zip() stops at the shortest list, so extra entries are dropped.
# Column label spelled "Pre-iterations" (was the typo "Pre-terations").
table_columns = {
    "F1-weigh": f1_scores_weigh,
    "Exec time": exec_time,
    "Layers": lay,
    "Pre-iterations": pre_it,
    "Iterations": it,
}
table_rows = list(zip(*table_columns.values()))
pd.DataFrame(table_rows, columns=list(table_columns)).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Layers,Pre-terations,Iterations
12,0.498365,136.182096,"[32, 8]",200.0,200.0
2,0.513292,68.263189,"[32, 8]",50.0,100.0
3,0.514392,77.28074,"[32, 8]",100.0,100.0
0,0.515541,39.073625,"[32, 8]",50.0,50.0
1,0.530302,50.047976,"[32, 8]",100.0,50.0
6,0.636448,139.50002,"[64, 16]",50.0,100.0
13,0.640489,313.978146,"[64, 16]",200.0,200.0
7,0.649394,169.919596,"[64, 16]",100.0,100.0
5,0.654081,116.574285,"[64, 16]",100.0,50.0
4,0.654726,81.475821,"[64, 16]",50.0,50.0
