In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [2]:
data = pd.read_csv("./HU_comms.txt", sep=" ", header=None)
comms_dict = {}
for row in data.iterrows():
    comms_dict[str(row[1][0])] = row[1][1]

### N2V

In [3]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HU_n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [4]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 37.47399353981018
Completed iteration in: 38.420801877975464
Completed iteration in: 39.61255979537964
Completed iteration in: 41.767568588256836
Completed iteration in: 39.98410177230835
Completed iteration in: 39.95740222930908
Completed iteration in: 40.303332567214966
Completed iteration in: 40.39688849449158
Completed iteration in: 41.02097535133362
Completed iteration in: 40.15558695793152
Completed iteration in: 38.73620271682739
Completed iteration in: 39.28893208503723
Completed iteration in: 39.78379034996033
Completed iteration in: 38.92808723449707
Completed iteration in: 38.696877241134644
Completed iteration in: 39.923776388168335
Completed iteration in: 40.59238123893738
Completed iteration in: 39.48372006416321
Completed iteration in: 39.29574680328369
Completed iteration in: 40.17040657997131
Completed iteration in: 39.03217887878418
Completed iteration in: 38.733874797821045
Completed iteration in: 38.629554986953735
Completed iteration in: 38.

In [5]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,24.0,24.0,24.0
mean,0.821549,0.806512,0.822497
std,0.022996,0.031453,0.02315
min,0.790808,0.755987,0.789325
25%,0.806268,0.786708,0.807333
50%,0.816786,0.79829,0.818147
75%,0.833351,0.82319,0.835426
max,0.860644,0.862382,0.861374


In [6]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
19,0.790808,461.019866,1.0,2.0,80.0,10.0
7,0.791018,485.59117,0.5,1.0,80.0,10.0
23,0.798065,500.880916,2.0,1.0,80.0,10.0
11,0.79838,468.974333,0.5,0.5,80.0,10.0
3,0.801851,436.778427,1.0,1.0,80.0,10.0
6,0.802482,253.726269,0.5,1.0,40.0,10.0
15,0.807531,499.899741,1.0,0.5,80.0,10.0
9,0.807636,214.085608,0.5,0.5,80.0,5.0
18,0.808582,248.378643,1.0,2.0,40.0,10.0
22,0.814262,248.845797,2.0,1.0,40.0,10.0


In [7]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
19,0.755987,461.019866,1.0,2.0,80.0,10.0
7,0.762287,485.59117,0.5,1.0,80.0,10.0
23,0.776339,500.880916,2.0,1.0,80.0,10.0
11,0.777094,468.974333,0.5,0.5,80.0,10.0
3,0.778836,436.778427,1.0,1.0,80.0,10.0
6,0.785942,253.726269,0.5,1.0,40.0,10.0
9,0.786963,214.085608,0.5,0.5,80.0,5.0
15,0.791537,499.899741,1.0,0.5,80.0,10.0
18,0.793762,248.378643,1.0,2.0,40.0,10.0
10,0.794797,254.258841,0.5,0.5,40.0,10.0


In [8]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,p,q,walk_num,walk_len
19,0.789325,461.019866,1.0,2.0,80.0,10.0
7,0.791063,485.59117,0.5,1.0,80.0,10.0
23,0.798484,500.880916,2.0,1.0,80.0,10.0
11,0.799646,468.974333,0.5,0.5,80.0,10.0
3,0.802753,436.778427,1.0,1.0,80.0,10.0
6,0.803734,253.726269,0.5,1.0,40.0,10.0
9,0.808533,214.085608,0.5,0.5,80.0,5.0
15,0.809242,499.899741,1.0,0.5,80.0,10.0
18,0.81016,248.378643,1.0,2.0,40.0,10.0
22,0.81519,248.845797,2.0,1.0,40.0,10.0


### DW

In [9]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HU_dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [10]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 30.49635076522827
Completed iteration in: 29.87781858444214
Completed iteration in: 29.60082244873047
Completed iteration in: 29.95871591567993
Completed iteration in: 29.629802942276
Completed iteration in: 29.18105125427246
Completed iteration in: 29.387110471725464
Completed iteration in: 29.469979286193848
Completed iteration in: 29.342081308364868
Completed iteration in: 29.466455698013306
Completed iteration in: 29.340846061706543
Completed iteration in: 29.82179045677185
Completed iteration in: 29.269107341766357
Completed iteration in: 29.416096687316895
Completed iteration in: 29.063231945037842


In [11]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.82581,0.818901,0.827126
std,0.046686,0.047641,0.045824
min,0.679533,0.674383,0.684816
25%,0.822781,0.813447,0.82368
50%,0.84655,0.837494,0.847591
75%,0.849548,0.847039,0.850532
max,0.856542,0.853797,0.857173


In [12]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.679533,36.506493,10.0,5.0
1,0.773454,68.210582,20.0,5.0
2,0.800905,133.75116,40.0,5.0
3,0.812369,266.070136,80.0,5.0
4,0.833193,87.543224,10.0,10.0
7,0.838873,681.768971,80.0,10.0
5,0.8435,170.240475,20.0,10.0
11,0.84655,1184.07436,80.0,20.0
9,0.846866,369.003128,20.0,20.0
6,0.848443,334.773742,40.0,10.0


In [13]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.674383,36.506493,10.0,5.0
1,0.761328,68.210582,20.0,5.0
2,0.790443,133.75116,40.0,5.0
3,0.801056,266.070136,80.0,5.0
4,0.825838,87.543224,10.0,10.0
7,0.826906,681.768971,80.0,10.0
5,0.836459,170.240475,20.0,10.0
9,0.837494,369.003128,20.0,20.0
11,0.840786,1184.07436,80.0,20.0
10,0.842193,623.870464,40.0,20.0


In [14]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.684816,36.506493,10.0,5.0
1,0.773889,68.210582,20.0,5.0
2,0.801846,133.75116,40.0,5.0
3,0.813489,266.070136,80.0,5.0
4,0.83387,87.543224,10.0,10.0
7,0.839944,681.768971,80.0,10.0
5,0.844448,170.240475,20.0,10.0
9,0.847591,369.003128,20.0,20.0
11,0.848125,1184.07436,80.0,20.0
6,0.84957,334.773742,40.0,10.0


### MNMF

In [15]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HU_mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [16]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric    
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 7.376118183135986
Completed iteration in: 6.528241157531738
Completed iteration in: 14.279026746749878
Completed iteration in: 12.24210810661316
Completed iteration in: 20.56584072113037
Completed iteration in: 16.536006927490234
Completed iteration in: 37.02779197692871
Completed iteration in: 28.070369482040405


In [17]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,8.0,8.0,8.0
mean,0.774072,0.732287,0.770414
std,0.100546,0.131347,0.104752
min,0.615692,0.52846,0.605513
25%,0.739588,0.678488,0.734469
50%,0.813368,0.776679,0.811444
75%,0.844946,0.829548,0.844325
max,0.856647,0.847954,0.856119


In [19]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.615692,285.491338,8.0,200.0
0,0.620635,95.28386,8.0,100.0
3,0.779239,701.013073,16.0,200.0
2,0.785128,421.509037,16.0,100.0
5,0.841607,2139.761206,32.0,200.0
4,0.84308,1211.796356,32.0,100.0
7,0.850547,5875.845202,64.0,200.0
6,0.856647,3373.030116,64.0,100.0


In [20]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.52846,285.491338,8.0,200.0
0,0.536407,95.28386,8.0,100.0
3,0.725849,701.013073,16.0,200.0
2,0.737659,421.509037,16.0,100.0
5,0.815698,2139.761206,32.0,200.0
4,0.825961,1211.796356,32.0,100.0
7,0.840309,5875.845202,64.0,200.0
6,0.847954,3373.030116,64.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, dim, it)),
               columns =['F1-weigh', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Dimensions,Iterations
1,0.605513,285.491338,8.0,200.0
0,0.610361,95.28386,8.0,100.0
3,0.775839,701.013073,16.0,200.0
2,0.782179,421.509037,16.0,100.0
5,0.840709,2139.761206,32.0,200.0
4,0.842353,1211.796356,32.0,100.0
7,0.850242,5875.845202,64.0,200.0
6,0.856119,3373.030116,64.0,100.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HU_danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 10.394587278366089
Completed iteration in: 11.241891384124756
Completed iteration in: 11.324953079223633
Completed iteration in: 11.769464015960693
Completed iteration in: 14.559908628463745
Completed iteration in: 14.150871753692627
Completed iteration in: 13.975885152816772
Completed iteration in: 14.364198684692383
Completed iteration in: 25.251478910446167
Completed iteration in: 25.051452159881592
Completed iteration in: 24.245140075683594
Completed iteration in: 24.350826263427734
Completed iteration in: 10.82567310333252
Completed iteration in: 14.05940842628479
Completed iteration in: 23.309097290039062


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.641144,0.581833,0.627707
std,0.097102,0.102249,0.102394
min,0.447833,0.394715,0.425448
25%,0.551851,0.482439,0.533293
50%,0.663021,0.596011,0.649906
75%,0.728387,0.678674,0.719395
max,0.749684,0.705804,0.743742


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
2,0.447833,108.151318,"[32, 8]",50.0,100.0
0,0.514409,59.408877,"[32, 8]",50.0,50.0
3,0.531658,115.569006,"[32, 8]",100.0,100.0
12,0.536075,216.932015,"[32, 8]",50.0,50.0
1,0.567627,71.049077,"[32, 8]",100.0,50.0
5,0.659445,174.576857,"[64, 16]",100.0,50.0
13,0.661127,516.619871,"[64, 16]",100.0,50.0
6,0.663021,220.051627,"[64, 16]",50.0,100.0
7,0.663757,254.67847,"[64, 16]",100.0,100.0
4,0.687316,128.863256,"[64, 16]",50.0,50.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
2,0.394715,108.151318,"[32, 8]",50.0,100.0
0,0.452016,59.408877,"[32, 8]",50.0,50.0
3,0.463284,115.569006,"[32, 8]",100.0,100.0
12,0.473568,216.932015,"[32, 8]",50.0,50.0
1,0.49131,71.049077,"[32, 8]",100.0,50.0
13,0.593885,516.619871,"[64, 16]",100.0,50.0
5,0.594307,174.576857,"[64, 16]",100.0,50.0
6,0.596011,220.051627,"[64, 16]",50.0,100.0
7,0.599243,254.67847,"[64, 16]",100.0,100.0
4,0.624218,128.863256,"[64, 16]",50.0,50.0


In [27]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, lay, pre_it, it)),
               columns =['F1-weigh', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Layers,Pre-terations,Iterations
2,0.425448,108.151318,"[32, 8]",50.0,100.0
0,0.494902,59.408877,"[32, 8]",50.0,50.0
3,0.509268,115.569006,"[32, 8]",100.0,100.0
12,0.518432,216.932015,"[32, 8]",50.0,50.0
1,0.548154,71.049077,"[32, 8]",100.0,50.0
5,0.647048,174.576857,"[64, 16]",100.0,50.0
13,0.649612,516.619871,"[64, 16]",100.0,50.0
6,0.649906,220.051627,"[64, 16]",50.0,100.0
7,0.652001,254.67847,"[64, 16]",100.0,100.0
4,0.676363,128.863256,"[64, 16]",50.0,50.0
