In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [2]:
data = pd.read_csv("./HR_comms.txt", sep=" ", header=None)
comms_dict = {}
for row in data.iterrows():
    comms_dict[str(row[1][0])] = row[1][1]

### N2V

In [3]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HR_n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [4]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 45.97557806968689
Completed iteration in: 46.377442359924316
Completed iteration in: 47.5812451839447
Completed iteration in: 44.45557141304016
Completed iteration in: 46.93637776374817
Completed iteration in: 45.74704694747925
Completed iteration in: 46.13062262535095
Completed iteration in: 45.45283102989197
Completed iteration in: 46.894251585006714
Completed iteration in: 45.12437677383423
Completed iteration in: 45.41667890548706
Completed iteration in: 45.18350052833557
Completed iteration in: 48.50412201881409
Completed iteration in: 45.51646542549133
Completed iteration in: 45.87464356422424
Completed iteration in: 46.33717465400696
Completed iteration in: 46.82514715194702
Completed iteration in: 44.049684047698975
Completed iteration in: 45.62450194358826
Completed iteration in: 44.837828159332275
Completed iteration in: 47.17175912857056
Completed iteration in: 45.51800537109375
Completed iteration in: 45.47628307342529
Completed iteration in: 45.3618

In [5]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,24.0,24.0,24.0
mean,0.897687,0.896402,0.898865
std,0.018595,0.022523,0.018265
min,0.86688,0.858126,0.869285
25%,0.888617,0.884426,0.889898
50%,0.895969,0.893312,0.897087
75%,0.904604,0.908241,0.905978
max,0.926981,0.932314,0.927435


In [6]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
19,0.86688,695.624031,1.0,2.0,80.0,10.0
3,0.874118,575.860253,1.0,1.0,80.0,10.0
11,0.874576,700.444418,0.5,0.5,80.0,10.0
23,0.875218,709.758869,2.0,1.0,80.0,10.0
7,0.876592,687.228601,0.5,1.0,80.0,10.0
15,0.881539,696.772516,1.0,0.5,80.0,10.0
18,0.890976,383.037323,1.0,2.0,40.0,10.0
6,0.891892,397.617808,0.5,1.0,40.0,10.0
17,0.892716,340.624793,1.0,2.0,80.0,5.0
22,0.894824,400.727971,2.0,1.0,40.0,10.0


In [7]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
19,0.858126,695.624031,1.0,2.0,80.0,10.0
11,0.863485,700.444418,0.5,0.5,80.0,10.0
15,0.869743,696.772516,1.0,0.5,80.0,10.0
23,0.870972,709.758869,2.0,1.0,80.0,10.0
7,0.872458,687.228601,0.5,1.0,80.0,10.0
3,0.874016,575.860253,1.0,1.0,80.0,10.0
10,0.887896,388.473305,0.5,0.5,40.0,10.0
6,0.889756,397.617808,0.5,1.0,40.0,10.0
18,0.88995,383.037323,1.0,2.0,40.0,10.0
1,0.89169,289.630887,1.0,1.0,80.0,5.0


In [8]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,p,q,walk_num,walk_len
19,0.869285,695.624031,1.0,2.0,80.0,10.0
11,0.875695,700.444418,0.5,0.5,80.0,10.0
3,0.876221,575.860253,1.0,1.0,80.0,10.0
23,0.876367,709.758869,2.0,1.0,80.0,10.0
7,0.877903,687.228601,0.5,1.0,80.0,10.0
15,0.882899,696.772516,1.0,0.5,80.0,10.0
18,0.892231,383.037323,1.0,2.0,40.0,10.0
6,0.893042,397.617808,0.5,1.0,40.0,10.0
17,0.894446,340.624793,1.0,2.0,80.0,5.0
2,0.89556,331.992402,1.0,1.0,40.0,10.0


### DW

In [9]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HR_dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [10]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 33.7544584274292
Completed iteration in: 33.989905834198
Completed iteration in: 33.29111170768738
Completed iteration in: 32.789466857910156
Completed iteration in: 35.07798218727112
Completed iteration in: 33.96595644950867
Completed iteration in: 33.761709690093994
Completed iteration in: 33.46374177932739
Completed iteration in: 35.28178668022156
Completed iteration in: 34.476938247680664
Completed iteration in: 34.83988928794861
Completed iteration in: 34.08767652511597
Completed iteration in: 31.415771961212158
Completed iteration in: 30.713939428329468
Completed iteration in: 31.13910222053528


In [11]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.906086,0.909838,0.907841
std,0.018263,0.018811,0.017442
min,0.850023,0.853379,0.854227
25%,0.903161,0.904428,0.90511
50%,0.91388,0.917286,0.915109
75%,0.916399,0.920523,0.917679
max,0.918644,0.924838,0.920101


In [12]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.850023,38.241227,10.0,5.0
1,0.88667,72.052979,20.0,5.0
2,0.894457,142.735676,40.0,5.0
3,0.897755,288.967294,80.0,5.0
5,0.908566,171.306192,20.0,10.0
4,0.908933,90.918856,10.0,10.0
7,0.912872,670.22348,80.0,10.0
11,0.91388,1482.992684,80.0,20.0
10,0.915071,734.545967,40.0,20.0
6,0.916262,341.508353,40.0,10.0


In [13]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.853379,38.241227,10.0,5.0
1,0.892094,72.052979,20.0,5.0
2,0.894736,142.735676,40.0,5.0
3,0.900437,288.967294,80.0,5.0
5,0.908419,171.306192,20.0,10.0
4,0.91354,90.918856,10.0,10.0
11,0.915931,1482.992684,80.0,20.0
13,0.917286,699.778587,20.0,30.0
12,0.918211,310.089163,10.0,30.0
10,0.919109,734.545967,40.0,20.0


In [14]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.854227,38.241227,10.0,5.0
1,0.889569,72.052979,20.0,5.0
2,0.896457,142.735676,40.0,5.0
3,0.900331,288.967294,80.0,5.0
5,0.90989,171.306192,20.0,10.0
4,0.910281,90.918856,10.0,10.0
7,0.914566,670.22348,80.0,10.0
11,0.915109,1482.992684,80.0,20.0
10,0.916449,734.545967,40.0,20.0
12,0.917192,310.089163,10.0,30.0


### MNMF

In [15]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HR_mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [16]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric    
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 7.259463787078857
Completed iteration in: 6.540953636169434
Completed iteration in: 15.3931405544281
Completed iteration in: 13.531736850738525
Completed iteration in: 22.3658607006073
Completed iteration in: 17.85519242286682
Completed iteration in: 31.33110737800598


In [17]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,7.0,7.0,7.0
mean,0.852366,0.815722,0.849133
std,0.074918,0.107764,0.080435
min,0.730463,0.630259,0.717665
25%,0.822767,0.776903,0.81774
50%,0.890792,0.863986,0.89066
75%,0.898397,0.880116,0.898655
max,0.902978,0.901773,0.902815


In [19]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.730463,1231.952486,8.0,200.0
0,0.756482,423.98246,8.0,100.0
3,0.889052,3089.716942,16.0,200.0
2,0.890792,1873.740525,16.0,100.0
4,0.898397,6102.296241,32.0,100.0
6,0.898397,18979.924733,64.0,100.0
5,0.902978,11782.491975,32.0,200.0


In [20]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.630259,1231.952486,8.0,200.0
0,0.692541,423.98246,8.0,100.0
5,0.861264,11782.491975,32.0,200.0
4,0.863986,6102.296241,32.0,100.0
3,0.878298,3089.716942,16.0,200.0
2,0.881934,1873.740525,16.0,100.0
6,0.901773,18979.924733,64.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, dim, it)),
               columns =['F1-weigh', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Dimensions,Iterations
1,0.717665,1231.952486,8.0,200.0
0,0.746842,423.98246,8.0,100.0
3,0.888638,3089.716942,16.0,200.0
2,0.89066,1873.740525,16.0,100.0
4,0.89864,6102.296241,32.0,100.0
6,0.898669,18979.924733,64.0,100.0
5,0.902815,11782.491975,32.0,200.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HR_danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 15.154385805130005
Completed iteration in: 15.699474811553955
Completed iteration in: 14.137388706207275
Completed iteration in: 14.734705924987793
Completed iteration in: 19.28646230697632
Completed iteration in: 19.56876850128174
Completed iteration in: 19.119604349136353
Completed iteration in: 19.306734085083008
Completed iteration in: 31.86449694633484
Completed iteration in: 32.552202463150024
Completed iteration in: 30.51341938972473
Completed iteration in: 25.457651376724243
Completed iteration in: 11.468650341033936
Completed iteration in: 14.54837441444397
Completed iteration in: 25.405653476715088


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.794082,0.753602,0.791323
std,0.062777,0.075691,0.065796
min,0.658727,0.624754,0.648988
25%,0.735456,0.679876,0.729971
50%,0.814475,0.779506,0.813135
75%,0.845671,0.816584,0.845698
max,0.856986,0.834278,0.856996


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
12,0.658727,560.116846,"[32, 8]",50.0,50.0
2,0.717911,268.152193,"[32, 8]",50.0,100.0
3,0.727806,278.286608,"[32, 8]",100.0,100.0
0,0.729546,151.062392,"[32, 8]",50.0,50.0
1,0.741365,164.564006,"[32, 8]",100.0,50.0
13,0.79716,1321.520216,"[64, 16]",100.0,50.0
6,0.808429,608.053479,"[64, 16]",50.0,100.0
7,0.814475,653.606651,"[64, 16]",100.0,100.0
5,0.828126,382.492228,"[64, 16]",100.0,50.0
4,0.830875,336.803176,"[64, 16]",50.0,50.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
12,0.624754,560.116846,"[32, 8]",50.0,50.0
3,0.651953,278.286608,"[32, 8]",100.0,100.0
2,0.653349,268.152193,"[32, 8]",50.0,100.0
0,0.666726,151.062392,"[32, 8]",50.0,50.0
1,0.693026,164.564006,"[32, 8]",100.0,50.0
13,0.735313,1321.520216,"[64, 16]",100.0,50.0
7,0.765337,653.606651,"[64, 16]",100.0,100.0
6,0.779506,608.053479,"[64, 16]",50.0,100.0
14,0.804698,2270.463403,"[128, 32]",50.0,100.0
5,0.805998,382.492228,"[64, 16]",100.0,50.0


In [27]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, lay, pre_it, it)),
               columns =['F1-weigh', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Layers,Pre-terations,Iterations
12,0.648988,560.116846,"[32, 8]",50.0,50.0
2,0.711863,268.152193,"[32, 8]",50.0,100.0
3,0.722495,278.286608,"[32, 8]",100.0,100.0
0,0.72329,151.062392,"[32, 8]",50.0,50.0
1,0.736653,164.564006,"[32, 8]",100.0,50.0
13,0.793305,1321.520216,"[64, 16]",100.0,50.0
6,0.805766,608.053479,"[64, 16]",50.0,100.0
7,0.813135,653.606651,"[64, 16]",100.0,100.0
5,0.827661,382.492228,"[64, 16]",100.0,50.0
4,0.829878,336.803176,"[64, 16]",50.0,50.0
