In [4]:
import pandas as pd
import networkx as nx
import time
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import KMeans

In [6]:
network = nx.read_edgelist("./edges_norm.csv")
nodes = list(network.nodes())
len(nodes)

168114

# Louvain communities (resolution = 0.5)

In [7]:
comms = nx.algorithms.community.louvain_communities(network, resolution=0.5)
comms_dict = {}
for i in range(len(comms)):
    for k in comms[i]:
        comms_dict[k] = i
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### DeepWalk

In [8]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./Twitch_dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [10]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./Twitch_dw/dw_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 49.94239592552185
Iteration completed in 41.121949195861816
Iteration completed in 34.37436604499817
Iteration completed in 36.953006982803345
Iteration completed in 34.03091096878052
Iteration completed in 34.037792444229126


In [13]:
pd.Series(scores).describe()

count    6.000000
mean     0.461302
std      0.004162
min      0.455931
25%      0.459331
50%      0.460031
75%      0.464791
max      0.466347
dtype: float64

In [14]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.455931,430.843513,10.0,5.0
2,0.459226,1684.428942,40.0,5.0
5,0.459644,4032.707988,40.0,10.0
4,0.460417,2063.098696,20.0,10.0
3,0.466248,1034.079826,10.0,10.0
1,0.466347,852.987108,20.0,5.0


### AVPRA only lang

In [15]:
obj = pd.read_pickle("Only_lang/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("Only_lang/log_trial_1_LPStates.pickled")[1:]\
    + pd.read_pickle("Only_lang/log_trial_2_LPStates.pickled")[1:]\
    + pd.read_pickle("Only_lang/log_trial_3_LPStates.pickled")[1:]

In [20]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

  kmeans = KMeans(n_clusters=len(comms)).fit(res[1])


Iteration completed in 93.41961312294006
Iteration completed in 11.578054904937744
Iteration completed in 15.071037769317627
Iteration completed in 13.922539949417114
Iteration completed in 15.685786008834839
Iteration completed in 15.86241626739502
Iteration completed in 16.79807949066162
Iteration completed in 18.480243921279907
Iteration completed in 17.651992082595825
Iteration completed in 16.436674118041992
Iteration completed in 17.901914596557617
Iteration completed in 21.84894108772278
Iteration completed in 17.134069204330444
Iteration completed in 18.266036987304688
Iteration completed in 15.819348335266113
Iteration completed in 16.803406238555908
Iteration completed in 17.83398985862732
Iteration completed in 15.1203773021698
Iteration completed in 15.980843544006348
Iteration completed in 15.04005742073059
Iteration completed in 14.837553977966309
Iteration completed in 16.450689792633057
Iteration completed in 17.867724895477295
Iteration completed in 15.692425012588501


In [21]:
pd.Series(scores).describe()

count    29.000000
mean      0.521420
std       0.026024
min       0.496495
25%       0.502295
50%       0.512877
75%       0.537936
max       0.614936
dtype: float64

In [22]:
max(scores), (list(range(0, 29)))[scores.index(max(scores))]

(0.6149355880958314, 0)

### AVPRA all features

In [23]:
obj = []

In [24]:
obj = pd.read_pickle("All_feat/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("All_feat/log_trial_1_LPStates.pickled")[1:]\
    + pd.read_pickle("All_feat/log_trial_2_LPStates.pickled")[1:]\
    + pd.read_pickle("All_feat/log_trial_3_LPStates.pickled")[1:]

In [25]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.404463529586792
Iteration completed in 17.46074676513672
Iteration completed in 25.0404109954834
Iteration completed in 17.686028003692627
Iteration completed in 21.8604154586792
Iteration completed in 17.052441120147705
Iteration completed in 19.060866355895996
Iteration completed in 19.93187189102173
Iteration completed in 18.229860067367554
Iteration completed in 18.09838342666626
Iteration completed in 19.900814533233643
Iteration completed in 20.80716347694397
Iteration completed in 18.084402561187744
Iteration completed in 16.70637273788452
Iteration completed in 18.61631464958191
Iteration completed in 17.875815868377686
Iteration completed in 17.005292654037476
Iteration completed in 17.508642196655273
Iteration completed in 17.78141450881958
Iteration completed in 16.210591316223145
Iteration completed in 17.115148305892944
Iteration completed in 17.351863384246826
Iteration completed in 16.85884952545166
Iteration completed in 16.065467596054077
Itera

In [26]:
pd.Series(scores).describe()

count    33.000000
mean      0.551404
std       0.059104
min       0.384357
25%       0.523544
50%       0.568757
75%       0.593104
max       0.617860
dtype: float64

In [27]:
max(scores), (list(range(0, 29)))[scores.index(max(scores))]

(0.6178600045565932, 21)

# Leiden communities

In [28]:
### Identifying Leiden communities
from cdlib import algorithms
comms = algorithms.leiden(network)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'graph_tool'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [29]:
comms_dict = comms.to_node_community_map()
comms_dict_ok = {}
for node in nodes:
    comms_dict_ok[node] = comms_dict[node][0]
comms_dict = comms_dict_ok
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]
comms = list(pd.Series(comms_l).unique())

### DeepWalk

In [30]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./Twitch_dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [31]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./Twitch_dw/dw_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 25.555725812911987
Iteration completed in 23.25350785255432
Iteration completed in 20.449732303619385
Iteration completed in 26.175666332244873
Iteration completed in 26.37688899040222
Iteration completed in 22.73935317993164


In [32]:
pd.Series(scores).describe()

count    6.000000
mean     0.558071
std      0.014727
min      0.540729
25%      0.548013
50%      0.555393
75%      0.569353
max      0.577353
dtype: float64

In [33]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
5,0.540729,4032.707988,40.0,10.0
2,0.545806,1684.428942,40.0,5.0
3,0.554631,1034.079826,10.0,10.0
0,0.556155,430.843513,10.0,5.0
4,0.573752,2063.098696,20.0,10.0
1,0.577353,852.987108,20.0,5.0


### AVPRA only lang

In [34]:
obj = pd.read_pickle("Only_lang/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("Only_lang/log_trial_1_LPStates.pickled")[1:]\
    + pd.read_pickle("Only_lang/log_trial_2_LPStates.pickled")[1:]\
    + pd.read_pickle("Only_lang/log_trial_3_LPStates.pickled")[1:]

In [35]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 83.17044997215271
Iteration completed in 8.472028017044067
Iteration completed in 9.278973579406738
Iteration completed in 8.960134744644165
Iteration completed in 9.610278606414795
Iteration completed in 11.06383991241455
Iteration completed in 9.823399305343628
Iteration completed in 11.440415859222412
Iteration completed in 10.544093132019043
Iteration completed in 10.229927062988281
Iteration completed in 11.008828401565552
Iteration completed in 11.443581819534302
Iteration completed in 12.866600751876831
Iteration completed in 11.21533727645874
Iteration completed in 12.49155569076538
Iteration completed in 12.284487247467041
Iteration completed in 9.813379526138306
Iteration completed in 11.69726824760437
Iteration completed in 10.8393235206604
Iteration completed in 11.927825689315796
Iteration completed in 11.158539056777954
Iteration completed in 10.431700229644775
Iteration completed in 9.904802083969116
Iteration completed in 12.30871319770813
Iterati

In [36]:
pd.Series(scores).describe()

count    29.000000
mean      0.487675
std       0.007404
min       0.468464
25%       0.481466
50%       0.490783
75%       0.492762
max       0.500247
dtype: float64

In [37]:
max(scores), (list(range(0, 29)))[scores.index(max(scores))]

(0.5002466081229369, 6)

### AVPRA all features

In [38]:
obj = []

In [39]:
obj = pd.read_pickle("All_feat/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("All_feat/log_trial_1_LPStates.pickled")[1:]\
    + pd.read_pickle("All_feat/log_trial_2_LPStates.pickled")[1:]\
    + pd.read_pickle("All_feat/log_trial_3_LPStates.pickled")[1:]

In [40]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.513599157333374
Iteration completed in 12.505852937698364
Iteration completed in 12.54334831237793
Iteration completed in 11.305163145065308
Iteration completed in 11.292146444320679
Iteration completed in 11.164754629135132
Iteration completed in 11.06567907333374
Iteration completed in 11.622767925262451
Iteration completed in 12.003674745559692
Iteration completed in 12.344833612442017
Iteration completed in 11.075608968734741
Iteration completed in 11.70937466621399
Iteration completed in 11.368999481201172
Iteration completed in 10.664387941360474
Iteration completed in 10.673967361450195
Iteration completed in 10.253332376480103
Iteration completed in 11.355802059173584
Iteration completed in 10.84235429763794
Iteration completed in 11.43644404411316
Iteration completed in 10.952480792999268
Iteration completed in 11.006418466567993
Iteration completed in 9.649899244308472
Iteration completed in 10.086345911026001
Iteration completed in 11.1312735080719
I

In [41]:
pd.Series(scores).describe()

count    33.000000
mean      0.498101
std       0.054745
min       0.293893
25%       0.472064
50%       0.498005
75%       0.545574
max       0.570746
dtype: float64

In [42]:
max(scores), (list(range(0, 29)))[scores.index(max(scores))]

(0.5707461853489736, 10)

# Standard Louvain

In [43]:
comms = nx.algorithms.community.louvain_communities(network)
comms_dict = {}
for i in range(len(comms)):
    for k in comms[i]:
        comms_dict[k] = i
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### DeepWalk

In [44]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./Twitch_dw/twitch_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [45]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./Twitch_dw/dw_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 18.684234619140625
Iteration completed in 20.115649700164795
Iteration completed in 21.93905782699585
Iteration completed in 19.640246629714966
Iteration completed in 20.045379877090454
Iteration completed in 21.082722425460815


In [46]:
pd.Series(scores).describe()

count    6.000000
mean     0.573731
std      0.008063
min      0.558269
25%      0.573925
50%      0.575660
75%      0.577254
max      0.581663
dtype: float64

In [47]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.558269,430.843513,10.0,5.0
3,0.573612,1034.079826,10.0,10.0
1,0.574863,852.987108,20.0,5.0
2,0.576456,1684.428942,40.0,5.0
5,0.57752,4032.707988,40.0,10.0
4,0.581663,2063.098696,20.0,10.0


### AVPRA only lang

In [48]:
obj = pd.read_pickle("Only_lang/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("Only_lang/log_trial_1_LPStates.pickled")[1:]\
    + pd.read_pickle("Only_lang/log_trial_2_LPStates.pickled")[1:]\
    + pd.read_pickle("Only_lang/log_trial_3_LPStates.pickled")[1:]

In [49]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.7652294635772705
Iteration completed in 6.5879151821136475
Iteration completed in 7.352309465408325
Iteration completed in 7.286115884780884
Iteration completed in 6.6576621532440186
Iteration completed in 7.3289453983306885
Iteration completed in 8.290010690689087
Iteration completed in 7.310366868972778
Iteration completed in 7.345384120941162
Iteration completed in 7.40246844291687
Iteration completed in 7.205644369125366
Iteration completed in 8.159144639968872
Iteration completed in 7.37564492225647
Iteration completed in 6.890023469924927
Iteration completed in 7.465459585189819
Iteration completed in 7.584427833557129
Iteration completed in 7.130307674407959
Iteration completed in 7.407140731811523
Iteration completed in 7.3476033210754395
Iteration completed in 7.082919120788574
Iteration completed in 7.376626968383789
Iteration completed in 7.954266309738159
Iteration completed in 8.051700115203857
Iteration completed in 7.487272262573242
Iteration com

In [50]:
pd.Series(scores).describe()

count    29.000000
mean      0.472908
std       0.012054
min       0.450924
25%       0.464176
50%       0.473756
75%       0.479091
max       0.498063
dtype: float64

In [51]:
max(scores), (list(range(0, 29)))[scores.index(max(scores))]

(0.4980633564414983, 2)

### AVPRA all features

In [52]:
obj = []

In [53]:
obj = pd.read_pickle("All_feat/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("All_feat/log_trial_1_LPStates.pickled")[1:]\
    + pd.read_pickle("All_feat/log_trial_2_LPStates.pickled")[1:]\
    + pd.read_pickle("All_feat/log_trial_3_LPStates.pickled")[1:]

In [54]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.420169115066528
Iteration completed in 9.48097014427185
Iteration completed in 9.082396030426025
Iteration completed in 7.902409791946411
Iteration completed in 7.316471099853516
Iteration completed in 8.639440298080444
Iteration completed in 7.187133073806763
Iteration completed in 7.392616271972656
Iteration completed in 8.570434808731079
Iteration completed in 8.682812929153442
Iteration completed in 8.12022590637207
Iteration completed in 8.593220472335815
Iteration completed in 7.5059638023376465
Iteration completed in 7.879255294799805
Iteration completed in 7.8538665771484375
Iteration completed in 7.710444927215576
Iteration completed in 7.36380934715271
Iteration completed in 8.395882368087769
Iteration completed in 8.566935062408447
Iteration completed in 8.504862070083618
Iteration completed in 8.125445127487183
Iteration completed in 8.49448561668396
Iteration completed in 7.3278443813323975
Iteration completed in 7.811573505401611
Iteration complet

In [55]:
pd.Series(scores).describe()

count    33.000000
mean      0.476739
std       0.051200
min       0.265559
25%       0.454892
50%       0.475070
75%       0.507492
max       0.557059
dtype: float64

In [56]:
max(scores), (list(range(0, 29)))[scores.index(max(scores))]

(0.5570588067237451, 10)