In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
nodes = []
l = list(nx.selfloop_edges(G))
for x, _ in l:
    if(G.degree(x) == 2): 
        G.remove_node(x)
        nodes.append(x)

In [5]:
len(G.nodes())

778

In [6]:
i_time = time.time()
bet = G.in_degree()
print(f"Calculated degrees in: {time.time() - i_time}")

Calculated degrees in: 7.677078247070312e-05


In [7]:
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in sorted(list(map(lambda x: int(x), list(G.nodes())))):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [8]:
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [9]:
pd.Series(node_labels).value_counts()

1     174
2     164
3     150
4      88
5      43
6      39
7      30
8      30
9      14
10     12
11      9
12      7
14      6
13      4
15      3
20      2
17      1
16      1
19      1
dtype: int64

### DW

In [10]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [11]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.299271821975708
Iteration completed in 0.3081319332122803
Iteration completed in 0.335813045501709
Iteration completed in 0.3376047611236572
Iteration completed in 0.3282322883605957
Iteration completed in 0.3439958095550537
Iteration completed in 0.328216552734375
Iteration completed in 0.30904364585876465
Iteration completed in 0.3335874080657959
Iteration completed in 0.327103853225708
Iteration completed in 0.3083534240722656
Iteration completed in 0.30677270889282227
Iteration completed in 0.3290553092956543
Iteration completed in 0.31028223037719727
Iteration completed in 0.3104691505432129


In [12]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.296154
std       0.066758
min       0.185897
25%       0.272436
50%       0.288462
75%       0.317308
max       0.455128
dtype: float64

### N2V

In [13]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.4137876033782959
Iteration completed in 0.4276108741760254
Iteration completed in 0.4390530586242676
Iteration completed in 0.4363119602203369
Iteration completed in 0.4129981994628906
Iteration completed in 0.4849510192871094
Iteration completed in 0.48203587532043457
Iteration completed in 0.47290873527526855
Iteration completed in 0.4900834560394287
Iteration completed in 0.45853209495544434
Iteration completed in 0.4732859134674072
Iteration completed in 0.5066800117492676
Iteration completed in 0.449962854385376
Iteration completed in 0.4869508743286133
Iteration completed in 0.45745062828063965
Iteration completed in 0.4613313674926758
Iteration completed in 0.4376678466796875
Iteration completed in 0.447357177734375
Iteration completed in 0.4313783645629883
Iteration completed in 0.4469592571258545
Iteration completed in 0.4098834991455078
Iteration completed in 0.4679415225982666
Iteration completed in 0.5456297397613525
Iteration completed in 0.4538569

In [15]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.311699
std       0.044283
min       0.243590
25%       0.286859
50%       0.301282
75%       0.330128
max       0.403846
dtype: float64

### MNMF

In [16]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [17]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.16550374031066895
Iteration completed in 0.15032577514648438
Iteration completed in 0.274747371673584
Iteration completed in 0.17885255813598633
Iteration completed in 0.19415283203125
Iteration completed in 0.2506673336029053
Iteration completed in 0.25342535972595215
Iteration completed in 0.1932356357574463
Iteration completed in 0.30477023124694824
Iteration completed in 0.2672114372253418


In [18]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.256410
std       0.037133
min       0.211538
25%       0.224359
50%       0.253205
75%       0.286859
max       0.320513
dtype: float64

# Heterogeneous intervals

In [19]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [20]:
pd.Series(node_labels).value_counts()

16    238
17    202
14     38
12     36
13     34
11     30
15     26
1      25
6      22
5      21
2      20
3      18
9      17
10     15
8      14
7      14
4       8
dtype: int64

### DW

In [21]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [22]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.31252360343933105
Iteration completed in 0.32099127769470215
Iteration completed in 0.3240213394165039
Iteration completed in 0.36571502685546875
Iteration completed in 0.3666250705718994
Iteration completed in 0.36431121826171875
Iteration completed in 0.34111881256103516
Iteration completed in 0.3543996810913086
Iteration completed in 0.33230113983154297
Iteration completed in 0.3613293170928955
Iteration completed in 0.39374470710754395
Iteration completed in 0.3287787437438965
Iteration completed in 0.3610525131225586
Iteration completed in 0.35608553886413574
Iteration completed in 0.3490777015686035


In [23]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.407265
std       0.045448
min       0.346154
25%       0.381410
50%       0.397436
75%       0.442308
max       0.506410
dtype: float64

### n2v

In [24]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [25]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.4228665828704834
Iteration completed in 0.42139267921447754
Iteration completed in 0.45281553268432617
Iteration completed in 0.4978973865509033
Iteration completed in 0.4205920696258545
Iteration completed in 0.4498732089996338
Iteration completed in 0.42426466941833496
Iteration completed in 0.4533689022064209
Iteration completed in 0.39727282524108887
Iteration completed in 0.4074556827545166
Iteration completed in 0.45449113845825195
Iteration completed in 0.45210766792297363
Iteration completed in 0.43412351608276367
Iteration completed in 0.5476129055023193
Iteration completed in 0.4880185127258301
Iteration completed in 0.4558396339416504
Iteration completed in 0.40175414085388184
Iteration completed in 0.4328458309173584
Iteration completed in 0.4191281795501709
Iteration completed in 0.4338982105255127
Iteration completed in 0.3952598571777344
Iteration completed in 0.44637203216552734
Iteration completed in 0.40953683853149414
Iteration completed in 0

In [26]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.351763
std       0.038290
min       0.301282
25%       0.323718
50%       0.346154
75%       0.373397
max       0.423077
dtype: float64

### MNMF

In [27]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [28]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.14519023895263672
Iteration completed in 0.12642455101013184
Iteration completed in 0.17573833465576172
Iteration completed in 0.15310192108154297
Iteration completed in 0.17406058311462402
Iteration completed in 0.15443634986877441
Iteration completed in 0.20692801475524902
Iteration completed in 0.16611409187316895
Iteration completed in 0.2604343891143799
Iteration completed in 0.20990395545959473


In [29]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.333974
std       0.033026
min       0.282051
25%       0.307692
50%       0.339744
75%       0.355769
max       0.384615
dtype: float64

# Heterogeneous pt2

In [30]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [31]:
pd.Series(node_labels).value_counts()

19    142
20    124
18    118
21     80
17     65
22     62
16     54
15     32
23     29
13     15
14     14
12     13
11     12
1       7
9       6
24      5
dtype: int64

### DW

In [32]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [33]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.2849886417388916
Iteration completed in 0.2868366241455078
Iteration completed in 0.30054140090942383
Iteration completed in 0.2961709499359131
Iteration completed in 0.2938823699951172
Iteration completed in 0.2963268756866455
Iteration completed in 0.29787278175354004
Iteration completed in 0.2833685874938965
Iteration completed in 0.29441356658935547
Iteration completed in 0.29135990142822266
Iteration completed in 0.28586864471435547
Iteration completed in 0.27988147735595703
Iteration completed in 0.29894351959228516
Iteration completed in 0.2904801368713379
Iteration completed in 0.2849128246307373


In [34]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.229915
std       0.048023
min       0.160256
25%       0.198718
50%       0.217949
75%       0.246795
max       0.326923
dtype: float64

### n2v

In [35]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [36]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.3736896514892578
Iteration completed in 0.3753538131713867
Iteration completed in 0.39009952545166016
Iteration completed in 0.40202879905700684
Iteration completed in 0.35700535774230957
Iteration completed in 0.3771805763244629
Iteration completed in 0.3899869918823242
Iteration completed in 0.3930702209472656
Iteration completed in 0.36548900604248047
Iteration completed in 0.3800697326660156
Iteration completed in 0.3815004825592041
Iteration completed in 0.42017459869384766
Iteration completed in 0.3629329204559326
Iteration completed in 0.4016141891479492
Iteration completed in 0.3994274139404297
Iteration completed in 0.40811848640441895
Iteration completed in 0.3684062957763672
Iteration completed in 0.3872945308685303
Iteration completed in 0.3701956272125244
Iteration completed in 0.3710932731628418
Iteration completed in 0.3557734489440918
Iteration completed in 0.3712937831878662
Iteration completed in 0.38619542121887207
Iteration completed in 0.39

In [37]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.202457
std       0.033170
min       0.141026
25%       0.173077
50%       0.208333
75%       0.225962
max       0.256410
dtype: float64

### MNMF

In [38]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [39]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.14148449897766113
Iteration completed in 0.12594985961914062
Iteration completed in 0.16599416732788086
Iteration completed in 0.16077661514282227
Iteration completed in 0.17867159843444824
Iteration completed in 0.1594088077545166
Iteration completed in 0.20627856254577637
Iteration completed in 0.16901683807373047
Iteration completed in 0.26548266410827637
Iteration completed in 0.2748727798461914


In [40]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.178205
std       0.036237
min       0.115385
25%       0.157051
50%       0.176282
75%       0.200321
max       0.243590
dtype: float64