In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)
H = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
nodes = []
for (x, y) in G.edges():
    if(x == y):
        if(H.has_node(x)):
            nodes.append(x)
            H.remove_node(x)

In [5]:
len(list(H.nodes()))

765

In [6]:
G = H
nx.is_weakly_connected(G)

True

In [7]:
i_time = time.time()
bet = G.in_degree()
print(f"Calculated scores in: {time.time() - i_time}")

Calculated scores in: 0.0001354217529296875


In [8]:
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in sorted(list(map(lambda x: int(x), list(G.nodes())))):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [10]:
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [11]:
pd.Series(node_labels).value_counts()

1     165
2     161
3     149
4      88
5      43
6      39
7      30
8      30
9      14
10     12
11      9
12      7
14      6
13      4
15      3
20      2
17      1
16      1
19      1
dtype: int64

### DW

In [12]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.24654316902160645
Iteration completed in 0.2576112747192383
Iteration completed in 0.272505521774292
Iteration completed in 0.27223753929138184
Iteration completed in 0.2602365016937256
Iteration completed in 0.27582216262817383
Iteration completed in 0.29796457290649414
Iteration completed in 0.28223657608032227
Iteration completed in 0.2653346061706543
Iteration completed in 0.2695770263671875
Iteration completed in 0.2530698776245117
Iteration completed in 0.2563817501068115
Iteration completed in 0.2936265468597412
Iteration completed in 0.26734161376953125
Iteration completed in 0.24229836463928223


In [14]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.289760
std       0.045996
min       0.215686
25%       0.261438
50%       0.287582
75%       0.307190
max       0.379085
dtype: float64

### N2V

In [15]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [16]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.35352301597595215
Iteration completed in 0.3415968418121338
Iteration completed in 0.33475804328918457
Iteration completed in 0.35523080825805664
Iteration completed in 0.3221273422241211
Iteration completed in 0.3366587162017822
Iteration completed in 0.3324713706970215
Iteration completed in 0.3344080448150635
Iteration completed in 0.3212916851043701
Iteration completed in 0.33933377265930176
Iteration completed in 0.339296817779541
Iteration completed in 0.3845827579498291
Iteration completed in 0.3169858455657959
Iteration completed in 0.3446824550628662
Iteration completed in 0.3399641513824463
Iteration completed in 0.3594377040863037
Iteration completed in 0.3243885040283203
Iteration completed in 0.3229367733001709
Iteration completed in 0.3246495723724365
Iteration completed in 0.32070112228393555
Iteration completed in 0.3165433406829834
Iteration completed in 0.3204953670501709
Iteration completed in 0.34110140800476074
Iteration completed in 0.3583

In [17]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.332789
std       0.055490
min       0.241830
25%       0.285948
50%       0.333333
75%       0.380719
max       0.431373
dtype: float64

### MNMF

In [18]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [19]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.13788318634033203
Iteration completed in 0.11122369766235352
Iteration completed in 0.1442875862121582
Iteration completed in 0.12641429901123047
Iteration completed in 0.17005062103271484
Iteration completed in 0.1298518180847168
Iteration completed in 0.1915268898010254
Iteration completed in 0.1527571678161621
Iteration completed in 0.22914433479309082
Iteration completed in 0.19404339790344238


In [20]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.232026
std       0.025078
min       0.196078
25%       0.217320
50%       0.228758
75%       0.246732
max       0.274510
dtype: float64

# Heterogeneous 1

In [32]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [33]:
pd.Series(node_labels).value_counts()

19    141
20    124
18    115
21     80
17     65
22     62
16     51
15     31
23     29
13     13
14     13
12     12
11     11
1       7
9       6
24      5
dtype: int64

### DW

In [34]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [35]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.25851869583129883
Iteration completed in 0.2527792453765869
Iteration completed in 0.27379393577575684
Iteration completed in 0.2560579776763916
Iteration completed in 0.27258801460266113
Iteration completed in 0.2567927837371826
Iteration completed in 0.27837300300598145
Iteration completed in 0.258469820022583
Iteration completed in 0.25373268127441406
Iteration completed in 0.24547505378723145
Iteration completed in 0.2426302433013916
Iteration completed in 0.2691490650177002
Iteration completed in 0.278214693069458
Iteration completed in 0.27318429946899414
Iteration completed in 0.26003456115722656


In [36]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.237037
std       0.057854
min       0.150327
25%       0.209150
50%       0.222222
75%       0.284314
max       0.346405
dtype: float64

### n2v

In [37]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [38]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.32639288902282715
Iteration completed in 0.33856964111328125
Iteration completed in 0.3426971435546875
Iteration completed in 0.35269665718078613
Iteration completed in 0.31516098976135254
Iteration completed in 0.3475947380065918
Iteration completed in 0.3294539451599121
Iteration completed in 0.3276703357696533
Iteration completed in 0.3211860656738281
Iteration completed in 0.34106016159057617
Iteration completed in 0.33518481254577637
Iteration completed in 0.3564465045928955
Iteration completed in 0.31786060333251953
Iteration completed in 0.33294200897216797
Iteration completed in 0.33422112464904785
Iteration completed in 0.3557465076446533
Iteration completed in 0.30820322036743164
Iteration completed in 0.3455936908721924
Iteration completed in 0.3370976448059082
Iteration completed in 0.3516409397125244
Iteration completed in 0.35005712509155273
Iteration completed in 0.3590831756591797
Iteration completed in 0.3685469627380371
Iteration completed in 

In [39]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.214597
std       0.039624
min       0.137255
25%       0.189542
50%       0.215686
75%       0.248366
max       0.287582
dtype: float64

### MNMF

In [40]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [41]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.15505003929138184
Iteration completed in 0.10817122459411621
Iteration completed in 0.13643169403076172
Iteration completed in 0.1370072364807129
Iteration completed in 0.16063690185546875
Iteration completed in 0.12598323822021484
Iteration completed in 0.17140674591064453
Iteration completed in 0.16729497909545898
Iteration completed in 0.2183825969696045
Iteration completed in 0.174393892288208


In [42]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.184314
std       0.014052
min       0.150327
25%       0.183007
50%       0.186275
75%       0.189542
max       0.202614
dtype: float64

# Heterogeneous intervals 2

In [21]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [22]:
pd.Series(node_labels).value_counts()

16    237
17    202
14     38
12     35
13     32
11     30
15     26
1      24
6      21
5      20
2      18
9      17
3      16
10     15
8      13
7      13
4       8
dtype: int64

### DW

In [23]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.2525479793548584
Iteration completed in 0.24192595481872559
Iteration completed in 0.2614917755126953
Iteration completed in 0.2535409927368164
Iteration completed in 0.2695155143737793
Iteration completed in 0.27206850051879883
Iteration completed in 0.2527017593383789
Iteration completed in 0.25714707374572754
Iteration completed in 0.3582453727722168
Iteration completed in 0.2435436248779297
Iteration completed in 0.268873929977417
Iteration completed in 0.2517094612121582
Iteration completed in 0.27442383766174316
Iteration completed in 0.2716035842895508
Iteration completed in 0.2529287338256836


In [25]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.409150
std       0.045471
min       0.346405
25%       0.366013
50%       0.405229
75%       0.444444
max       0.490196
dtype: float64

### n2v

In [26]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [27]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.32214832305908203
Iteration completed in 0.3179197311401367
Iteration completed in 0.33422303199768066
Iteration completed in 0.3534564971923828
Iteration completed in 0.3338582515716553
Iteration completed in 0.38266777992248535
Iteration completed in 0.3479161262512207
Iteration completed in 0.3660857677459717
Iteration completed in 0.3128979206085205
Iteration completed in 0.3591783046722412
Iteration completed in 0.3278532028198242
Iteration completed in 0.3395109176635742
Iteration completed in 0.31576085090637207
Iteration completed in 0.3453257083892822
Iteration completed in 0.3381476402282715
Iteration completed in 0.35620737075805664
Iteration completed in 0.3108043670654297
Iteration completed in 0.3351771831512451
Iteration completed in 0.3154411315917969
Iteration completed in 0.3131139278411865
Iteration completed in 0.3453695774078369
Iteration completed in 0.34130167961120605
Iteration completed in 0.3306305408477783
Iteration completed in 0.363

In [28]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.361111
std       0.049823
min       0.281046
25%       0.336601
50%       0.352941
75%       0.393791
max       0.450980
dtype: float64

### MNMF

In [29]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [30]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.1311168670654297
Iteration completed in 0.11013054847717285
Iteration completed in 0.1433265209197998
Iteration completed in 0.13079261779785156
Iteration completed in 0.1483612060546875
Iteration completed in 0.1319727897644043
Iteration completed in 0.19405341148376465
Iteration completed in 0.18991827964782715
Iteration completed in 0.22824597358703613
Iteration completed in 0.17713475227355957


In [31]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.333987
std       0.032672
min       0.287582
25%       0.315359
50%       0.326797
75%       0.362745
max       0.385621
dtype: float64