In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
# Read the same edge list twice so that G and H are two independent DiGraph
# objects: G stays as loaded while H is modified in the next cell.
edge_file = "./EDGES_FILE.csv"
G = nx.read_edgelist(edge_file, create_using=nx.DiGraph)
H = nx.read_edgelist(edge_file, create_using=nx.DiGraph)

In [3]:
# Number of nodes in the graph as loaded
G.number_of_nodes()

956

In [4]:
# Remove from H every node that has a self-loop edge (x -> x).
# Edges are iterated on G while the removals happen on H.
nodes = []  # ids of the removed nodes; later cells use it to filter embedding rows
for src, dst in G.edges():
    if src != dst or not H.has_node(src):
        continue
    nodes.append(src)
    H.remove_node(src)

In [5]:
# Number of nodes left after dropping the self-loop nodes
H.number_of_nodes()

765

In [6]:
# From here on G refers to the filtered graph (same object as H)
G = H
# Check that the filtered graph is still weakly connected
nx.is_weakly_connected(H)

True

In [7]:
# Time the score computation. NOTE: despite its name, `bet` holds the
# node degrees of G, not betweenness values.
i_time = time.time()
bet = G.degree()
print(f"Calculated scores in: {time.time() - i_time}")

Calculated scores in: 0.0003108978271484375


In [8]:
# Min-max normalise the degree scores into [0, 1].
# The resulting list is ordered by ascending integer node id, so position k
# of `bet_l` belongs to the k-th remaining node in that order.
degree_by_node = dict(bet)  # materialise the degree view once
max_v = max(degree_by_node.values())
min_v = min(degree_by_node.values())
# When every node has the same degree the range is zero; map all scores to
# 0.0 instead of raising ZeroDivisionError.
score_range = max_v - min_v

bet_l = []
# Sort the original node ids by their integer value and look the score up by
# the original id, so ids whose text is not the canonical str(int(id)) still work.
for node in sorted(G.nodes(), key=int):
    bet_norm = (degree_by_node[node] - min_v) / score_range if score_range else 0.0
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [9]:
# Assign each normalised score to one of 20 bins of width 0.05:
# label k is the first k in 1..20 with score <= 0.05 * k.
node_labels = []
for bet_v in bet_l:
    label = 1
    while label <= 20 and not (bet_v <= 0.05 * label):
        label += 1
    # A score that fits none of the 20 bins receives no label
    if label <= 20:
        node_labels.append(label)

In [10]:
# How many nodes fall into each label, most populated label first
label_series = pd.Series(node_labels)
label_series.value_counts()

3     97
2     90
1     88
4     81
6     68
5     65
7     56
8     49
9     46
11    28
10    26
13    19
12    19
14    10
17     7
16     6
15     4
20     3
18     2
19     1
dtype: int64

### DW

In [11]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./dw/dw_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [12]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./dw/dw_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.27218103408813477
Iteration completed in 0.28243541717529297
Iteration completed in 0.29552316665649414
Iteration completed in 0.2915635108947754
Iteration completed in 0.28584885597229004
Iteration completed in 0.28977131843566895
Iteration completed in 0.28878188133239746
Iteration completed in 0.26953911781311035
Iteration completed in 0.299330472946167
Iteration completed in 0.2744557857513428
Iteration completed in 0.2697122097015381
Iteration completed in 0.2557241916656494
Iteration completed in 0.29045844078063965
Iteration completed in 0.27034568786621094
Iteration completed in 0.2603938579559326


In [13]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    15.000000
mean      0.168192
std       0.033552
min       0.098039
25%       0.156863
50%       0.163399
75%       0.179739
max       0.235294
dtype: float64

### N2V

In [14]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./n2v/n2v_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [15]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./n2v/n2v_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.3638577461242676
Iteration completed in 0.3693981170654297
Iteration completed in 0.3589906692504883
Iteration completed in 0.3762199878692627
Iteration completed in 0.332674503326416
Iteration completed in 0.3658132553100586
Iteration completed in 0.4037601947784424
Iteration completed in 0.42047572135925293
Iteration completed in 0.3996269702911377
Iteration completed in 0.39478135108947754
Iteration completed in 0.404498815536499
Iteration completed in 0.4113287925720215
Iteration completed in 0.3621945381164551
Iteration completed in 0.3419218063354492
Iteration completed in 0.4101681709289551
Iteration completed in 0.3806033134460449
Iteration completed in 0.3316948413848877
Iteration completed in 0.3601093292236328
Iteration completed in 0.38886380195617676
Iteration completed in 0.41223692893981934
Iteration completed in 0.3679540157318115
Iteration completed in 0.46564292907714844
Iteration completed in 0.38555288314819336
Iteration completed in 0.41933

In [16]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    24.000000
mean      0.152778
std       0.031717
min       0.084967
25%       0.137255
50%       0.150327
75%       0.169935
max       0.228758
dtype: float64

### MNMF

In [17]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [18]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./mnmf/mnmf_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.1424551010131836
Iteration completed in 0.13967466354370117
Iteration completed in 0.15543103218078613
Iteration completed in 0.15624213218688965
Iteration completed in 0.16641521453857422
Iteration completed in 0.14275074005126953
Iteration completed in 0.20796847343444824
Iteration completed in 0.15464544296264648
Iteration completed in 0.26006555557250977
Iteration completed in 0.18701720237731934


In [19]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    10.000000
mean      0.149020
std       0.033441
min       0.098039
25%       0.119281
50%       0.153595
75%       0.178105
max       0.189542
dtype: float64

# Heterogeneous pt1: intervals of growing width (upper edge starts at 0.0001 and is multiplied by 1.5 per label)

In [31]:
# Assign labels using bins whose upper edge grows geometrically:
# label 1 covers scores up to 0.0001, and every next label raises the
# upper edge by a factor of 1.5.
node_labels = []
for bet_v in bet_l:
    upper_edge, label = 0.0001, 1
    while not (bet_v <= upper_edge):
        label += 1
        upper_edge *= 1.5
    node_labels.append(label)

In [32]:
# How many nodes fall into each label, most populated label first
label_series = pd.Series(node_labels)
label_series.value_counts()

22    140
21    140
20    111
19    102
23     80
18     59
17     33
16     29
15     24
24     19
13      9
14      8
12      3
11      3
9       3
1       2
dtype: int64

### DW

In [33]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./dw/dw_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [34]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./dw/dw_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.2759988307952881
Iteration completed in 0.2521388530731201
Iteration completed in 0.2518603801727295
Iteration completed in 0.2647671699523926
Iteration completed in 0.27860212326049805
Iteration completed in 0.26677775382995605
Iteration completed in 0.263974666595459
Iteration completed in 0.233231782913208
Iteration completed in 0.24206900596618652
Iteration completed in 0.24114704132080078
Iteration completed in 0.23820829391479492
Iteration completed in 0.24171853065490723
Iteration completed in 0.27498316764831543
Iteration completed in 0.26043200492858887
Iteration completed in 0.24531173706054688


In [35]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    15.000000
mean      0.274946
std       0.061336
min       0.169935
25%       0.248366
50%       0.261438
75%       0.310458
max       0.398693
dtype: float64

### N2V

In [36]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./n2v/n2v_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [37]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./n2v/n2v_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.3606724739074707
Iteration completed in 0.34815502166748047
Iteration completed in 0.35843920707702637
Iteration completed in 0.3523406982421875
Iteration completed in 0.335463285446167
Iteration completed in 0.38082265853881836
Iteration completed in 0.33692002296447754
Iteration completed in 0.38318347930908203
Iteration completed in 0.3617289066314697
Iteration completed in 0.34182143211364746
Iteration completed in 0.39231371879577637
Iteration completed in 0.3581521511077881
Iteration completed in 0.37534523010253906
Iteration completed in 0.3332240581512451
Iteration completed in 0.38286805152893066
Iteration completed in 0.37351393699645996
Iteration completed in 0.3610837459564209
Iteration completed in 0.3357861042022705
Iteration completed in 0.34099388122558594
Iteration completed in 0.3998734951019287
Iteration completed in 0.3372349739074707
Iteration completed in 0.3649473190307617
Iteration completed in 0.3724966049194336
Iteration completed in 0

In [38]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    24.000000
mean      0.197168
std       0.031473
min       0.137255
25%       0.176471
50%       0.189542
75%       0.215686
max       0.261438
dtype: float64

### MNMF

In [39]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [40]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./mnmf/mnmf_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.12952136993408203
Iteration completed in 0.18976616859436035
Iteration completed in 0.175001859664917
Iteration completed in 0.13951611518859863
Iteration completed in 0.15056800842285156
Iteration completed in 0.1333000659942627
Iteration completed in 0.17756962776184082
Iteration completed in 0.16294336318969727
Iteration completed in 0.24888944625854492
Iteration completed in 0.20513200759887695


In [41]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    10.000000
mean      0.198039
std       0.030818
min       0.143791
25%       0.178105
50%       0.196078
75%       0.225490
max       0.235294
dtype: float64

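# Heterogeneous pt2: hand-picked intervals (0.005 steps up to 0.05, 0.01 steps up to 0.10, then up to 0.2, then everything above)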

In [20]:
# Upper edges of the hand-picked bins, in ascending order:
#   labels 1-10  -> edges 0.005 * k          (k = 1..10)
#   labels 11-15 -> edges 0.05 + 0.01 * k    (k = 1..5)
#   label  16    -> edge 0.2
# A score above every edge receives label 17.
bin_edges = [0.005 * step for step in range(1, 11)]
bin_edges += [0.05 + 0.01 * step for step in range(1, 6)]
bin_edges.append(0.2)

node_labels = []
for bet_v in bet_l:
    # Label = 1-based position of the first edge that is >= the score
    label = next(
        (pos for pos, edge in enumerate(bin_edges, start=1) if bet_v <= edge),
        17,
    )
    node_labels.append(label)

In [21]:
# How many nodes fall into each label, most populated label first
label_series = pd.Series(node_labels)
label_series.value_counts()

17    409
16    178
14     27
13     20
11     17
5      17
12     13
15     13
3      12
8      12
7      11
9       8
1       8
6       7
10      5
4       5
2       3
dtype: int64

### DW

In [22]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./dw/dw_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [23]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./dw/dw_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.22881293296813965
Iteration completed in 0.25525665283203125
Iteration completed in 0.27629613876342773
Iteration completed in 0.2995433807373047
Iteration completed in 0.3013458251953125
Iteration completed in 0.2835540771484375
Iteration completed in 0.31252002716064453
Iteration completed in 0.26709508895874023
Iteration completed in 0.2981255054473877
Iteration completed in 0.3019700050354004
Iteration completed in 0.27458763122558594
Iteration completed in 0.27342820167541504
Iteration completed in 0.2912571430206299
Iteration completed in 0.2631535530090332
Iteration completed in 0.2405860424041748


In [24]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    15.000000
mean      0.578214
std       0.040507
min       0.503268
25%       0.555556
50%       0.568627
75%       0.601307
max       0.647059
dtype: float64

### N2V

In [25]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./n2v/n2v_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [26]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./n2v/n2v_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.3538782596588135
Iteration completed in 0.39590930938720703
Iteration completed in 0.36240482330322266
Iteration completed in 0.3922398090362549
Iteration completed in 0.3437991142272949
Iteration completed in 0.3753204345703125
Iteration completed in 0.3381223678588867
Iteration completed in 0.3569834232330322
Iteration completed in 0.3650987148284912
Iteration completed in 0.3537936210632324
Iteration completed in 0.3769829273223877
Iteration completed in 0.38631772994995117
Iteration completed in 0.33023762702941895
Iteration completed in 0.3548152446746826
Iteration completed in 0.382030725479126
Iteration completed in 0.453432559967041
Iteration completed in 0.3271634578704834
Iteration completed in 0.3429272174835205
Iteration completed in 0.3784000873565674
Iteration completed in 0.3712177276611328
Iteration completed in 0.3274836540222168
Iteration completed in 0.3866567611694336
Iteration completed in 0.357851505279541
Iteration completed in 0.40704178

In [27]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    24.000000
mean      0.531318
std       0.030197
min       0.477124
25%       0.508170
50%       0.535948
75%       0.555556
max       0.581699
dtype: float64

### MNMF

In [28]:
# Number of embedding runs = number of lines in the info file containing "Test"
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as info_file:
    tests += sum("Test" in entry for entry in info_file)

In [29]:
### Random forest classifier with 70 trees, re-fitted for every embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    emb_file = "./mnmf/mnmf_emb_vectors" + str(i) + ".csv"
    data = pd.read_csv(emb_file, header=None, sep=";").values.tolist()

    # Features: every embedding row sorted in descending order. Rows whose
    # 1-based position matches an id stored in `nodes` are left out.
    X_data = [
        sorted(row, reverse=True)
        for pos, row in enumerate(data, start=1)
        if str(pos) not in nodes
    ]
    # Targets: the bin labels computed from the normalised scores
    y_data = node_labels

    # Hold out 20% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out samples
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.12461280822753906
Iteration completed in 0.11029720306396484
Iteration completed in 0.14496374130249023
Iteration completed in 0.13577771186828613
Iteration completed in 0.16650390625
Iteration completed in 0.13886308670043945
Iteration completed in 0.17147469520568848
Iteration completed in 0.15219974517822266
Iteration completed in 0.23258614540100098
Iteration completed in 0.17820501327514648


In [30]:
# Summary statistics of the micro-F1 scores over all runs
f1_series = pd.Series(f1_scores)
f1_series.describe()

count    10.000000
mean      0.547712
std       0.047312
min       0.483660
25%       0.517974
50%       0.542484
75%       0.566993
max       0.633987
dtype: float64