In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
# Parse the edge list once into a directed graph.
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)
# H is an independent working copy of G (same nodes and edges) that later cells
# prune; copying avoids reading and parsing the same file a second time.
H = G.copy()

In [3]:
# Number of nodes in the graph as loaded.
G.number_of_nodes()

956

In [4]:
# Collect every node that has a self-loop edge in G and delete it from H.
# `nodes` keeps the removed ids so later cells can drop the matching rows.
nodes = []
for src, dst in G.edges():
    # Only self-loops matter, and only if the node is still present in H.
    if src != dst or not H.has_node(src):
        continue
    nodes.append(src)
    H.remove_node(src)

In [5]:
# Number of nodes left in H after the self-loop nodes were removed.
H.number_of_nodes()

765

In [6]:
# From here on G names the pruned graph (G and H are now the same object).
G = H
# Check that the pruned graph is still weakly connected.
is_connected = nx.is_weakly_connected(G)
is_connected

True

In [7]:
# Compute PageRank on the pruned graph and report the wall-clock time taken.
# `bet` holds the PageRank result; later cells index it by string node id.
i_time = time.time()
bet = nx.pagerank(G)
elapsed = time.time() - i_time
print(f"Calulated scores in: {elapsed}")

Calulated scores in: 0.13361287117004395


In [8]:
# Min-max normalise the PageRank scores and store them in `bet_l`, ordered by
# the integer value of the node id.
bet_l = []
if bet:
    # The extremes are loop-invariant: compute them once instead of rescanning
    # every score for every node.
    lowest = min(bet.values())
    score_range = max(bet.values()) - lowest
    # key=int orders the string ids numerically without an int -> str round trip.
    bet_l = [(bet[node] - lowest) / score_range for node in sorted(bet, key=int)]

# Equal-width intervals of size 0.05

In [9]:
# Assign each normalised score to one of 20 bins of width 0.05: label i is the
# first i in 1..20 with score <= 0.05 * i (label 1 takes everything up to 0.05).
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if bet_v <= 0.05 * i:
            node_labels.append(i)
            break
    else:
        # No bin matched (score above 1.0, or NaN). Dropping the score silently
        # would leave node_labels shorter than bet_l and out of step with the
        # feature rows it is paired with later, so fail here instead.
        raise ValueError(f"score {bet_v!r} does not fall into any 0.05-wide bin")

In [10]:
# How many nodes received each label under the 0.05-wide binning.
label_counts = pd.Series(node_labels).value_counts()
label_counts

1     311
2     179
3      77
4      54
5      44
6      28
7      19
8      14
9      10
10      8
12      5
11      5
14      4
13      4
19      2
20      1
dtype: int64

### DW

In [14]:
# Count the lines of the DW info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [15]:
### Random forest classifier with 70 trees, trained and scored once per DW embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./dw/dw_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.3168001174926758
Iteration completed in 0.3383195400238037
Iteration completed in 0.36801981925964355
Iteration completed in 0.3494434356689453
Iteration completed in 0.30816006660461426
Iteration completed in 0.3287327289581299
Iteration completed in 0.3593127727508545
Iteration completed in 0.3139796257019043
Iteration completed in 0.323289155960083
Iteration completed in 0.33838963508605957
Iteration completed in 0.31974124908447266
Iteration completed in 0.3145172595977783
Iteration completed in 0.3310434818267822
Iteration completed in 0.33362770080566406
Iteration completed in 0.32793402671813965


In [16]:
# Summary statistics of the micro-F1 scores collected over the DW runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    15.000000
mean      0.429194
std       0.063111
min       0.346405
25%       0.392157
50%       0.411765
75%       0.457516
max       0.555556
dtype: float64

### N2V

In [17]:
# Count the lines of the N2V info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [18]:
### Random forest classifier with 70 trees, trained and scored once per N2V embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./n2v/n2v_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.4507613182067871
Iteration completed in 0.4589357376098633
Iteration completed in 0.4955623149871826
Iteration completed in 0.5003542900085449
Iteration completed in 0.43218302726745605
Iteration completed in 0.431018590927124
Iteration completed in 0.4127950668334961
Iteration completed in 0.4180905818939209
Iteration completed in 0.4278433322906494
Iteration completed in 0.41318178176879883
Iteration completed in 0.4250926971435547
Iteration completed in 0.4487278461456299
Iteration completed in 0.41500043869018555
Iteration completed in 0.44696998596191406
Iteration completed in 0.42771029472351074
Iteration completed in 0.4735865592956543
Iteration completed in 0.43718719482421875
Iteration completed in 0.4444589614868164
Iteration completed in 0.4309401512145996
Iteration completed in 0.41256117820739746
Iteration completed in 0.4790666103363037
Iteration completed in 0.44350624084472656
Iteration completed in 0.43155884742736816
Iteration completed in 0.4

In [19]:
# Summary statistics of the micro-F1 scores collected over the N2V runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    24.000000
mean      0.448529
std       0.050950
min       0.359477
25%       0.424837
50%       0.450980
75%       0.480392
max       0.542484
dtype: float64

### MNMF

In [20]:
# Count the lines of the MNMF info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [21]:
### Random forest classifier with 70 trees, trained and scored once per MNMF embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./mnmf/mnmf_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.16991567611694336
Iteration completed in 0.14804863929748535
Iteration completed in 0.19092488288879395
Iteration completed in 0.1719191074371338
Iteration completed in 0.21002745628356934
Iteration completed in 0.17488861083984375
Iteration completed in 0.2671515941619873
Iteration completed in 0.19577288627624512
Iteration completed in 0.3003864288330078
Iteration completed in 0.2559852600097656


In [22]:
# Summary statistics of the micro-F1 scores collected over the MNMF runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    10.000000
mean      0.376471
std       0.053205
min       0.307190
25%       0.343137
50%       0.359477
75%       0.424837
max       0.457516
dtype: float64

# Heterogeneous intervals

In [23]:
# Upper bounds of the bins, in label order:
#   labels 1-10  : ten bins of width 0.005, ending at 0.005 * 10
#   labels 11-15 : five bins of width 0.01, ending at 0.05 + 0.01 * 5
#   label 16     : everything above that, up to 0.2
# Any score matching none of these bounds receives label 17.
upper_bounds = (
    [0.005 * step for step in range(1, 11)]
    + [0.05 + 0.01 * step for step in range(1, 6)]
    + [0.2]
)

node_labels = []
for score in bet_l:
    # Default to the catch-all label; overwritten by the first bound the score fits under.
    label = len(upper_bounds) + 1
    for candidate, bound in enumerate(upper_bounds, start=1):
        if score <= bound:
            label = candidate
            break
    node_labels.append(label)

In [24]:
# How many nodes received each label under the heterogeneous binning.
label_counts = pd.Series(node_labels).value_counts()
label_counts

17    144
16    131
11     58
2      42
1      42
9      35
12     35
5      34
13     32
15     31
7      31
8      29
3      28
6      28
14     23
10     22
4      20
dtype: int64

### DW

In [25]:
# Count the lines of the DW info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [26]:
### Random forest classifier with 70 trees, trained and scored once per DW embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./dw/dw_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.32735514640808105
Iteration completed in 0.33884477615356445
Iteration completed in 0.35144782066345215
Iteration completed in 0.36485886573791504
Iteration completed in 0.368070125579834
Iteration completed in 0.36125874519348145
Iteration completed in 0.383772611618042
Iteration completed in 0.3374803066253662
Iteration completed in 0.3711891174316406
Iteration completed in 0.34773731231689453
Iteration completed in 0.34731388092041016
Iteration completed in 0.3316009044647217
Iteration completed in 0.389754056930542
Iteration completed in 0.3596973419189453
Iteration completed in 0.3729276657104492


In [27]:
# Summary statistics of the micro-F1 scores collected over the DW runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    15.000000
mean      0.261874
std       0.042391
min       0.209150
25%       0.235294
50%       0.254902
75%       0.274510
max       0.359477
dtype: float64

### N2V

In [28]:
# Count the lines of the N2V info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [29]:
### Random forest classifier with 70 trees, trained and scored once per N2V embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./n2v/n2v_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.43714141845703125
Iteration completed in 0.44692015647888184
Iteration completed in 0.47598981857299805
Iteration completed in 0.46168971061706543
Iteration completed in 0.45447659492492676
Iteration completed in 0.44535088539123535
Iteration completed in 0.46672654151916504
Iteration completed in 0.4646036624908447
Iteration completed in 0.4355947971343994
Iteration completed in 0.464827299118042
Iteration completed in 0.4774806499481201
Iteration completed in 0.5011775493621826
Iteration completed in 0.4433934688568115
Iteration completed in 0.47899937629699707
Iteration completed in 0.4632744789123535
Iteration completed in 0.5172789096832275
Iteration completed in 0.44228649139404297
Iteration completed in 0.46521973609924316
Iteration completed in 0.4242088794708252
Iteration completed in 0.45087122917175293
Iteration completed in 0.4467349052429199
Iteration completed in 0.43807005882263184
Iteration completed in 0.47579097747802734
Iteration completed in

In [30]:
# Summary statistics of the micro-F1 scores collected over the N2V runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    24.000000
mean      0.252451
std       0.044237
min       0.189542
25%       0.215686
50%       0.251634
75%       0.274510
max       0.359477
dtype: float64

### MNMF

In [31]:
# Count the lines of the MNMF info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [32]:
### Random forest classifier with 70 trees, trained and scored once per MNMF embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./mnmf/mnmf_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.19571614265441895
Iteration completed in 0.1425168514251709
Iteration completed in 0.21402549743652344
Iteration completed in 0.17266225814819336
Iteration completed in 0.20242524147033691
Iteration completed in 0.18613219261169434
Iteration completed in 0.24089765548706055
Iteration completed in 0.2017061710357666
Iteration completed in 0.3418102264404297
Iteration completed in 0.26704859733581543


In [33]:
# Summary statistics of the micro-F1 scores collected over the MNMF runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    10.000000
mean      0.201307
std       0.026469
min       0.169935
25%       0.184641
50%       0.192810
75%       0.217320
max       0.254902
dtype: float64

# Heterogeneous intervals, part 2

In [34]:
# Label each normalised score with the index of the first bin whose upper bound
# it does not exceed. Bounds start at 0.0001 and are multiplied by 1.5 per step.
node_labels = []
for score in bet_l:
    label, upper_bound = 1, 0.0001
    # Widen the bound until the score fits under it.
    while not (score <= upper_bound):
        label += 1
        upper_bound *= 1.5
    node_labels.append(label)

In [35]:
# How many nodes received each label under the geometric binning.
label_counts = pd.Series(node_labels).value_counts()
label_counts

17    108
18     97
16     91
20     78
19     77
21     64
15     57
22     40
13     31
14     28
12     24
23     18
11     15
9       9
1       7
10      7
8       4
24      3
7       3
6       2
5       1
4       1
dtype: int64

### DW

In [36]:
# Count the lines of the DW info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [37]:
### Random forest classifier with 70 trees, trained and scored once per DW embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./dw/dw_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.36584997177124023
Iteration completed in 0.3731415271759033
Iteration completed in 0.3754146099090576
Iteration completed in 0.36385154724121094
Iteration completed in 0.3338139057159424
Iteration completed in 0.3560631275177002
Iteration completed in 0.3607008457183838
Iteration completed in 0.3416624069213867
Iteration completed in 0.35385870933532715
Iteration completed in 0.35332155227661133
Iteration completed in 0.3452425003051758
Iteration completed in 0.33724546432495117
Iteration completed in 0.35819554328918457
Iteration completed in 0.3450198173522949
Iteration completed in 0.349193811416626


In [38]:
# Summary statistics of the micro-F1 scores collected over the DW runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    15.000000
mean      0.180392
std       0.035867
min       0.104575
25%       0.169935
50%       0.189542
75%       0.202614
max       0.235294
dtype: float64

### N2V

In [39]:
# Count the lines of the N2V info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [40]:
### Random forest classifier with 70 trees, trained and scored once per N2V embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./n2v/n2v_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.4742891788482666
Iteration completed in 0.4867398738861084
Iteration completed in 0.47971224784851074
Iteration completed in 0.536107063293457
Iteration completed in 0.46451258659362793
Iteration completed in 0.47969722747802734
Iteration completed in 0.4712657928466797
Iteration completed in 0.5013747215270996
Iteration completed in 0.4841463565826416
Iteration completed in 0.4869992733001709
Iteration completed in 0.4932670593261719
Iteration completed in 0.5495631694793701
Iteration completed in 0.48188066482543945
Iteration completed in 0.5142886638641357
Iteration completed in 0.48383283615112305
Iteration completed in 0.5235030651092529
Iteration completed in 0.46625304222106934
Iteration completed in 0.47777295112609863
Iteration completed in 0.4510972499847412
Iteration completed in 0.48413753509521484
Iteration completed in 0.4723539352416992
Iteration completed in 0.4845099449157715
Iteration completed in 0.47110724449157715
Iteration completed in 0.4

In [41]:
# Summary statistics of the micro-F1 scores collected over the N2V runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    24.000000
mean      0.176471
std       0.032878
min       0.104575
25%       0.156863
50%       0.169935
75%       0.204248
max       0.228758
dtype: float64

### MNMF

In [42]:
# Count the lines of the MNMF info file that contain "Test"; the result is the
# number of embedding files the next cell iterates over.
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    tests += sum(1 for line in f if "Test" in line)

In [43]:
### Random forest classifier with 70 trees, trained and scored once per MNMF embedding file
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
# Hoisted out of the loop: a set gives constant-time membership tests for the
# row filter below instead of rescanning the `nodes` list for every row.
removed_nodes = set(nodes)
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv(f"./mnmf/mnmf_emb_vectors{i}.csv", header=None, sep=";").values.tolist()
    # Input: every kept embedding vector is sorted in descending order; rows whose
    # 1-based position is the id of a removed node are dropped.
    # NOTE(review): assumes row k of the CSV belongs to node id str(k + 1) -- confirm.
    # NOTE(review): the per-row sort reorders components by value, so a column no
    # longer refers to one embedding dimension across rows -- confirm this is intended.
    X_data = [
        sorted(vector, reverse=True)
        for row, vector in enumerate(data)
        if str(row + 1) not in removed_nodes
    ]

    y_data = node_labels

    # Split the data into training set and test set (20% held out)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Micro-averaged F1 score on the held-out split
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.186173677444458
Iteration completed in 0.20120882987976074
Iteration completed in 0.20772314071655273
Iteration completed in 0.17999577522277832
Iteration completed in 0.20346450805664062
Iteration completed in 0.18568158149719238
Iteration completed in 0.24446558952331543
Iteration completed in 0.2107548713684082
Iteration completed in 0.3337748050689697
Iteration completed in 0.2646608352661133


In [44]:
# Summary statistics of the micro-F1 scores collected over the MNMF runs.
f1_summary = pd.Series(f1_scores).describe()
f1_summary

count    10.000000
mean      0.143137
std       0.040578
min       0.098039
25%       0.112745
50%       0.127451
75%       0.176471
max       0.209150
dtype: float64