In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph)

In [3]:
len(list(G.nodes()))

956

In [4]:
nodes = []
l = list(nx.selfloop_edges(G))
for x, _ in l:
    if(G.degree(x) == 2): 
        G.remove_node(x)
        nodes.append(x)

In [5]:
len(list(G.nodes()))

778

In [6]:
i_time = time.time()
bet = nx.pagerank(G)
print(f"Calulated scores in: {time.time() - i_time}")

Calulated scores in: 0.9073235988616943


In [7]:
bet_l = []
for i in sorted(list(map(lambda x: int(x), bet.keys()))):
    bet_norm = (bet[str(i)] - min(bet.values())) / (max(bet.values()) - min(bet.values())) 
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [8]:
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [9]:
pd.Series(node_labels).value_counts()

1     311
2     179
3      82
4      57
5      46
6      27
7      20
8      16
9      10
10      8
12      5
11      5
14      4
13      4
19      2
15      1
20      1
dtype: int64

### DW

In [10]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [11]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.3543095588684082
Iteration completed in 0.38189268112182617
Iteration completed in 0.40067601203918457
Iteration completed in 0.39165449142456055
Iteration completed in 0.3582024574279785
Iteration completed in 0.3560941219329834
Iteration completed in 0.35494375228881836
Iteration completed in 0.387345552444458
Iteration completed in 0.4285311698913574
Iteration completed in 0.36562228202819824
Iteration completed in 0.3504488468170166
Iteration completed in 0.3730461597442627
Iteration completed in 0.41281747817993164
Iteration completed in 0.34911060333251953
Iteration completed in 0.36880946159362793


In [12]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.415385
std       0.069684
min       0.333333
25%       0.355769
50%       0.410256
75%       0.442308
max       0.570513
dtype: float64

### N2V

In [13]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.44329214096069336
Iteration completed in 0.4456775188446045
Iteration completed in 0.4383671283721924
Iteration completed in 0.45079565048217773
Iteration completed in 0.48334622383117676
Iteration completed in 0.4698023796081543
Iteration completed in 0.42504310607910156
Iteration completed in 0.44639134407043457
Iteration completed in 0.42966771125793457
Iteration completed in 0.4371674060821533
Iteration completed in 0.44211912155151367
Iteration completed in 0.5249383449554443
Iteration completed in 0.44618678092956543
Iteration completed in 0.4815387725830078
Iteration completed in 0.44428205490112305
Iteration completed in 0.4655330181121826
Iteration completed in 0.5114521980285645
Iteration completed in 0.4480862617492676
Iteration completed in 0.419126033782959
Iteration completed in 0.4262208938598633
Iteration completed in 0.46944212913513184
Iteration completed in 0.4470548629760742
Iteration completed in 0.5006203651428223
Iteration completed in 0.

In [15]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.442041
std       0.055994
min       0.307692
25%       0.421474
50%       0.435897
75%       0.477564
max       0.557692
dtype: float64

### MNMF

In [16]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [17]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.18442296981811523
Iteration completed in 0.1443800926208496
Iteration completed in 0.2022252082824707
Iteration completed in 0.1674809455871582
Iteration completed in 0.20355701446533203
Iteration completed in 0.1739799976348877
Iteration completed in 0.26006460189819336
Iteration completed in 0.19372105598449707
Iteration completed in 0.2930772304534912
Iteration completed in 0.2545437812805176


In [18]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.372436
std       0.044355
min       0.294872
25%       0.341346
50%       0.371795
75%       0.399038
max       0.442308
dtype: float64

# Heterogeneous intervals

In [19]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [20]:
pd.Series(node_labels).value_counts()

17    149
16    139
11     58
2      42
1      42
12     36
9      35
5      35
7      34
13     32
3      30
15     30
6      26
8      26
14     23
10     22
4      19
dtype: int64

### DW

In [21]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [22]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.36212873458862305
Iteration completed in 0.3689277172088623
Iteration completed in 0.4069633483886719
Iteration completed in 0.415912389755249
Iteration completed in 0.40134572982788086
Iteration completed in 0.3751096725463867
Iteration completed in 0.40749692916870117
Iteration completed in 0.3890421390533447
Iteration completed in 0.3742647171020508
Iteration completed in 0.39904189109802246
Iteration completed in 0.394960880279541
Iteration completed in 0.3772733211517334
Iteration completed in 0.41410040855407715
Iteration completed in 0.4192328453063965
Iteration completed in 0.4034121036529541


In [23]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.250000
std       0.043476
min       0.192308
25%       0.208333
50%       0.250000
75%       0.285256
max       0.346154
dtype: float64

### n2v

In [24]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [25]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.45911598205566406
Iteration completed in 0.5042409896850586
Iteration completed in 0.6048874855041504
Iteration completed in 0.5467097759246826
Iteration completed in 0.46938538551330566
Iteration completed in 0.5464363098144531
Iteration completed in 0.5016546249389648
Iteration completed in 0.595038652420044
Iteration completed in 0.509493350982666
Iteration completed in 0.47056007385253906
Iteration completed in 0.4753129482269287
Iteration completed in 0.5772325992584229
Iteration completed in 0.4831860065460205
Iteration completed in 0.5410900115966797
Iteration completed in 0.5261683464050293
Iteration completed in 0.5559730529785156
Iteration completed in 0.461575984954834
Iteration completed in 0.5565006732940674
Iteration completed in 0.48111486434936523
Iteration completed in 0.48660826683044434
Iteration completed in 0.49615979194641113
Iteration completed in 0.5012013912200928
Iteration completed in 0.49604272842407227
Iteration completed in 0.41383

In [26]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.258013
std       0.032645
min       0.192308
25%       0.240385
50%       0.262821
75%       0.282051
max       0.314103
dtype: float64

### MNMF

In [27]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [28]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.1792755126953125
Iteration completed in 0.12824773788452148
Iteration completed in 0.20001983642578125
Iteration completed in 0.15227055549621582
Iteration completed in 0.2061614990234375
Iteration completed in 0.1539909839630127
Iteration completed in 0.22612619400024414
Iteration completed in 0.1838381290435791
Iteration completed in 0.32134532928466797
Iteration completed in 0.22861623764038086


In [29]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.215385
std       0.037034
min       0.166667
25%       0.184295
50%       0.211538
75%       0.246795
max       0.275641
dtype: float64

# Heterogeneous pt2

In [30]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [31]:
pd.Series(node_labels).value_counts()

17    107
18    100
16     90
20     84
19     79
21     64
15     59
22     41
13     31
14     28
12     24
23     19
11     14
9       9
10      8
1       7
8       4
24      3
7       3
6       2
5       1
4       1
dtype: int64

### DW

In [32]:
tests = 0
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [33]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.34056854248046875
Iteration completed in 0.33775758743286133
Iteration completed in 0.3386092185974121
Iteration completed in 0.38820576667785645
Iteration completed in 0.3239400386810303
Iteration completed in 0.360095739364624
Iteration completed in 0.38738155364990234
Iteration completed in 0.3567829132080078
Iteration completed in 0.3655729293823242
Iteration completed in 0.35634541511535645
Iteration completed in 0.31862378120422363
Iteration completed in 0.31715917587280273
Iteration completed in 0.3492605686187744
Iteration completed in 0.3495168685913086
Iteration completed in 0.3321115970611572


In [34]:
pd.Series(f1_scores).describe()

count    15.000000
mean      0.195299
std       0.043264
min       0.115385
25%       0.169872
50%       0.198718
75%       0.217949
max       0.262821
dtype: float64

### n2v

In [35]:
tests = 0
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [36]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.4037504196166992
Iteration completed in 0.4237029552459717
Iteration completed in 0.48894810676574707
Iteration completed in 0.46711087226867676
Iteration completed in 0.400134801864624
Iteration completed in 0.4282186031341553
Iteration completed in 0.46228766441345215
Iteration completed in 0.45159173011779785
Iteration completed in 0.4017024040222168
Iteration completed in 0.4266784191131592
Iteration completed in 0.4306297302246094
Iteration completed in 0.49134349822998047
Iteration completed in 0.4301762580871582
Iteration completed in 0.47010254859924316
Iteration completed in 0.45470428466796875
Iteration completed in 0.4846374988555908
Iteration completed in 0.42327260971069336
Iteration completed in 0.4246492385864258
Iteration completed in 0.42391419410705566
Iteration completed in 0.4178769588470459
Iteration completed in 0.43336057662963867
Iteration completed in 0.4458785057067871
Iteration completed in 0.42458295822143555
Iteration completed in 0

In [37]:
pd.Series(f1_scores).describe()

count    24.000000
mean      0.186165
std       0.034828
min       0.115385
25%       0.166667
50%       0.192308
75%       0.211538
max       0.243590
dtype: float64

### MNMF

In [38]:
tests = 0
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line: tests += 1 

In [39]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
for i in range(tests):
    start_time = time.time()
    data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Input 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    
    X_data = [X_data[i] for i in range(len(X_data)) if str(i + 1) not in nodes]
    
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 0.1682109832763672
Iteration completed in 0.1261141300201416
Iteration completed in 0.27951693534851074
Iteration completed in 0.16361117362976074
Iteration completed in 0.1800229549407959
Iteration completed in 0.21317648887634277
Iteration completed in 0.2538766860961914
Iteration completed in 0.20438814163208008
Iteration completed in 0.28815770149230957
Iteration completed in 0.21579289436340332


In [40]:
pd.Series(f1_scores).describe()

count    10.000000
mean      0.180128
std       0.031031
min       0.153846
25%       0.161859
50%       0.166667
75%       0.187500
max       0.256410
dtype: float64