In [49]:
# Requirements
import pandas as pd
import numpy as np
import networkx as nx

import pickle
import os

from node2vec import Node2Vec
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import networkx as nx

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from xgboost import XGBClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import random
# Seeds only Python's built-in `random` module; no NumPy or estimator seed is set in this cell.
# NOTE(review): confirm the helper modules seed their own random state if reproducible runs are needed.
random.seed(42)

import warnings
# Installs a global 'ignore' filter, so warnings raised by later cells are not shown.
warnings.filterwarnings('ignore')

In [50]:
import sys
# Extend the module search path so the project helper modules can be imported.
# NOTE(review): '/src' is an absolute path -- confirm it exists on the machine running the notebook.
sys.path.append('/src')

# Star imports: the helpers called in later cells (df_node2vec, logistic_regresion,
# random_forest) are not imported explicitly anywhere in this notebook, so they
# presumably come from these two modules -- verify and prefer explicit imports.
from node2vecfunc import *
from mlmethodsligh import *

In [51]:
# Include the label of the addresses (fraudulent).
# Context managers close both files; the previous bare open() calls left the
# handles open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load these files if they come from a trusted source.
with open("../../bse_clovrlabs_btc_fraud/data/output_labels.txt", 'rb') as labels_file:
    output_labels = pickle.load(labels_file)
with open("../../bse_clovrlabs_btc_fraud/data/input_labels.txt", 'rb') as labels_file:
    input_labels = pickle.load(labels_file)

# De-duplicated union of the two label collections.
fraudulent_nodes = list(set(list(output_labels) + list(input_labels)))

***Sampling method of randomly selected nodes***

In [52]:
def _csv_files(folder):
    """Return `folder + name` for every entry of `folder` whose name ends with 'csv'."""
    return [folder + name for name in os.listdir(folder) if name.endswith('csv')]


# One list of CSV paths per sampling-method output folder
# (order is whatever os.listdir returns).
rn = _csv_files('outputs/random_nodes/')
mm = _csv_files('outputs/mixed_method/')
rw = _csv_files('outputs/random_walk/')
m2 = _csv_files('outputs/m2/')
nm = _csv_files('outputs/neighbors_method/')
db = _csv_files('outputs/biased_degree/')

***Compute the performance using the random nodes (CS) method with node2vec***

In [36]:
all_results = []  # one metrics frame per sampled graph

# Create the output folders up front: without them to_csv fails only after the
# embeddings and the models have already been computed.
os.makedirs('outputs/results', exist_ok=True)

for file in rn:
    print('Doing:  ', file, '...')
    df = pd.read_csv(file)
    df_n2v = df_node2vec(df, fraudulent_nodes)

    # Store the frame returned by df_node2vec under the same sub-path as its source graph.
    # NOTE(review): the neighbors-method cell writes this file with index=False,
    # this cell keeps the index -- confirm which one is intended.
    embeddings_path = 'outputs/Embeddings/node2vec/' + file.replace('outputs/', '')
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    df_n2v.to_csv(embeddings_path)

    # Every column except 'label' is a feature; 'label' is the target.
    X = df_n2v.drop('label', axis=1)
    y = df_n2v['label']

    logit = logistic_regresion(X, y)  # Logit results
    logit['model'] = 'Logit'

    rf = random_forest(X, y)  # RF results
    rf['model'] = 'RF'

    # Tag the rows of both models with the graph they were computed on.
    df_res = pd.concat([logit, rf])
    df_res['graph'] = file.replace('.csv', '')

    all_results.append(df_res)

# The file name says "word2vec" although the features come from df_node2vec; it is
# left unchanged because the summary cell loads every file ending in 'ligh.csv'.
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/RN_word2vec_results_ligh.csv', index=False)

Doing:   outputs/random_nodes/g_ss_rn_1.csv ...
Doing:   outputs/random_nodes/g_ss_rn_7.csv ...
Doing:   outputs/random_nodes/g_ss_rn_2.csv ...
Doing:   outputs/random_nodes/g_ss_rn_5.csv ...
Doing:   outputs/random_nodes/g_ss_rn_4.csv ...
Doing:   outputs/random_nodes/g_ss_rn_6.csv ...
Doing:   outputs/random_nodes/g_ss_rn_3.csv ...
Doing:   outputs/random_nodes/g_ss_rn_9.csv ...
Doing:   outputs/random_nodes/g_ss_rn_0.csv ...
Doing:   outputs/random_nodes/g_ss_rn_8.csv ...


***Compute the performance using the mixed method (MS) with node2vec***

In [None]:
all_results = []  # one metrics frame per sampled graph

# Create the output folders up front: without them to_csv fails only after the
# embeddings and the models have already been computed.
os.makedirs('outputs/results', exist_ok=True)

for file in mm:
    df = pd.read_csv(file)
    df_n2v = df_node2vec(df, fraudulent_nodes)

    # Store the frame returned by df_node2vec under the same sub-path as its source graph.
    # NOTE(review): the neighbors-method cell writes this file with index=False,
    # this cell keeps the index -- confirm which one is intended.
    embeddings_path = 'outputs/Embeddings/node2vec/' + file.replace('outputs/', '')
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    df_n2v.to_csv(embeddings_path)

    # Every column except 'label' is a feature; 'label' is the target.
    X = df_n2v.drop('label', axis=1)
    y = df_n2v['label']

    logit = logistic_regresion(X, y)  # Logit results
    logit['model'] = 'Logit'

    rf = random_forest(X, y)  # RF results
    rf['model'] = 'RF'

    # Tag the rows of both models with the graph they were computed on.
    df_res = pd.concat([logit, rf])
    df_res['graph'] = file.replace('.csv', '')

    all_results.append(df_res)

# The file name says "word2vec" although the features come from df_node2vec; it is
# left unchanged because the summary cell loads every file ending in 'ligh.csv'.
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/MM_word2vec_results_ligh.csv', index=False)

***Compute the performance using the random walk method (RWS) with node2vec***

In [17]:
all_results = []  # one metrics frame per sampled graph

# Create the output folders up front: without them to_csv fails only after the
# embeddings and the models have already been computed.
os.makedirs('outputs/results', exist_ok=True)

for file in rw:
    df = pd.read_csv(file)
    df_n2v = df_node2vec(df, fraudulent_nodes)

    # Store the frame returned by df_node2vec under the same sub-path as its source graph.
    # NOTE(review): the neighbors-method cell writes this file with index=False,
    # this cell keeps the index -- confirm which one is intended.
    embeddings_path = 'outputs/Embeddings/node2vec/' + file.replace('outputs/', '')
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    df_n2v.to_csv(embeddings_path)

    # Every column except 'label' is a feature; 'label' is the target.
    X = df_n2v.drop('label', axis=1)
    y = df_n2v['label']

    logit = logistic_regresion(X, y)  # Logit results
    logit['model'] = 'Logit'

    rf = random_forest(X, y)  # RF results
    rf['model'] = 'RF'

    # Tag the rows of both models with the graph they were computed on.
    df_res = pd.concat([logit, rf])
    df_res['graph'] = file.replace('.csv', '')

    all_results.append(df_res)

# The file name says "word2vec" although the features come from df_node2vec; it is
# left unchanged because the summary cell loads every file ending in 'ligh.csv'.
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/RW_word2vec_results_ligh.csv', index=False)

***Compute the performance using the neighbors method (NS) with node2vec***

Only for the two smallest graphs.

In [5]:
# Neighbors-method graphs left out of this run; only the remaining files are processed.
skipped = {
    'outputs/neighbors_method/g_ss_ne_0.csv',
    'outputs/neighbors_method/g_ss_ne_2.csv',
    'outputs/neighbors_method/g_ss_ne_3.csv',
    'outputs/neighbors_method/g_ss_ne_4.csv',
    'outputs/neighbors_method/g_ss_ne_5.csv',
    'outputs/neighbors_method/g_ss_ne_6.csv',
    'outputs/neighbors_method/g_ss_ne_7.csv',
    'outputs/neighbors_method/g_ss_ne_9.csv',
}
filtered = [path for path in nm if path not in skipped]
filtered

['outputs/neighbors_method/g_ss_ne_8.csv',
 'outputs/neighbors_method/g_ss_ne_1.csv']

In [6]:
all_results = []  # one metrics frame per sampled graph

# Create the output folders up front: without them to_csv fails only after the
# embeddings and the models have already been computed.
os.makedirs('outputs/results', exist_ok=True)

for file in filtered:
    df = pd.read_csv(file)
    df_n2v = df_node2vec(df, fraudulent_nodes)

    # Store the frame returned by df_node2vec under the same sub-path as its source graph.
    # NOTE(review): this is the only results cell that writes the embeddings with
    # index=False; the other sampling-method cells keep the index -- confirm which is intended.
    embeddings_path = 'outputs/Embeddings/node2vec/' + file.replace('outputs/', '')
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    df_n2v.to_csv(embeddings_path, index=False)

    # Every column except 'label' is a feature; 'label' is the target.
    X = df_n2v.drop('label', axis=1)
    y = df_n2v['label']

    logit = logistic_regresion(X, y)  # Logit results
    logit['model'] = 'Logit'

    rf = random_forest(X, y)  # RF results
    rf['model'] = 'RF'

    # Tag the rows of both models with the graph they were computed on.
    df_res = pd.concat([logit, rf])
    df_res['graph'] = file.replace('.csv', '')

    all_results.append(df_res)

# The file name says "word2vec" although the features come from df_node2vec; it is
# left unchanged because the summary cell loads every file ending in 'ligh.csv'.
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/NM_word2vec_results_ligh.csv', index=False)

***Compute the performance using the degree-biased random sampling method (DB) with node2vec***

In [17]:
# For each degree-biased sample, print the path, the number of distinct values
# across the 'source' and 'target' columns, and the number of rows.
# NOTE(review): in the recorded output g_ss_bd_1 and g_ss_bd_2 report identical
# counts (30410, 309528) -- check that the two samples are not duplicates.
for file in db:
    df = pd.read_csv(file)
    endpoints = set(df['source'].unique()).union(df['target'].unique())
    print(file, len(endpoints), len(df))

outputs/biased_degree/g_ss_bd_0.csv 30394 314789
outputs/biased_degree/g_ss_bd_8.csv 30473 318844
outputs/biased_degree/g_ss_bd_9.csv 30658 318854
outputs/biased_degree/g_ss_bd_4.csv 30832 320283
outputs/biased_degree/g_ss_bd_3.csv 30451 324941
outputs/biased_degree/g_ss_bd_6.csv 30416 314772
outputs/biased_degree/g_ss_bd_5.csv 30450 312710
outputs/biased_degree/g_ss_bd_2.csv 30410 309528
outputs/biased_degree/g_ss_bd_7.csv 30496 325433
outputs/biased_degree/g_ss_bd_1.csv 30410 309528


In [21]:
all_results = []  # one metrics frame per sampled graph

# Create the output folders up front: without them to_csv fails only after the
# embeddings and the models have already been computed.
os.makedirs('outputs/results', exist_ok=True)

for file in db:
    print('Doing:', file, '...')
    df = pd.read_csv(file)
    df_n2v = df_node2vec(df, fraudulent_nodes)

    # Store the frame returned by df_node2vec under the same sub-path as its source graph.
    # NOTE(review): the neighbors-method cell writes this file with index=False,
    # this cell keeps the index -- confirm which one is intended.
    embeddings_path = 'outputs/Embeddings/node2vec/' + file.replace('outputs/', '')
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    df_n2v.to_csv(embeddings_path)

    # Every column except 'label' is a feature; 'label' is the target.
    X = df_n2v.drop('label', axis=1)
    y = df_n2v['label']

    logit = logistic_regresion(X, y)  # Logit results
    logit['model'] = 'Logit'

    rf = random_forest(X, y)  # RF results
    rf['model'] = 'RF'

    # Tag the rows of both models with the graph they were computed on.
    df_res = pd.concat([logit, rf])
    df_res['graph'] = file.replace('.csv', '')

    all_results.append(df_res)

# The file name says "word2vec" although the features come from df_node2vec; it is
# left unchanged because the summary cell loads every file ending in 'ligh.csv'.
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/DB_word2vec_results_ligh.csv', index=False)

Doing: outputs/biased_degree/g_ss_bd_0.csv ...
Doing: outputs/biased_degree/g_ss_bd_8.csv ...
Doing: outputs/biased_degree/g_ss_bd_9.csv ...
Doing: outputs/biased_degree/g_ss_bd_4.csv ...
Doing: outputs/biased_degree/g_ss_bd_3.csv ...
Doing: outputs/biased_degree/g_ss_bd_6.csv ...
Doing: outputs/biased_degree/g_ss_bd_5.csv ...
Doing: outputs/biased_degree/g_ss_bd_2.csv ...
Doing: outputs/biased_degree/g_ss_bd_7.csv ...
Doing: outputs/biased_degree/g_ss_bd_1.csv ...


In [22]:
# Display `final_results`. Every sampling-method cell rebinds this name, so the
# frame shown is the one from whichever of those cells ran last.
final_results

Unnamed: 0,F1,Precision,Recall,model,graph
0,0.226639,0.138239,0.628627,Logit,outputs/biased_degree/g_ss_bd_0
1,0.219004,0.133967,0.599613,Logit,outputs/biased_degree/g_ss_bd_0
2,0.219936,0.134666,0.599613,Logit,outputs/biased_degree/g_ss_bd_0
3,0.222533,0.135897,0.613900,Logit,outputs/biased_degree/g_ss_bd_0
4,0.219057,0.132812,0.624758,Logit,outputs/biased_degree/g_ss_bd_0
...,...,...,...,...,...
0,0.479042,0.500000,0.459770,RF,outputs/biased_degree/g_ss_bd_1
1,0.457953,0.486022,0.432950,RF,outputs/biased_degree/g_ss_bd_1
2,0.461860,0.508046,0.423372,RF,outputs/biased_degree/g_ss_bd_1
3,0.473738,0.513393,0.439771,RF,outputs/biased_degree/g_ss_bd_1


***Average result for each method of sampling and each model***

In [39]:
# Gather every results file whose name ends in 'ligh.csv' and stack them into one frame.
files = [name for name in os.listdir('outputs/results') if name.endswith('ligh.csv')]
dfs_ = [pd.read_csv('outputs/results/' + name) for name in files]

df = pd.concat(dfs_)
print(df.shape)
df.head()

(520, 5)


Unnamed: 0,F1,Precision,Recall,model,graph
0,0.034965,0.017994,0.615385,Logit,outputs/random_walk/g_ss_rw_2
1,0.039184,0.020126,0.738462,Logit,outputs/random_walk/g_ss_rw_2
2,0.041791,0.021501,0.742424,Logit,outputs/random_walk/g_ss_rw_2
3,0.040227,0.020711,0.69697,Logit,outputs/random_walk/g_ss_rw_2
4,0.037879,0.019481,0.681818,Logit,outputs/random_walk/g_ss_rw_2


In [40]:
# Average the metrics per (graph, model) pair and rank the pairs by F1, best first.
# (Per the original note, Best_params are the same for all 5 runs of each model.)
graph_l = (
    df
    .groupby(['graph', 'model'])
    .mean()
    .reset_index()
    .sort_values(by='F1', ascending=False)
)
graph_l.head()

Unnamed: 0,graph,model,F1,Precision,Recall
69,outputs/random_nodes/g_ss_rn_2,RF,0.50653,0.609524,0.433937
77,outputs/random_nodes/g_ss_rn_6,RF,0.503523,0.594705,0.437118
79,outputs/random_nodes/g_ss_rn_7,RF,0.503487,0.604951,0.432391
71,outputs/random_nodes/g_ss_rn_3,RF,0.493645,0.598295,0.420796
75,outputs/random_nodes/g_ss_rn_5,RF,0.490157,0.589274,0.419666


In [41]:
# Rows of graph_l whose graph path contains 'outputs/biased_degree', best F1 first.
(
    graph_l
    .loc[graph_l['graph'].str.contains('outputs/biased_degree')]
    .sort_values(by='F1', ascending=False)
)

Unnamed: 0,graph,model,F1,Precision,Recall
3,outputs/biased_degree/g_ss_bd_1,RF,0.471903,0.504252,0.443716
5,outputs/biased_degree/g_ss_bd_2,RF,0.467626,0.495453,0.442956
19,outputs/biased_degree/g_ss_bd_9,RF,0.45931,0.4803,0.440458
11,outputs/biased_degree/g_ss_bd_5,RF,0.45797,0.484452,0.434483
1,outputs/biased_degree/g_ss_bd_0,RF,0.455136,0.478772,0.434265
9,outputs/biased_degree/g_ss_bd_4,RF,0.444419,0.468388,0.422863
13,outputs/biased_degree/g_ss_bd_6,RF,0.443735,0.468244,0.422203
17,outputs/biased_degree/g_ss_bd_8,RF,0.441648,0.473209,0.41455
15,outputs/biased_degree/g_ss_bd_7,RF,0.435407,0.463352,0.411047
7,outputs/biased_degree/g_ss_bd_3,RF,0.433065,0.447348,0.420631


In [42]:
# 'Type' is the second path component of 'graph'
# (e.g. 'outputs/random_walk/g_ss_rw_2' -> 'random_walk').
graph_l['Type'] = [path.split('/')[1] for path in graph_l['graph']]

# Drop the 'm2' rows, then count the remaining rows per type.
graph_l = graph_l.loc[graph_l['Type'].ne('m2')]
graph_l['Type'].value_counts()

random_walk         20
mixed_method        20
biased_degree       20
random_nodes        20
neighbors_method     4
Name: Type, dtype: int64

In [47]:
# Final summary of results (average)
# Aggregate only the metric columns. graph_l still carries the string column
# 'graph'; on pandas >= 2.0 averaging it raises a TypeError, and older versions
# only dropped it silently. Selecting the columns gives the same table everywhere.
final_sum = (
    graph_l
    .groupby(['Type', 'model'])[['F1', 'Precision', 'Recall']]
    .mean()
    .reset_index()
)
final_sum.round(3)

Unnamed: 0,Type,model,F1,Precision,Recall
0,biased_degree,Logit,0.219,0.133,0.624
1,biased_degree,RF,0.451,0.476,0.429
2,mixed_method,Logit,0.203,0.124,0.56
3,mixed_method,RF,0.338,0.35,0.326
4,neighbors_method,Logit,0.187,0.106,0.79
5,neighbors_method,RF,0.286,0.32,0.261
6,random_nodes,Logit,0.235,0.144,0.639
7,random_nodes,RF,0.493,0.59,0.424
8,random_walk,Logit,0.037,0.019,0.681
9,random_walk,RF,0.08,0.118,0.062


In [48]:
# Make sure the destination folder exists, then export the summary rounded to 3 decimals.
os.makedirs('outputs/tables', exist_ok=True)
final_sum.round(3).to_csv('outputs/tables/sampled_graphs_ml_results_node2vec.csv', index=False)

In [44]:
# Final summary of results (standard deviation)
# Aggregate only the metric columns. graph_l still carries the string column
# 'graph'; on pandas >= 2.0 taking its standard deviation raises an error, and
# older versions only dropped it silently.
final_std = (
    graph_l
    .groupby(['Type', 'model'])[['F1', 'Precision', 'Recall']]
    .std()
    .reset_index()
)
final_std

Unnamed: 0,Type,model,F1,Precision,Recall
0,biased_degree,Logit,0.007224,0.005049,0.011365
1,biased_degree,RF,0.013319,0.01626,0.011947
2,mixed_method,Logit,0.003612,0.00237,0.008612
3,mixed_method,RF,0.004056,0.005142,0.003737
4,neighbors_method,Logit,0.010417,0.005852,0.04762
5,neighbors_method,RF,0.022738,0.05285,0.001689
6,random_nodes,Logit,0.008197,0.006113,0.007489
7,random_nodes,RF,0.008755,0.011756,0.008096
8,random_walk,Logit,0.002338,0.001228,0.015689
9,random_walk,RF,0.029731,0.043605,0.023216
