In [5]:
# Requirements
import pandas as pd
import numpy as np
import networkx as nx

import pickle
import os

from node2vec import Node2Vec
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import networkx as nx

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from xgboost import XGBClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import random
random.seed(42)

import warnings
warnings.filterwarnings('ignore')

In [6]:
import sys
sys.path.append('../src')

from trans2vecfunc import *
from mlmethodsligh import *

In [7]:
# Include the label of the addresses
# Fraudulent
output_labels = pickle.load(open("../../bse_clovrlabs_btc_fraud/data/output_labels.txt",'rb'))
input_labels = pickle.load(open("../../bse_clovrlabs_btc_fraud/data/input_labels.txt",'rb'))
fraudulent_nodes = list(set(list(output_labels) + list(input_labels)))

***Sampling method of randomly selected nodes***

In [8]:
rn = ['outputs/random_nodes/' + name for name in os.listdir('outputs/random_nodes/') if name.endswith('csv')]
mm = ['outputs/mixed_method/' + name for name in os.listdir('outputs/mixed_method/') if name.endswith('csv')]
rw = ['outputs/random_walk/' + name for name in os.listdir('outputs/random_walk/') if name.endswith('csv')]
db = ['outputs/biased_degree/' + name for name in os.listdir('outputs/biased_degree/') if name.endswith('csv')]
nm = ['outputs/neighbors_method/' + name for name in os.listdir('outputs/neighbors_method/') if name.endswith('csv')]

***Compute the performace using random nodes (CS) method with trans2vec***

In [9]:
all_results = [] # Save results

for file in rn:
    print('Doing:  ', file, '...')
    df = pd.read_csv(file)
    df_t2v = df_trans2vec(df, fraudulent_nodes)
    df_t2v.to_csv('outputs/Embeddings/trans2vec/' + file.replace('outputs/', ''))
    
    X = df_t2v.drop('label', axis=1)
    y = df_t2v['label']
    
    logit = logistic_regresion(X, y) # Logit results
    logit['model'] = 'Logit'
    
    rf = random_forest(X, y) # Rf results
    rf['model'] = 'RF'
    
    df_res = pd.concat([logit,rf])
    df_res['graph'] = file.replace('.csv', '')
    
    all_results.append(df_res)
    
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/trans2vec/RN_trans2vec_results_ligh.csv', index=False)

Doing:   outputs/random_nodes/g_ss_rn_1.csv ...
Doing:   outputs/random_nodes/g_ss_rn_7.csv ...
Doing:   outputs/random_nodes/g_ss_rn_2.csv ...
Doing:   outputs/random_nodes/g_ss_rn_5.csv ...
Doing:   outputs/random_nodes/g_ss_rn_4.csv ...
Doing:   outputs/random_nodes/g_ss_rn_6.csv ...
Doing:   outputs/random_nodes/g_ss_rn_3.csv ...
Doing:   outputs/random_nodes/g_ss_rn_9.csv ...
Doing:   outputs/random_nodes/g_ss_rn_0.csv ...
Doing:   outputs/random_nodes/g_ss_rn_8.csv ...


***Compute the performace using the mixed method (MS) with trans2vec***

In [10]:
all_results = [] # Save results

for file in mm:
    print('Doing:  ', file, '...')
    df = pd.read_csv(file)
    df_t2v = df_trans2vec(df, fraudulent_nodes)
    df_t2v.to_csv('outputs/Embeddings/trans2vec/' + file.replace('outputs/', ''), index=False)
    
    X = df_t2v.drop('label', axis=1)
    y = df_t2v['label']
    
    logit = logistic_regresion(X, y) # Logit results
    logit['model'] = 'Logit'
    
    rf = random_forest(X, y) # Rf results
    rf['model'] = 'RF'
    
    df_res = pd.concat([logit,rf])
    df_res['graph'] = file.replace('.csv', '')
    
    all_results.append(df_res)
    
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/trans2vec/MM_trans2vec_results_ligh.csv', index=False)

Doing:   outputs/mixed_method/g_as_mm_9.csv ...
Doing:   outputs/mixed_method/g_as_mm_4.csv ...
Doing:   outputs/mixed_method/g_as_mm_2.csv ...
Doing:   outputs/mixed_method/g_as_mm_5.csv ...
Doing:   outputs/mixed_method/g_as_mm_7.csv ...
Doing:   outputs/mixed_method/g_as_mm_3.csv ...
Doing:   outputs/mixed_method/g_as_mm_6.csv ...
Doing:   outputs/mixed_method/g_as_mm_1.csv ...
Doing:   outputs/mixed_method/g_as_mm_0.csv ...
Doing:   outputs/mixed_method/g_as_mm_8.csv ...


***Compute the performace using the random walk method (RWS) with trans2vec***

In [29]:
all_results = [] # Save results

for file in rw:
    print('Doing:  ', file, '...')
    df = pd.read_csv(file)
    df_t2v = df_trans2vec(df, fraudulent_nodes)
    df_t2v.to_csv('outputs/Embeddings/trans2vec/' + file.replace('outputs/', ''), index=False)
     
    X = df_t2v.drop('label', axis=1)
    y = df_t2v['label']
    
    logit = logistic_regresion(X, y) # Logit results
    logit['model'] = 'Logit'
    
    rf = random_forest(X, y) # Rf results
    rf['model'] = 'RF'
    
    df_res = pd.concat([logit,rf])
    df_res['graph'] = file.replace('.csv', '')
    
    all_results.append(df_res)
    
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/trans2vec/RW_trans2vec_results_ligh.csv', index=False)

***Compute the performace using the neighbors method (NS) with trans2vec***

In [43]:
all_results = [] # Save results

for file in nm:
    print('Doing:  ', file, '...')
    df = pd.read_csv(file)
    df_t2v = df_trans2vec(df, fraudulent_nodes)
    df_t2v.to_csv('outputs/Embeddings/trans2vec/' + file.replace('outputs/', ''), index=False)
     
    X = df_t2v.drop('label', axis=1)
    y = df_t2v['label']
    
    logit = logistic_regresion(X, y) # Logit results
    logit['model'] = 'Logit'
    
    rf = random_forest(X, y) # Rf results
    rf['model'] = 'RF'
    
    df_res = pd.concat([logit,rf])
    df_res['graph'] = file.replace('.csv', '')
    
    all_results.append(df_res)
    
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/trans2vec/NM_trans2vec_results_ligh.csv', index=False)

Doing:   outputs/neighbors_method/g_ss_ne_0.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_8.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_1.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_3.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_5.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_7.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_2.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_6.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_9.csv ...
Doing:   outputs/neighbors_method/g_ss_ne_4.csv ...


***Compute the performace using the degree biased random sampling (DS) method with trans2vec***

In [11]:
all_results = [] # Save results

for file in db:
    print('Doing:  ', file, '...')
    df = pd.read_csv(file)
    df_t2v = df_trans2vec(df, fraudulent_nodes)
    df_t2v.to_csv('outputs/Embeddings/trans2vec/' + file.replace('outputs/', ''))
    
    X = df_t2v.drop('label', axis=1)
    y = df_t2v['label']
    
    logit = logistic_regresion(X, y) # Logit results
    logit['model'] = 'Logit'
    
    rf = random_forest(X, y) # Rf results
    rf['model'] = 'RF'
    
    df_res = pd.concat([logit,rf])
    df_res['graph'] = file.replace('.csv', '')
    
    all_results.append(df_res)
    
final_results = pd.concat(all_results)
final_results.to_csv('outputs/results/trans2vec/DB_trans2vec_results_ligh.csv', index=False)

Doing:   outputs/biased_degree/g_ss_bd_0.csv ...
Doing:   outputs/biased_degree/g_ss_bd_8.csv ...
Doing:   outputs/biased_degree/g_ss_bd_9.csv ...
Doing:   outputs/biased_degree/g_ss_bd_4.csv ...
Doing:   outputs/biased_degree/g_ss_bd_3.csv ...
Doing:   outputs/biased_degree/g_ss_bd_6.csv ...
Doing:   outputs/biased_degree/g_ss_bd_5.csv ...
Doing:   outputs/biased_degree/g_ss_bd_2.csv ...
Doing:   outputs/biased_degree/g_ss_bd_7.csv ...
Doing:   outputs/biased_degree/g_ss_bd_1.csv ...


***Average result for each method of sampling and each model***

In [12]:
files = [file for file in os.listdir('outputs/results/trans2vec')]
files

['RN_trans2vec_results_ligh.csv',
 'NM_trans2vec_results_ligh.csv',
 'MM_trans2vec_results_ligh.csv',
 'DB_trans2vec_results_ligh.csv',
 'RW_trans2vec_results_ligh.csv']

In [13]:
dfs_ = []
for file in files:
    dfs_.append(pd.read_csv('outputs/results/trans2vec/'+file)) 

df = pd.concat(dfs_)
print(df.shape)
df.head()

(500, 5)


Unnamed: 0,F1,Precision,Recall,model,graph
0,0.229272,0.141959,0.595604,Logit,outputs/random_nodes/g_ss_rn_1
1,0.218947,0.135417,0.571429,Logit,outputs/random_nodes/g_ss_rn_1
2,0.213826,0.132871,0.547253,Logit,outputs/random_nodes/g_ss_rn_1
3,0.214849,0.132481,0.567982,Logit,outputs/random_nodes/g_ss_rn_1
4,0.211618,0.130435,0.56044,Logit,outputs/random_nodes/g_ss_rn_1


In [14]:
# Group by graph and model model (Best_params are the same for all of the 5 runs on each model)
graph_l = df.groupby(['graph', 'model']).mean().reset_index().sort_values('F1', ascending=False)
graph_l.head()

Unnamed: 0,graph,model,F1,Precision,Recall
65,outputs/random_nodes/g_ss_rn_2,RF,0.329759,0.485997,0.249777
79,outputs/random_nodes/g_ss_rn_9,RF,0.307933,0.470509,0.229128
77,outputs/random_nodes/g_ss_rn_8,RF,0.286443,0.426117,0.215933
63,outputs/random_nodes/g_ss_rn_1,RF,0.284634,0.428192,0.213529
11,outputs/biased_degree/g_ss_bd_5,RF,0.275874,0.425891,0.204215


In [16]:
graph_l['Type'] = graph_l['graph'].apply(lambda x: x.split('/')[1])
graph_l['Type'].value_counts()

random_walk         20
neighbors_method    20
mixed_method        20
random_nodes        20
biased_degree       20
Name: Type, dtype: int64

In [22]:
# Final summary of results (average)
final_sum = graph_l.groupby(['Type', 'model']).mean().reset_index()
final_sum.round(3)

Unnamed: 0,Type,model,F1,Precision,Recall
0,biased_degree,Logit,0.183,0.109,0.582
1,biased_degree,RF,0.25,0.376,0.188
2,mixed_method,Logit,0.196,0.117,0.593
3,mixed_method,RF,0.229,0.278,0.195
4,neighbors_method,Logit,0.129,0.071,0.743
5,neighbors_method,RF,0.142,0.209,0.109
6,random_nodes,Logit,0.215,0.133,0.563
7,random_nodes,RF,0.28,0.423,0.209
8,random_walk,Logit,0.031,0.016,0.614
9,random_walk,RF,0.106,0.178,0.076


In [23]:
final_sum.round(3).to_csv('outputs/tables/sampled_graphs_ml_results_trans2vec.csv', index=False)

In [18]:
# Final summary of results (standard deviation)
final_std = graph_l.groupby(['Type', 'model']).std().reset_index()
final_std

Unnamed: 0,Type,model,F1,Precision,Recall
0,biased_degree,Logit,0.003931,0.002578,0.011771
1,biased_degree,RF,0.019235,0.024881,0.016227
2,mixed_method,Logit,0.002889,0.001758,0.00942
3,mixed_method,RF,0.009769,0.013242,0.008136
4,neighbors_method,Logit,0.021816,0.013,0.033473
5,neighbors_method,RF,0.022484,0.02285,0.023643
6,random_nodes,Logit,0.008835,0.006673,0.007175
7,random_nodes,RF,0.023413,0.033158,0.018317
8,random_walk,Logit,0.003122,0.001631,0.026063
9,random_walk,RF,0.022795,0.046959,0.016008
