In [1]:
#Replica of Random Forest and Logistic Regression results in https://arxiv.org/pdf/1908.02591.pdf
from __future__ import print_function, division
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

In [2]:
dataset = 'natural' #natural, original or synthetic
if dataset == 'natural':
    classes = pd.read_csv("augmented_natural_dataset\elliptic_txs_classes.csv")
    features = pd.read_csv("augmented_natural_dataset\elliptic_txs_features.csv", header=None)
elif dataset == 'original':
    classes = pd.read_csv("elliptic_bitcoin_dataset\elliptic_txs_classes.csv")
    features = pd.read_csv("elliptic_bitcoin_dataset\elliptic_txs_features.csv", header=None)
else:
    samples = pd.read_csv("augmented_synthetic_dataset\synthetic_illicit_tx.csv") #Synthetic illicit tx
    data = pd.read_csv("augmented_synthetic_dataset\labelled_tx.csv") #Unknown tx already removed
    data.columns = samples.columns
    frames = [data, samples]
    result = pd.concat(frames)

In [3]:
if (dataset == 'natural' or dataset == 'original'):
    display(features.head(5),classes.head(5))
else:
    display(result.head(5))
    display(result.groupby('class').size())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.056189,-0.175567,-1.185821,-0.18639,-0.046514,-0.167505,-0.101009,-0.133295,...,-0.537739,-0.582752,1.442895,1.440189,0.003514,-0.092606,-0.135784,-0.101068,-0.102834,-0.102128
1,5530458,1,-0.056193,-0.175567,-1.185821,-0.18639,-0.046514,-0.167505,-0.101009,-0.133311,...,1.016365,0.70241,-1.033609,-1.032884,0.003514,-0.092606,-0.135784,-0.101068,-0.102834,-0.102128
2,232022460,1,-0.056378,-0.175567,-1.185821,-0.18639,-0.046514,-0.167505,-0.101009,-0.133972,...,0.731702,0.467009,-1.033609,-1.032884,-0.106566,-0.113936,-0.135784,-0.186659,-0.102834,-0.102128
3,232438397,1,0.043005,0.743641,-0.644127,2.874647,-0.068726,2.304311,3.142835,-0.134902,...,-0.553127,-0.595477,0.204643,0.203652,0.994235,0.099367,-0.135784,0.669253,-0.102834,-0.102128
4,230460314,1,0.294595,-0.131268,-1.185821,0.125213,0.375512,0.188604,-0.101009,-0.134776,...,-0.485972,-0.380435,0.484856,0.546219,0.003514,0.31267,0.369103,1.28123,0.201809,0.202687


Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


In [4]:
if (dataset == 'natural' or dataset == 'original'):
    tx_features = ["local_feat_"+str(i) for i in range(2,95)]
    agg_features = ["agg_feat_"+str(i) for i in range(1,73)]
    features.columns = ["txId","time_step"] + tx_features + agg_features
    features = pd.merge(features,classes,left_on="txId",right_on="txId",how='left')
    features['class'] = features['class'].apply(lambda x: '0' if x == "unknown" else x)
else:
    local_features = ["Local_feature_"+str(i) for i in range(1,94)]
    agg_features = ["Aggregate_feature_"+str(i) for i in range(1,73)]

In [5]:
if (dataset == 'natural' or dataset == 'original'):
    features = features.drop(columns=['txId', 'time_step'])

In [6]:
if (dataset == 'natural' or dataset == 'original'):
    features.groupby('class').size()

In [7]:
if (dataset == 'natural'):
    features = features.replace("suspicious", "1")
    features.groupby('class').size()

In [8]:
if (dataset == 'natural' or dataset == 'original'):
    data = features[(features['class']=='1') | (features['class']=='2')] #We remove unknown transactions from the dataframe

In [9]:
if (dataset == 'natural' or dataset == 'original'):
    data.groupby('class').size()

In [10]:
if (dataset == 'natural' or dataset == 'original'):
    X = data[tx_features + agg_features]
    y = data['class']
    y = y.apply(lambda x: 0 if x == '2' else 1 )
else: 
    X = result[local_features + agg_features]
    y = result['class']
    y = y.apply(lambda x: 0 if x == 2 else 1 )

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=30, shuffle=True, stratify=y)

In [11]:
clf = RandomForestClassifier(n_estimators=50, max_depth=100, max_features=50, random_state=15).fit(X_train,y_train)
preds = clf.predict(X_test)
prec,rec,f1,num = precision_recall_fscore_support(y_test,preds, average=None)
print("Random Forest Classifier")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test,preds,average='micro')
print("Micro-Average F1 Score:",micro_f1)

Random Forest Classifier
Precision:0.985 
Recall:0.962 
F1 Score:0.974
Micro-Average F1 Score: 0.9784139120704928


In [12]:
reg = LogisticRegression().fit(X_train,y_train)
preds = reg.predict(X_test)
prec,rec,f1,num = precision_recall_fscore_support(y_test,preds, average=None)
print("Logistic Regression")
print("Precision:%.3f \nRecall:%.3f \nF1 Score:%.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test,preds,average='micro')
print("Micro-Average F1 Score:",micro_f1)

Logistic Regression
Precision:0.784 
Recall:0.824 
F1 Score:0.804
Micro-Average F1 Score: 0.83393165182526


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
