# Ethereum fraud classification
Data set:
- https://www.kaggle.com/datasets/vagifa/ethereum-frauddetection-dataset?resource=download

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Lift pandas' column-count display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [2]:
## Load the raw CSV and discard the two index-like columns that carry no signal
df = (
    pd.read_csv('../../Data/Ethereum_dataset.csv')
      .drop(columns=['Unnamed: 0', 'Index'])
)

## Normalise the headers: remove surrounding whitespace from every column name
col_name = list(map(str.strip, df.columns))
df.columns = col_name

print(df.shape)
df.head()

(9841, 49)


Unnamed: 0,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,max value received,avg val received,min val sent,max val sent,avg val sent,min value sent to contract,max val sent to contract,avg value sent to contract,total transactions (including tnx to create contract,total Ether sent,total ether received,total ether sent contracts,total ether balance,Total ERC20 tnxs,ERC20 total Ether received,ERC20 total ether sent,ERC20 total Ether sent contract,ERC20 uniq sent addr,ERC20 uniq rec addr,ERC20 uniq sent addr.1,ERC20 uniq rec contract addr,ERC20 avg time between sent tnx,ERC20 avg time between rec tnx,ERC20 avg time between rec 2 tnx,ERC20 avg time between contract tnx,ERC20 min val rec,ERC20 max val rec,ERC20 avg val rec,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,40,118,0.0,45.806785,6.589513,0.0,31.22,1.200681,0.0,0.0,0.0,810,865.691093,586.466675,0.0,-279.224419,265.0,35588540.0,35603170.0,0.0,30.0,54.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,15000000.0,265586.1476,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.613269,0.385685,0.0,1.8,0.032844,0.0,0.0,0.0,102,3.087297,3.085478,0.0,-0.001819,8.0,403.4283,2.260809,0.0,1.0,5.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,365.0,57.632615,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.113119,1.165453,0.358906,0.05,3.538616,1.794308,0.0,0.0,0.0,12,3.588616,3.589057,0.0,0.000441,8.0,521.5121,0.0,0.0,0.0,7.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,442.8198,65.189009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,7,13,0.0,500.0,99.48884,0.0,450.0,70.001834,0.0,0.0,0.0,34,1750.045862,895.399559,0.0,-854.646303,14.0,17111.05,11412.23,0.0,2.0,11.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,11412.23,1555.550174,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.802411,2.671095,0.0,9.0,0.022688,0.0,0.0,0.0,4619,104.318883,53.421896,0.0,-50.896986,42.0,162829.7,123539.9,0.0,4.0,23.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,90000.0,4934.232147,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


#### Drop duplicate data

In [3]:
# Remove fully duplicated rows and report the row count on either side of the operation.
rows_before = df.shape[0]
print(f'Number of rows before drop duplicate: {rows_before}')

df = df.drop_duplicates(keep='first')  # keep='first' is the pandas default, spelled out

print(f'Number of rows after drop duplicate: {df.shape[0]}')

Number of rows before drop duplicate: 9841
Number of rows after drop duplicate: 9823


In [4]:
# Keep only the rows where both token-type columns are populated, then
# show the remaining per-column null counts as a sanity check.
# NOTE(review): these two columns are dropped again in the feature-selection
# cell, so rows are being discarded based on columns the model never sees --
# confirm this filtering is intentional.
token_type_cols = ['ERC20_most_rec_token_type', 'ERC20 most sent token type']
df = df.dropna(subset=token_type_cols)
df.isnull().sum()

Address                                                 0
FLAG                                                    0
Avg min between sent tnx                                0
Avg min between received tnx                            0
Time Diff between first and last (Mins)                 0
Sent tnx                                                0
Received Tnx                                            0
Number of Created Contracts                             0
Unique Received From Addresses                          0
Unique Sent To Addresses                                0
min value received                                      0
max value received                                      0
avg val received                                        0
min val sent                                            0
max val sent                                            0
avg val sent                                            0
min value sent to contract                              0
max val sent t

#### Separate categorical from numerical features

In [5]:
# Partition the feature columns by dtype. The identifier column and the target
# column are excluded from both lists.
unique = 'Address'   # per-row identifier, not a feature
label = 'FLAG'       # classification target

numeric_features = []
catergorical_features = []   # (sic) name kept unchanged so existing references still resolve
for column in df.columns:
    if column in (unique, label):
        continue
    if df[column].dtype in ['int64', 'float64']:
        numeric_features.append(column)
    else:
        catergorical_features.append(column)

#### Feature selection

In [6]:
## Feature selection
# Remove the two string-valued token-type columns; they are not used as model inputs.
# errors='ignore' makes the cell idempotent: `df` is reassigned in place, so
# re-running this cell after the columns are already gone would otherwise
# raise a KeyError.
df = df.drop(
    columns=['ERC20 most sent token type', 'ERC20_most_rec_token_type'],
    errors='ignore',
)
df.head()

Unnamed: 0,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,max value received,avg val received,min val sent,max val sent,avg val sent,min value sent to contract,max val sent to contract,avg value sent to contract,total transactions (including tnx to create contract,total Ether sent,total ether received,total ether sent contracts,total ether balance,Total ERC20 tnxs,ERC20 total Ether received,ERC20 total ether sent,ERC20 total Ether sent contract,ERC20 uniq sent addr,ERC20 uniq rec addr,ERC20 uniq sent addr.1,ERC20 uniq rec contract addr,ERC20 avg time between sent tnx,ERC20 avg time between rec tnx,ERC20 avg time between rec 2 tnx,ERC20 avg time between contract tnx,ERC20 min val rec,ERC20 max val rec,ERC20 avg val rec,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name
0,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,40,118,0.0,45.806785,6.589513,0.0,31.22,1.200681,0.0,0.0,0.0,810,865.691093,586.466675,0.0,-279.224419,265.0,35588540.0,35603170.0,0.0,30.0,54.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,15000000.0,265586.1476,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.0,2.613269,0.385685,0.0,1.8,0.032844,0.0,0.0,0.0,102,3.087297,3.085478,0.0,-0.001819,8.0,403.4283,2.260809,0.0,1.0,5.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,365.0,57.632615,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,0.113119,1.165453,0.358906,0.05,3.538616,1.794308,0.0,0.0,0.0,12,3.588616,3.589057,0.0,0.000441,8.0,521.5121,0.0,0.0,0.0,7.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,442.8198,65.189009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
3,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,7,13,0.0,500.0,99.48884,0.0,450.0,70.001834,0.0,0.0,0.0,34,1750.045862,895.399559,0.0,-854.646303,14.0,17111.05,11412.23,0.0,2.0,11.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,11412.23,1555.550174,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.0,12.802411,2.671095,0.0,9.0,0.022688,0.0,0.0,0.0,4619,104.318883,53.421896,0.0,-50.896986,42.0,162829.7,123539.9,0.0,4.0,23.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,90000.0,4934.232147,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0


#### Feature Engineering

In [7]:
## None

## Model
- Classify without oversampling
- Classify with oversampling
- Classify with oversampling + Dimensionality reduction

In [8]:
# Star import: the cells below rely on setup, compare_models, create_model,
# tune_model, predict_model and evaluate_model coming from this module.
# NOTE(review): consider importing those names explicitly and moving this
# line into the import cell at the top of the notebook.
from pycaret.classification import *

### Classify without oversampling

In [9]:
# Experiment 1: preprocessing pipeline without oversampling and without PCA.
model_1 = setup(
    data=df,
    target=label,
    session_id=17,                       # fixed session id for repeatable runs
    normalize=True,
    ignore_low_variance=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    # fix_imbalance=True,                # disabled here: SMOTE (imblearn.over_sampling.SMOTE)
    # pca=True,                          # disabled here: the 'linear' method uses Singular Value Decomposition
    ignore_features=['Address'],
    numeric_features=numeric_features,
)

Unnamed: 0,Description,Value
0,session_id,17
1,Target,FLAG
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(8963, 47)"
5,Missing Values,False
6,Numeric Features,45
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [10]:
# Cross-validate PyCaret's candidate estimators under the current setup and
# display the leaderboard; the returned object is kept in model_compare_1.
model_compare_1 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9836,0.9975,0.9282,0.9638,0.9454,0.9357,0.9361,0.212
xgboost,Extreme Gradient Boosting,0.9812,0.9968,0.9188,0.9571,0.9374,0.9263,0.9267,0.337
gbc,Gradient Boosting Classifier,0.9796,0.996,0.9011,0.9636,0.9312,0.9192,0.9201,0.444
rf,Random Forest Classifier,0.9786,0.9961,0.8876,0.9709,0.9271,0.9146,0.9161,0.141
ada,Ada Boost Classifier,0.9775,0.9942,0.9095,0.9424,0.9253,0.9121,0.9125,0.11
et,Extra Trees Classifier,0.9748,0.9947,0.8731,0.9593,0.9139,0.8992,0.9007,0.108
dt,Decision Tree Classifier,0.9638,0.9318,0.8856,0.8798,0.8822,0.8609,0.8612,0.013
knn,K Neighbors Classifier,0.952,0.9449,0.796,0.8801,0.8355,0.8075,0.8092,0.038
lr,Logistic Regression,0.8492,0.8375,0.0271,0.6881,0.0516,0.041,0.1134,0.486
lda,Linear Discriminant Analysis,0.8468,0.7936,0.0062,0.3167,0.0122,0.0085,0.0332,0.015


In [11]:
# Train a LightGBM classifier and report its 10-fold cross-validation metrics.
lightgbm = create_model('lightgbm', fold=10)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9793,0.9976,0.8854,0.977,0.929,0.9169,0.9184
1,0.9793,0.9953,0.9271,0.9368,0.9319,0.9197,0.9197
2,0.9889,0.9983,0.9479,0.9785,0.963,0.9564,0.9566
3,0.9841,0.9976,0.9175,0.978,0.9468,0.9375,0.9381
4,0.9888,0.9982,0.9271,1.0,0.9622,0.9556,0.9566
5,0.9825,0.9971,0.9479,0.9381,0.943,0.9326,0.9327
6,0.9872,0.9984,0.9375,0.9783,0.9574,0.9499,0.9502
7,0.9777,0.9961,0.9062,0.9457,0.9255,0.9124,0.9127
8,0.9809,0.9977,0.9167,0.9565,0.9362,0.9249,0.9252
9,0.9872,0.9988,0.9688,0.949,0.9588,0.9512,0.9513


In [12]:
# Hyperparameter search for the LightGBM model, selecting candidates by F1.
tuned_lightgbm_1 = tune_model(lightgbm, optimize='F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9841,0.9977,0.9062,0.9886,0.9457,0.9363,0.9376
1,0.9729,0.996,0.8958,0.9247,0.9101,0.8941,0.8943
2,0.9857,0.9976,0.9167,0.9888,0.9514,0.943,0.9439
3,0.9809,0.9971,0.8969,0.9775,0.9355,0.9243,0.9255
4,0.9888,0.9988,0.9271,1.0,0.9622,0.9556,0.9566
5,0.9841,0.9969,0.9375,0.9574,0.9474,0.938,0.938
6,0.9809,0.9973,0.8854,0.9884,0.9341,0.9229,0.9248
7,0.9745,0.995,0.8854,0.9444,0.914,0.899,0.8997
8,0.9841,0.9979,0.9271,0.9674,0.9468,0.9374,0.9377
9,0.9856,0.9983,0.9375,0.9677,0.9524,0.9439,0.9441


In [13]:
# No data argument is passed, so PyCaret scores the hold-out split created by
# setup(); the result adds Label and Score columns next to the true FLAG.
predict_model(tuned_lightgbm_1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.987,0.9981,0.9254,0.9836,0.9536,0.9461,0.9467


Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,max value received,...,ERC20 uniq sent addr,ERC20 uniq rec addr,ERC20 min val rec,ERC20 max val rec,ERC20 avg val rec,ERC20 uniq sent token name,ERC20 uniq rec token name,FLAG,Label,Score
0,5.124037,-0.347117,0.419752,-0.153240,-0.178861,-0.032525,-0.102902,-0.091350,-0.139033,-0.039985,...,-0.040159,-0.073227,0.007540,-0.047344,-0.023980,-0.044405,-0.220802,0,0,0.9857
1,-0.245775,-0.335274,-0.708669,-0.154489,-0.177848,-0.032525,-0.099400,-0.099404,-0.139558,-0.039988,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9995
2,-0.247036,-0.347117,-0.710608,-0.155738,-0.178861,-0.032525,-0.102902,-0.099404,-0.139759,-0.040002,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9996
3,-0.245949,-0.347105,-0.710414,-0.153240,-0.177848,-0.032525,-0.099400,-0.091350,-0.123691,-0.033345,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9998
4,2.021596,-0.253129,1.610237,-0.139501,-0.166701,-0.032525,-0.102902,-0.095377,-0.139513,-0.039516,...,-0.040159,-0.073227,-0.030382,-0.047346,-0.023985,-0.044405,-0.220802,0,0,0.9984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2684,3.866354,-0.346420,-0.133402,-0.154489,-0.177848,-0.032525,-0.099400,-0.095377,-0.139050,-0.039978,...,-0.050041,-0.062003,-0.030408,-0.047346,-0.023985,-0.193148,-0.161013,0,0,0.9331
2685,-0.247036,0.059330,-0.309541,-0.156987,-0.166701,-0.026879,-0.092395,-0.103431,-0.139798,-0.030952,...,-0.050041,-0.039555,-0.030357,-0.047344,-0.023984,-0.193148,-0.041435,0,0,0.9973
2686,-0.245086,-0.347117,-0.709823,-0.149493,-0.178861,-0.032525,-0.102902,-0.079269,29.835539,0.656194,...,-0.050041,0.263498,-0.030409,-0.047278,-0.023978,-0.193148,1.931599,1,1,0.9497
2687,-0.200461,-0.210906,0.779223,0.052847,-0.087656,-0.032525,-0.043359,0.009326,-0.139647,-0.038471,...,-0.000633,0.375739,-0.030409,-0.046328,-0.023946,0.253081,2.529489,0,0,0.9833


In [14]:
# Open PyCaret's interactive plot-type selector for the tuned LightGBM model.
evaluate_model(tuned_lightgbm_1)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [15]:
# plot_model(tuned_lightgbm_1, plot = 'parameter')

In [16]:
# Train an XGBoost classifier and report its 10-fold cross-validation metrics.
xgboost_1 = create_model('xgboost', fold=10)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9777,0.9976,0.8958,0.9556,0.9247,0.9117,0.9123
1,0.9777,0.9919,0.9167,0.9362,0.9263,0.9132,0.9133
2,0.9873,0.9969,0.9375,0.9783,0.9574,0.95,0.9503
3,0.9793,0.9962,0.9072,0.9565,0.9312,0.919,0.9195
4,0.9872,0.9979,0.9167,1.0,0.9565,0.9491,0.9503
5,0.9825,0.998,0.9479,0.9381,0.943,0.9326,0.9327
6,0.9841,0.9985,0.9271,0.9674,0.9468,0.9374,0.9377
7,0.9697,0.9948,0.8854,0.914,0.8995,0.8816,0.8818
8,0.9809,0.9982,0.9062,0.9667,0.9355,0.9243,0.9249
9,0.9856,0.9984,0.9479,0.9579,0.9529,0.9444,0.9444


In [17]:
# Hyperparameter search for the XGBoost model, selecting candidates by F1.
tuned_xgboost_1 = tune_model(xgboost_1, optimize='F1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9841,0.997,0.9479,0.9479,0.9479,0.9385,0.9385
1,0.9713,0.9945,0.9167,0.898,0.9072,0.8903,0.8903
2,0.9936,0.9967,0.9688,0.9894,0.9789,0.9752,0.9753
3,0.9793,0.9966,0.9381,0.9286,0.9333,0.9211,0.9211
4,0.9872,0.998,0.9479,0.9681,0.9579,0.9504,0.9505
5,0.9809,0.9964,0.9688,0.9118,0.9394,0.928,0.9286
6,0.9872,0.9985,0.9583,0.9583,0.9583,0.9508,0.9508
7,0.9713,0.9964,0.9167,0.898,0.9072,0.8902,0.8903
8,0.9856,0.9983,0.9688,0.9394,0.9538,0.9453,0.9455
9,0.9793,0.9978,0.9688,0.9029,0.9347,0.9224,0.9232


In [18]:
# No data argument is passed, so PyCaret scores the hold-out split created by
# setup(); the result adds Label and Score columns next to the true FLAG.
predict_model(tuned_xgboost_1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9851,0.9974,0.9692,0.9309,0.9496,0.9409,0.9412


Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,max value received,...,ERC20 uniq sent addr,ERC20 uniq rec addr,ERC20 min val rec,ERC20 max val rec,ERC20 avg val rec,ERC20 uniq sent token name,ERC20 uniq rec token name,FLAG,Label,Score
0,5.124037,-0.347117,0.419752,-0.153240,-0.178861,-0.032525,-0.102902,-0.091350,-0.139033,-0.039985,...,-0.040159,-0.073227,0.007540,-0.047344,-0.023980,-0.044405,-0.220802,0,0,0.9973
1,-0.245775,-0.335274,-0.708669,-0.154489,-0.177848,-0.032525,-0.099400,-0.099404,-0.139558,-0.039988,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9990
2,-0.247036,-0.347117,-0.710608,-0.155738,-0.178861,-0.032525,-0.102902,-0.099404,-0.139759,-0.040002,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9994
3,-0.245949,-0.347105,-0.710414,-0.153240,-0.177848,-0.032525,-0.099400,-0.091350,-0.123691,-0.033345,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9999
4,2.021596,-0.253129,1.610237,-0.139501,-0.166701,-0.032525,-0.102902,-0.095377,-0.139513,-0.039516,...,-0.040159,-0.073227,-0.030382,-0.047346,-0.023985,-0.044405,-0.220802,0,0,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2684,3.866354,-0.346420,-0.133402,-0.154489,-0.177848,-0.032525,-0.099400,-0.095377,-0.139050,-0.039978,...,-0.050041,-0.062003,-0.030408,-0.047346,-0.023985,-0.193148,-0.161013,0,0,0.8153
2685,-0.247036,0.059330,-0.309541,-0.156987,-0.166701,-0.026879,-0.092395,-0.103431,-0.139798,-0.030952,...,-0.050041,-0.039555,-0.030357,-0.047344,-0.023984,-0.193148,-0.041435,0,0,0.9998
2686,-0.245086,-0.347117,-0.709823,-0.149493,-0.178861,-0.032525,-0.102902,-0.079269,29.835539,0.656194,...,-0.050041,0.263498,-0.030409,-0.047278,-0.023978,-0.193148,1.931599,1,1,0.9921
2687,-0.200461,-0.210906,0.779223,0.052847,-0.087656,-0.032525,-0.043359,0.009326,-0.139647,-0.038471,...,-0.000633,0.375739,-0.030409,-0.046328,-0.023946,0.253081,2.529489,0,0,0.9996


### Classify with oversampling

In [19]:
# Experiment 2: preprocessing pipeline with class-imbalance correction enabled, PCA off.
model_2 = setup(
    data=df,
    target=label,
    session_id=17,                       # fixed session id for repeatable runs
    normalize=True,
    ignore_low_variance=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    fix_imbalance=True,                  # SMOTE (imblearn.over_sampling.SMOTE)
    # pca=True,                          # disabled here: the 'linear' method uses Singular Value Decomposition
    ignore_features=['Address'],
    numeric_features=numeric_features,
)

Unnamed: 0,Description,Value
0,session_id,17
1,Target,FLAG
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(8963, 47)"
5,Missing Values,False
6,Numeric Features,45
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [20]:
# Cross-validate PyCaret's candidate estimators under the current setup and
# display the leaderboard; the returned object is kept in model_compare_2.
model_compare_2 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9815,0.9969,0.9418,0.9389,0.94,0.929,0.9293,0.626
lightgbm,Light Gradient Boosting Machine,0.9814,0.9971,0.947,0.9332,0.9397,0.9287,0.929,0.214
rf,Random Forest Classifier,0.9796,0.9963,0.9251,0.9413,0.9328,0.9208,0.9211,0.278
et,Extra Trees Classifier,0.9755,0.9942,0.9022,0.9357,0.9184,0.9039,0.9043,0.185
gbc,Gradient Boosting Classifier,0.9721,0.9955,0.9501,0.8794,0.9129,0.8963,0.8976,0.844
ada,Ada Boost Classifier,0.9683,0.9939,0.9501,0.8602,0.9023,0.8834,0.8854,0.2
dt,Decision Tree Classifier,0.9613,0.9392,0.9074,0.8511,0.8779,0.8549,0.8558,0.034
knn,K Neighbors Classifier,0.9229,0.9524,0.8917,0.695,0.7803,0.7345,0.7435,0.057
svm,SVM - Linear Kernel,0.6356,0.0,0.8876,0.2845,0.4299,0.2572,0.3464,0.022
lr,Logistic Regression,0.6219,0.8475,0.8668,0.2741,0.4153,0.2373,0.3223,0.042


In [21]:
# XGBoost under the current setup: 10-fold CV fit, F1-driven tuning, then
# scoring of the hold-out split (predict_model is called without new data).
xgboost_2 = create_model('xgboost', fold=10)
tuned_xgboost_2 = tune_model(xgboost_2, optimize='F1')
predict_model(tuned_xgboost_2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9761,0.9965,0.8854,0.9551,0.9189,0.9049,0.9058
1,0.9777,0.9958,0.9062,0.9457,0.9255,0.9124,0.9127
2,0.9857,0.9972,0.9271,0.978,0.9519,0.9435,0.9439
3,0.9825,0.9951,0.8969,0.9886,0.9405,0.9303,0.9318
4,0.9841,0.9964,0.9062,0.9886,0.9457,0.9363,0.9375
5,0.9793,0.9962,0.9271,0.9368,0.9319,0.9197,0.9197
6,0.9841,0.9976,0.9167,0.9778,0.9462,0.9369,0.9376
7,0.9761,0.9953,0.8958,0.9451,0.9198,0.9057,0.9062
8,0.9809,0.9966,0.9062,0.9667,0.9355,0.9243,0.9249
9,0.9841,0.997,0.9375,0.9574,0.9474,0.938,0.938


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9833,0.9969,0.9075,0.9751,0.9401,0.9304,0.9312


Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,max value received,...,ERC20 uniq sent addr,ERC20 uniq rec addr,ERC20 min val rec,ERC20 max val rec,ERC20 avg val rec,ERC20 uniq sent token name,ERC20 uniq rec token name,FLAG,Label,Score
0,5.124037,-0.347117,0.419752,-0.153240,-0.178861,-0.032525,-0.102902,-0.091350,-0.139033,-0.039985,...,-0.040159,-0.073227,0.007540,-0.047344,-0.023980,-0.044405,-0.220802,0,0,0.9902
1,-0.245775,-0.335274,-0.708669,-0.154489,-0.177848,-0.032525,-0.099400,-0.099404,-0.139558,-0.039988,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9993
2,-0.247036,-0.347117,-0.710608,-0.155738,-0.178861,-0.032525,-0.102902,-0.099404,-0.139759,-0.040002,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9994
3,-0.245949,-0.347105,-0.710414,-0.153240,-0.177848,-0.032525,-0.099400,-0.091350,-0.123691,-0.033345,...,-0.050041,-0.084451,-0.030409,-0.047346,-0.023985,-0.193148,-0.280591,0,0,0.9996
4,2.021596,-0.253129,1.610237,-0.139501,-0.166701,-0.032525,-0.102902,-0.095377,-0.139513,-0.039516,...,-0.040159,-0.073227,-0.030382,-0.047346,-0.023985,-0.044405,-0.220802,0,0,0.9994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2684,3.866354,-0.346420,-0.133402,-0.154489,-0.177848,-0.032525,-0.099400,-0.095377,-0.139050,-0.039978,...,-0.050041,-0.062003,-0.030408,-0.047346,-0.023985,-0.193148,-0.161013,0,0,0.7014
2685,-0.247036,0.059330,-0.309541,-0.156987,-0.166701,-0.026879,-0.092395,-0.103431,-0.139798,-0.030952,...,-0.050041,-0.039555,-0.030357,-0.047344,-0.023984,-0.193148,-0.041435,0,0,0.9983
2686,-0.245086,-0.347117,-0.709823,-0.149493,-0.178861,-0.032525,-0.102902,-0.079269,29.835539,0.656194,...,-0.050041,0.263498,-0.030409,-0.047278,-0.023978,-0.193148,1.931599,1,1,0.7480
2687,-0.200461,-0.210906,0.779223,0.052847,-0.087656,-0.032525,-0.043359,0.009326,-0.139647,-0.038471,...,-0.000633,0.375739,-0.030409,-0.046328,-0.023946,0.253081,2.529489,0,0,0.9929


### Classify with oversampling + Dimensionality reduction

In [22]:
# Experiment 3: class-imbalance correction plus PCA dimensionality reduction.
model_3 = setup(
    data=df,
    target=label,
    session_id=17,                       # fixed session id for repeatable runs
    normalize=True,
    ignore_low_variance=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    fix_imbalance=True,                  # SMOTE (imblearn.over_sampling.SMOTE)
    pca=True,                            # the 'linear' method uses Singular Value Decomposition
    ignore_features=['Address'],
    numeric_features=numeric_features,
)

Unnamed: 0,Description,Value
0,session_id,17
1,Target,FLAG
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(8963, 47)"
5,Missing Values,False
6,Numeric Features,45
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [23]:
# Cross-validate PyCaret's candidate estimators under the current setup and
# display the leaderboard; the returned object is kept in model_compare_3.
model_compare_3 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9629,0.9871,0.8928,0.8698,0.8806,0.8586,0.8591,1.111
lightgbm,Light Gradient Boosting Machine,0.9609,0.9868,0.9001,0.8554,0.8763,0.8532,0.8542,0.16
rf,Random Forest Classifier,0.9608,0.9862,0.8845,0.8654,0.8739,0.8507,0.8514,0.379
et,Extra Trees Classifier,0.9579,0.9836,0.8647,0.8625,0.863,0.8381,0.8386,0.214
gbc,Gradient Boosting Classifier,0.9468,0.9819,0.9136,0.7797,0.8406,0.809,0.8132,1.318
dt,Decision Tree Classifier,0.9367,0.9085,0.8678,0.7574,0.8082,0.7706,0.7736,0.051
knn,K Neighbors Classifier,0.9221,0.9512,0.897,0.6904,0.7795,0.7332,0.743,0.03
ada,Ada Boost Classifier,0.9192,0.9699,0.9074,0.6788,0.7755,0.7277,0.7399,0.29
svm,SVM - Linear Kernel,0.6388,0.0,0.8813,0.2859,0.4307,0.2587,0.3457,0.018
lr,Logistic Regression,0.6223,0.8482,0.8668,0.2742,0.4155,0.2375,0.3226,0.04


In [24]:
# XGBoost under the current setup: 10-fold CV fit, F1-driven tuning, then
# scoring of the hold-out split (predict_model is called without new data).
xgboost_3 = create_model('xgboost', fold=10)
tuned_xgboost_3 = tune_model(xgboost_3, optimize='F1')
predict_model(tuned_xgboost_3)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.965,0.9843,0.8333,0.9302,0.8791,0.8587,0.8605
1,0.9666,0.9902,0.8438,0.931,0.8852,0.8657,0.8672
2,0.9713,0.9889,0.8958,0.9149,0.9053,0.8884,0.8884
3,0.9729,0.9883,0.8454,0.9762,0.9061,0.8904,0.8935
4,0.9745,0.9917,0.8958,0.9348,0.9149,0.8999,0.9002
5,0.9585,0.9868,0.8646,0.8646,0.8646,0.8401,0.8401
6,0.9601,0.9859,0.8438,0.8901,0.8663,0.8429,0.8433
7,0.9665,0.9771,0.8542,0.9213,0.8865,0.8669,0.8677
8,0.9553,0.9835,0.7604,0.9359,0.8391,0.8135,0.8194
9,0.9633,0.9895,0.875,0.8842,0.8796,0.8579,0.858


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9721,0.9903,0.8612,0.941,0.8993,0.8832,0.8843


Unnamed: 0,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9,Component_10,...,Component_16,Component_17,Component_18,Component_19,Component_20,Component_21,Component_22,FLAG,Label,Score
0,-0.415870,-0.334385,0.037659,0.068518,-0.020370,-0.901749,0.054700,2.013131,0.983236,0.844984,...,0.897444,0.035069,0.095958,-0.246551,0.129878,0.009322,-0.139002,0,0,0.9903
1,-0.531787,0.065364,0.078034,-0.164646,0.140451,0.475496,0.016237,-0.547534,-0.100361,-0.018298,...,0.190424,-0.022246,-0.022740,-0.094653,-0.002909,-0.001209,-0.013774,0,0,0.9996
2,-0.533818,0.064484,0.079238,-0.166139,0.140747,0.478491,0.017934,-0.555120,-0.104433,-0.017436,...,0.185029,-0.023072,-0.023119,-0.094801,-0.003559,0.000425,-0.013603,0,0,0.9994
3,-0.508462,0.114794,0.068354,-0.173023,0.087032,0.500001,0.059264,-0.530955,-0.110833,-0.018516,...,0.165020,-0.001909,0.033994,-0.034310,-0.005558,-0.002508,-0.008623,0,0,0.9990
4,-0.250157,-0.338390,0.009617,0.161146,0.063444,-0.890899,-0.103010,1.409134,0.503395,0.297654,...,-0.674022,-0.110620,-0.383055,0.092471,0.071330,0.071079,-0.012353,0,0,0.9910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2684,-0.469826,-0.195765,0.060499,-0.024716,0.017034,-0.449127,0.058225,1.287752,0.724536,0.619014,...,0.914143,0.008279,0.134621,-0.234659,0.044546,-0.016752,-0.072824,0,0,0.5606
2685,-0.408767,-0.059160,0.071259,-0.110433,0.091105,0.146647,0.001194,-0.200836,0.027801,-0.059199,...,0.143339,-0.049902,-0.047722,-0.046068,-0.075822,-0.011380,0.046880,0,0,0.9528
2686,0.856824,2.984515,0.964035,-2.609184,-8.321967,3.642140,6.232911,-2.245293,-1.850021,-2.475883,...,2.019238,-0.274525,-4.524455,-1.403169,0.365056,-0.421868,0.077422,1,1,0.8489
2687,0.897139,-1.000252,-0.101271,0.076274,-0.522322,-1.361037,0.278742,-0.256558,-0.554862,-0.042676,...,-0.324679,-0.215741,-0.117277,0.181303,-0.586923,-0.145990,0.476172,0,0,0.9739


In [25]:
# Open PyCaret's interactive plot-type selector for the tuned XGBoost model.
evaluate_model(tuned_xgboost_3)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…