#### Packages

In [1]:
import pandas as pd
from pycaret.classification import *
import os
from sklearn.metrics import average_precision_score
from sklearn.utils import shuffle
from model_utils import *
import mlflow

In [2]:
mlflow.set_tracking_uri("http://localhost:5000")

#### Read the Dataset

In [5]:
df = pd.read_csv('dataset' + os.sep + 'creditcard.csv', encoding_errors='ignore', on_bad_lines='skip')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

#### Split the data into train and test set

In [7]:
df_train, df_test = get_raw_data()

In [8]:
df_train.target.value_counts(), df_test.target.value_counts()

(0    1916
 1     321
 Name: target, dtype: int64,
 0    932
 1    171
 Name: target, dtype: int64)

In [9]:
df_train.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'target'],
      dtype='object')

#### Training different models with different data preprocessing

In [10]:
numeric_feature_list = df_train.columns[0:-1].to_list()
numeric_feature_list

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount']

In [None]:
clf1 = setup(silent=True,
             data = df_train, 
             target = 'target',
             numeric_features=numeric_feature_list,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'baseline')

add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

#### Fixing imbalance

In [None]:
clf1 = setup(silent=True,
             data = df_train, 
             target = 'target',
             numeric_features=numeric_feature_list,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'fixing imbalance',
             fix_imbalance = True, 
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
rf,Random Forest Classifier,0.9789,0.9822,0.8845,0.9788,0.9287,0.9163,0.9183,0.9588,4.406
gbc,Gradient Boosting Classifier,0.977,0.9754,0.9012,0.95,0.9236,0.9101,0.9114,0.9561,14.063
et,Extra Trees Classifier,0.9802,0.9781,0.8845,0.9868,0.9322,0.9207,0.923,0.9558,2.812
ada,Ada Boost Classifier,0.9674,0.9739,0.8885,0.9022,0.8948,0.8755,0.8759,0.9457,4.154
lr,Logistic Regression,0.968,0.9686,0.8927,0.9028,0.8967,0.8778,0.8785,0.9452,0.984
lda,Linear Discriminant Analysis,0.9655,0.9691,0.8263,0.9439,0.8801,0.8601,0.8634,0.9245,0.218
nb,Naive Bayes,0.9578,0.968,0.7852,0.9382,0.8512,0.8269,0.8337,0.9161,0.224
qda,Quadratic Discriminant Analysis,0.9489,0.9644,0.9012,0.7993,0.846,0.8155,0.8185,0.8775,0.24
dt,Decision Tree Classifier,0.9559,0.9249,0.88,0.8462,0.8609,0.8348,0.8363,0.7631,0.776
knn,K Neighbors Classifier,0.6927,0.6711,0.5498,0.2636,0.3551,0.1855,0.2081,0.2794,0.493


#### Removing outliers

In [None]:
clf1 = setup(data = df_train, 
             target = 'target', 
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'removing outliers',
             remove_outliers = True
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
lda,Linear Discriminant Analysis,0.9711,0.9775,0.7375,0.9917,0.8434,0.828,0.8404,0.9311,0.104
et,Extra Trees Classifier,0.9798,0.9742,0.8125,1.0,0.8944,0.8835,0.8905,0.9267,0.548
lightgbm,Light Gradient Boosting Machine,0.9805,0.9669,0.8312,0.9862,0.9,0.8894,0.8946,0.9198,1.122
gbc,Gradient Boosting Classifier,0.9764,0.9624,0.8375,0.943,0.8831,0.8702,0.8744,0.9187,3.273
rf,Random Forest Classifier,0.9785,0.9688,0.8062,0.9923,0.8875,0.8758,0.8828,0.9115,1.262
ada,Ada Boost Classifier,0.9724,0.9619,0.8062,0.9316,0.8612,0.846,0.8506,0.9103,0.941
nb,Naive Bayes,0.9664,0.9684,0.75,0.9266,0.827,0.8086,0.8153,0.9019,0.05
lr,Logistic Regression,0.9738,0.9548,0.825,0.9233,0.8696,0.8551,0.8578,0.9011,0.166
qda,Quadratic Discriminant Analysis,0.9415,0.9644,0.8688,0.6844,0.7625,0.7299,0.7386,0.843,0.044
dt,Decision Tree Classifier,0.9643,0.9058,0.8312,0.8411,0.8331,0.8133,0.8151,0.7202,0.18


#### Transformation

In [None]:
clf1 = setup(data = df_train, 
             target = 'target', 
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'transformation',
             transformation = True, 
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
et,Extra Trees Classifier,0.9744,0.9831,0.8409,0.9907,0.9085,0.8938,0.8986,0.9544,1.361
lightgbm,Light Gradient Boosting Machine,0.9751,0.9799,0.8576,0.9765,0.9115,0.8971,0.9008,0.953,1.423
ada,Ada Boost Classifier,0.9738,0.9792,0.8743,0.9504,0.91,0.8947,0.8963,0.9519,1.584
lr,Logistic Regression,0.9751,0.9789,0.8659,0.9684,0.9131,0.8986,0.9013,0.9518,0.109
rf,Random Forest Classifier,0.9744,0.9796,0.8449,0.9847,0.9085,0.8938,0.898,0.9507,2.883
gbc,Gradient Boosting Classifier,0.9668,0.978,0.8493,0.9275,0.8857,0.8663,0.8681,0.9484,4.604
lda,Linear Discriminant Analysis,0.9706,0.975,0.8283,0.9763,0.895,0.8781,0.8829,0.9444,0.118
knn,K Neighbors Classifier,0.9687,0.942,0.8156,0.9756,0.8865,0.8686,0.8745,0.8908,0.367
qda,Quadratic Discriminant Analysis,0.952,0.9644,0.8701,0.8327,0.8486,0.8203,0.8222,0.87,0.053
nb,Naive Bayes,0.9489,0.9641,0.8324,0.84,0.8335,0.8034,0.8052,0.8165,0.051


#### Feature interaction

In [None]:
clf1 = setup(data = df_train, 
             target = 'target',
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'feature interaction', 
             feature_interaction = True, feature_ratio = True, 
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9802,0.9834,0.8799,0.9863,0.9283,0.9169,0.9201,0.9613,2.08
lda,Linear Discriminant Analysis,0.9764,0.9809,0.8496,0.9905,0.9128,0.8993,0.904,0.9578,0.123
gbc,Gradient Boosting Classifier,0.9802,0.9775,0.8884,0.9778,0.9288,0.9174,0.9202,0.9546,9.543
rf,Random Forest Classifier,0.9802,0.9757,0.8884,0.976,0.9289,0.9175,0.9197,0.9528,2.519
et,Extra Trees Classifier,0.9815,0.9757,0.8884,0.9861,0.9331,0.9224,0.9252,0.9513,0.791
ada,Ada Boost Classifier,0.9764,0.9652,0.8839,0.9534,0.9142,0.9007,0.9035,0.9475,1.781
lr,Logistic Regression,0.9751,0.9697,0.8926,0.9418,0.9131,0.8986,0.9013,0.9393,0.802
knn,K Neighbors Classifier,0.9757,0.9597,0.8712,0.9646,0.9138,0.8998,0.9025,0.9116,0.277
nb,Naive Bayes,0.9425,0.9625,0.6471,0.9515,0.7612,0.7309,0.7531,0.8904,0.092
dt,Decision Tree Classifier,0.9604,0.9307,0.8884,0.8571,0.8694,0.8462,0.8485,0.7773,0.622


#### Polynomial features

In [None]:
clf1 = setup(data = df_train, 
             target = 'target', 
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'polynomial features',
             polynomial_features = True,
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
lda,Linear Discriminant Analysis,0.9757,0.9814,0.8368,0.9894,0.9056,0.8918,0.8966,0.9539,0.109
lightgbm,Light Gradient Boosting Machine,0.9796,0.9729,0.864,0.9902,0.9207,0.9091,0.9132,0.9462,1.427
rf,Random Forest Classifier,0.9783,0.9671,0.8595,0.9847,0.9164,0.9041,0.9077,0.9381,2.115
et,Extra Trees Classifier,0.9783,0.9695,0.8504,0.995,0.9156,0.9033,0.9078,0.9377,1.577
ada,Ada Boost Classifier,0.9757,0.9656,0.8731,0.9535,0.9094,0.8955,0.898,0.9356,1.231
lr,Logistic Regression,0.9713,0.9693,0.864,0.9314,0.8945,0.8779,0.88,0.9331,0.802
gbc,Gradient Boosting Classifier,0.9706,0.9658,0.8595,0.9262,0.8911,0.8741,0.8753,0.9272,3.473
nb,Naive Bayes,0.9693,0.97,0.8458,0.9364,0.8856,0.868,0.8715,0.8988,0.038
qda,Quadratic Discriminant Analysis,0.9598,0.9675,0.8868,0.8464,0.863,0.8396,0.842,0.874,0.063
dt,Decision Tree Classifier,0.9508,0.905,0.8413,0.8206,0.826,0.7975,0.8007,0.7108,0.161


#### Feature selection

In [None]:
clf1 = setup(data = df_train, 
             target = 'target',
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'feature selection',
             feature_selection = True, feature_selection_threshold = 0.5,
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.977,0.978,0.8682,0.9813,0.9205,0.9072,0.91,0.9538,0.814
gbc,Gradient Boosting Classifier,0.9713,0.9763,0.868,0.9426,0.903,0.8861,0.8877,0.9514,2.175
et,Extra Trees Classifier,0.9783,0.9743,0.8598,1.0,0.9234,0.9108,0.9152,0.9511,1.075
rf,Random Forest Classifier,0.9757,0.9738,0.8597,0.981,0.9157,0.9016,0.9047,0.9495,2.242
lda,Linear Discriminant Analysis,0.9572,0.9757,0.728,0.9933,0.838,0.8143,0.8286,0.9468,0.05
lr,Logistic Regression,0.9706,0.9664,0.8763,0.93,0.9019,0.8847,0.8855,0.9361,0.265
ada,Ada Boost Classifier,0.9732,0.9626,0.868,0.9568,0.9091,0.8935,0.8956,0.933,1.562
nb,Naive Bayes,0.9547,0.9723,0.7527,0.9399,0.8345,0.8088,0.8163,0.9154,0.054
qda,Quadratic Discriminant Analysis,0.9495,0.9681,0.8927,0.81,0.8475,0.8175,0.8202,0.8831,0.097
dt,Decision Tree Classifier,0.9591,0.9218,0.8678,0.8756,0.8696,0.8455,0.8469,0.7822,0.126


### Removing multicollinearity

In [None]:
clf1 = setup(data = df_train, 
             target = 'target', 
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'removing multicollinearity',
             remove_multicollinearity = True, multicollinearity_threshold = 0.6,
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.977,0.9739,0.8545,0.9747,0.9084,0.8954,0.8993,0.945,0.705
et,Extra Trees Classifier,0.9757,0.9734,0.8407,0.9792,0.9017,0.8881,0.8932,0.9436,1.079
rf,Random Forest Classifier,0.9738,0.9697,0.8457,0.9596,0.8961,0.8813,0.8854,0.9379,1.997
gbc,Gradient Boosting Classifier,0.97,0.9645,0.8639,0.9157,0.8855,0.8683,0.871,0.9298,2.743
lda,Linear Discriminant Analysis,0.9674,0.9591,0.7749,0.9827,0.8625,0.8446,0.8545,0.9277,0.072
ada,Ada Boost Classifier,0.9732,0.9489,0.8541,0.9439,0.8946,0.8794,0.8821,0.9147,0.753
lr,Logistic Regression,0.9629,0.946,0.8221,0.9022,0.8571,0.836,0.8391,0.9055,0.12
nb,Naive Bayes,0.9431,0.9611,0.6574,0.9035,0.7543,0.7236,0.7389,0.8741,0.026
qda,Quadratic Discriminant Analysis,0.9476,0.9611,0.8734,0.7751,0.82,0.7895,0.7923,0.8121,0.064
dt,Decision Tree Classifier,0.954,0.9121,0.8545,0.8238,0.8355,0.8089,0.8113,0.7223,0.065


#### PCA

In [None]:
clf1 = setup(data = df_train, 
             target = 'target', 
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'pca',
             pca = True, pca_components = 10
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
lr,Logistic Regression,0.9776,0.9806,0.8708,0.9668,0.9129,0.9002,0.9039,0.9526,0.291
lda,Linear Discriminant Analysis,0.9591,0.9818,0.7089,0.9947,0.825,0.8028,0.8191,0.9492,0.039
gbc,Gradient Boosting Classifier,0.9757,0.9768,0.8617,0.9595,0.9055,0.8917,0.8949,0.9482,1.813
et,Extra Trees Classifier,0.9789,0.9776,0.8662,0.9807,0.9167,0.9048,0.9091,0.9482,0.887
rf,Random Forest Classifier,0.9757,0.9718,0.8617,0.9602,0.9056,0.8918,0.8952,0.9463,1.452
lightgbm,Light Gradient Boosting Machine,0.9764,0.9708,0.8617,0.9655,0.9077,0.8943,0.898,0.938,0.274
ada,Ada Boost Classifier,0.9713,0.9622,0.8617,0.929,0.8916,0.8751,0.8775,0.9271,0.627
nb,Naive Bayes,0.9604,0.9638,0.8061,0.9012,0.8479,0.8253,0.8289,0.889,0.045
qda,Quadratic Discriminant Analysis,0.954,0.9634,0.8524,0.8286,0.837,0.8103,0.8127,0.8608,0.04
dt,Decision Tree Classifier,0.9616,0.9252,0.8749,0.8569,0.8633,0.8411,0.8428,0.7694,0.077


#### Selected model

Keeping only performance increasing data preprocessing techniques

In [None]:
clf1 = setup(data = df_train, 
             target = 'target',
             numeric_features=numeric_feature_list,
             silent=True,
             log_experiment = True,
             use_gpu=False,
             experiment_name = 'selected_model',
             feature_interaction = True, feature_ratio = True, 
             fix_imbalance = True
            )
add_metric('apc', 'APC', average_precision_score, target = 'pred_proba')
best = compare_models(sort="APC")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
et,Extra Trees Classifier,0.9795,0.9847,0.8781,0.9804,0.9254,0.9136,0.9162,0.9604,1.13
rf,Random Forest Classifier,0.9763,0.982,0.8737,0.9622,0.915,0.9013,0.9033,0.9565,3.315
gbc,Gradient Boosting Classifier,0.97,0.9833,0.8735,0.9221,0.8954,0.8779,0.8795,0.9557,15.503
lightgbm,Light Gradient Boosting Machine,0.9744,0.9805,0.8605,0.9621,0.9075,0.8928,0.8952,0.9538,2.481
ada,Ada Boost Classifier,0.97,0.9731,0.8866,0.9091,0.8965,0.879,0.8799,0.9444,3.731
lr,Logistic Regression,0.9553,0.9726,0.9128,0.8134,0.8581,0.8317,0.8352,0.9372,5.031
knn,K Neighbors Classifier,0.954,0.9683,0.9174,0.8044,0.8552,0.8281,0.832,0.8982,0.321
lda,Linear Discriminant Analysis,0.9636,0.9709,0.8779,0.8752,0.8755,0.8542,0.8549,0.8962,0.27
nb,Naive Bayes,0.947,0.9547,0.6682,0.9586,0.7838,0.755,0.773,0.894,0.111
dt,Decision Tree Classifier,0.9457,0.9194,0.8822,0.7847,0.8276,0.7957,0.7997,0.7087,0.423
