In [9]:
import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, ReLU, Softmax
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Mean, Precision, Recall

from tensorflow_addons.metrics import F1Score

In [2]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Data Exploration

In [3]:
len(df)

284807

In [3]:
X = df.drop(['Class'], axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# for Softmax support
y_train_tensor = pd.get_dummies(y_train)
y_test_tensor = pd.get_dummies(y_test)

In [17]:
len(df[df['Class'] == 0]), len(df[df['Class'] == 1]), len(df[df['Class'] == 1]) * 100 / len(df['Class'])

(284315, 492, 0.1727485630620034)

The class distribution is imbalanced. So instead of accuracy, **F1 score** as a metric is better choice.

### Model Training and Testing

### XGBoost

In [15]:
xgbc = XGBClassifier()
parameters = {
    'max_depth': list(range(3, 11)),
}

gs = GridSearchCV(
    xgbc, 
    param_grid=parameters, 
    scoring='f1', 
    cv=5
)
gs.fit(X, y)

gs.best_params_

{'max_depth': 7}

In:
```python
xgbc = XGBClassifier()
parameters = {
    'max_depth': list(range(3, 11)),
}

gs = GridSearchCV(
    xgbc, 
    param_grid=parameters, 
    scoring='f1', 
    cv=5
)
gs.fit(X, y)

gs.best_params_
```

Out:
```python
{'max_depth': 7}
```

In [4]:
xgbc = XGBClassifier(max_depth=7)
xgbc.fit(X_train, y_train)

In [12]:
pred = xgbc.predict(X_test)
print(f'accuracy: {accuracy_score(y_test, pred)} - recall: {recall_score(y_test, pred)} - F1 Score: {f1_score(y_test, pred)}')

accuracy: 0.9996722961506501 - recall: 0.8308823529411765 - F1 Score: 0.8897637795275591


### Deep Learning Model

In [3]:
class DeepLearningModel(Model):
    def __init__(self) -> None:
        super(DeepLearningModel, self).__init__()
        self.fnn1 = Sequential()
        self.fnn1.add(Dense(29, use_bias=False))
        self.fnn1.add(BatchNormalization())
        self.fnn1.add(ReLU())
        self.fnn1.add(Dropout(0.1))

        self.fnn2 = Sequential()
        self.fnn2.add(Dense(50, use_bias=False))
        self.fnn2.add(BatchNormalization())
        self.fnn2.add(ReLU())
        self.fnn2.add(Dropout(0.2))

        self.fnn3 = Sequential()
        self.fnn3.add(Dense(70, use_bias=False))
        self.fnn3.add(BatchNormalization())
        self.fnn3.add(ReLU())
        self.fnn3.add(Dropout(0.2))

        self.fnn4 = Sequential()
        self.fnn4.add(Dense(70, use_bias=False))
        self.fnn4.add(BatchNormalization())
        self.fnn4.add(ReLU())
        self.fnn4.add(Dropout(0.2))

        self.fnn5 = Sequential()
        self.fnn5.add(Dense(40, use_bias=False))
        self.fnn5.add(BatchNormalization())
        self.fnn5.add(ReLU())
        self.fnn5.add(Dropout(0.1))

        self.fnn6 = Sequential()
        self.fnn6.add(Dense(30, use_bias=False))
        self.fnn6.add(BatchNormalization())
        self.fnn6.add(ReLU())
        self.fnn6.add(Dropout(0.1))

        self.fnn7 = Sequential()
        self.fnn7.add(Dense(10, use_bias=False))
        self.fnn7.add(ReLU())

        self.fnn8 = Sequential()
        self.fnn8.add(Dense(2))
        self.fnn8.add(Softmax())

    def call(self, x):
        x = self.fnn1(x)
        x = self.fnn2(x)
        x = self.fnn3(x)
        x = self.fnn4(x)
        x = self.fnn5(x)
        x = self.fnn6(x)
        x = self.fnn7(x)
        x = self.fnn8(x)

        return x

In [12]:
def get_deep_learning_model():
    model = Sequential()
    model.add(Dense(29, use_bias=False))
    model.add(BatchNormalization())
    model.add(ReLU())
    model.add(Dropout(0.1))
    
    model.add(Dense(50, use_bias=False, kernel_regularizer=L2(0.1)))
    model.add(BatchNormalization())
    model.add(ReLU())
    model.add(Dropout(0.2))

    model.add(Dense(70, use_bias=False, kernel_regularizer=L2(0.1)))
    model.add(BatchNormalization())
    model.add(ReLU())
    model.add(Dropout(0.2))

    model.add(Dense(70, use_bias=False, kernel_regularizer=L2(0.1)))
    model.add(BatchNormalization())
    model.add(ReLU())
    model.add(Dropout(0.2))

    model.add(Dense(40, use_bias=False, kernel_regularizer=L2(0.1)))
    model.add(BatchNormalization())
    model.add(ReLU())
    model.add(Dropout(0.1))

    model.add(Dense(30, use_bias=False))
    model.add(BatchNormalization())
    model.add(ReLU())
    model.add(Dropout(0.1))

    model.add(Dense(10, use_bias=False))
    model.add(BatchNormalization())
    model.add(ReLU())

    model.add(Dense(2, use_bias=True))
    model.add(Softmax())
    
    model.compile(
        optimizer='adam', 
        loss=BinaryCrossentropy(from_logits=True), 
        metrics=[
        'accuracy', 
        Recall(name='recall'), 
        F1Score(2, name='F1-Score')
        ]
    )

    return model

In [13]:
model = get_deep_learning_model()

model.fit(
    X_train, 
    y_train_tensor, 
    epochs=1,
    batch_size=256,
    validation_data=(X_test, y_test_tensor)
)



<keras.callbacks.History at 0x7fc6941c1ae0>

In [14]:
model.evaluate(X_test, y_test_tensor)



[0.5078839063644409,
 0.998408317565918,
 0.998408317565918,
 array([0.9992035, 0.       ], dtype=float32)]