## First, vectorize the CSV data

In [None]:
'''
The dataset contains transactions made by credit cards in September 2013 by European cardholders.

This dataset presents transactions that occurred over two days, where we have 492 frauds out of 284,807 transactions.
The dataset is highly imbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise
'''

In [1]:
import csv
import numpy as np

# Parse the CSV into two parallel lists: one row of float features and one
# single-element integer target row per transaction. The last column of each
# record is the label; every column before it is a feature.
all_features = []
all_targets = []
# newline="" lets the csv module do its own line-ending handling.
with open('creditcard.csv', newline="") as f:
    # Echo the raw header line so the column order is visible in the output.
    print("HEADER:", f.readline().strip())
    # csv.reader removes the quotes around quoted fields, so no manual
    # split(",") / replace('"', "") is needed.
    for fields in csv.reader(f):
        if not fields:
            continue  # Skip blank lines (e.g. a trailing newline at end of file)
        all_features.append([float(v) for v in fields[:-1]])
        all_targets.append([int(fields[-1])])
        if len(all_features) == 1:
            # Show the first parsed record as a sanity check.
            print("EXAMPLE FEATURES:", all_features[-1])

# float32 features and uint8 targets; targets keep a trailing axis of size 1.
features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]
features.shape: (284807, 30)
targets.shape: (284807, 1)


## Prepare a validation set

In [2]:
# Hold out the last 20% of the rows for validation; the rows before them are
# used for training. Rows are taken in their existing order (no shuffling).
num_val_samples = int(len(features) * 0.2)

# Slice on an explicit boundary index instead of on `-num_val_samples`:
# if num_val_samples is 0, `x[:-0]` is empty and `x[-0:]` is the whole array,
# which would silently swap the roles of the two sets.
split_index = len(features) - num_val_samples
train_features = features[:split_index]
train_targets = targets[:split_index]
val_features = features[split_index:]
val_targets = targets[split_index:]

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


## Analyze class imbalance in the targets

In [3]:
# Tally the training labels: counts[k] is the number of rows whose label is k.
counts = np.bincount(train_targets[:, 0])

num_positive = counts[1]
positive_pct = 100 * float(num_positive) / len(train_targets)
message = "Number of positive samples in training data: {} ({:.2f}% of total)"
print(message.format(num_positive, positive_pct))

# Per-class weights are the reciprocal of each class's sample count, so the
# class with fewer samples gets the larger weight.
weight_for_0, weight_for_1 = 1.0 / counts[0], 1.0 / counts[1]

Number of positive samples in training data: 417 (0.18% of total)


## Normalize the data using training set statistics

In [4]:
# Standardize every feature column with statistics computed from the training
# rows only; the same mean and std are then applied to the validation rows.
#
# NOTE: the arithmetic below is done in place. Re-running this cell without
# first re-running the split cell would normalize already-normalized data.
mean = np.mean(train_features, axis=0)
train_features -= mean
val_features -= mean
std = np.std(train_features, axis=0)
# A column that is constant over the training rows has std == 0, and dividing
# by it would fill the column with NaN/inf. Such a training column is already
# all zeros after centering, so dividing by 1 leaves it unchanged.
std[std == 0] = 1.0
train_features /= std
val_features /= std

print(mean)

[ 7.9042977e+04 -6.7173101e-02 -1.3514652e-02  1.8250896e-01
  4.3794613e-02 -6.3732401e-02  3.0533234e-02 -2.6844479e-02
  3.9848695e-03  2.2254344e-03 -1.7062010e-03  7.6269522e-02
 -4.4995107e-02  1.6710665e-02  3.2869387e-02  4.9116377e-02
 -5.5055786e-03  1.5153111e-02 -2.2870189e-02 -7.2876248e-03
  9.9466369e-03 -6.6186422e-03 -2.2909872e-02 -9.9138934e-03
  1.1062564e-03  3.8055412e-02  2.8393818e-03  2.2915885e-04
  1.9617653e-03  9.0817749e+01]


## Build a binary classification model

In [5]:
from tensorflow import keras

# Fully connected binary classifier: three 256-unit ReLU layers, with dropout
# (rate 0.3) after the second and third, and a single sigmoid output unit.
model = keras.Sequential(
    [
        # Declare the input width with an explicit Input object rather than the
        # `input_shape=` layer argument, which current Keras versions deprecate.
        keras.Input(shape=(train_features.shape[-1],)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               7936      
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 139,777
Trainable params: 139,777
Non-trai

## Train the model with `class_weight` argument

In [6]:
# Track the four confusion-matrix counts plus precision and recall, each under
# a short name that appears in the training log.
metrics = [
    metric_class(name=metric_name)
    for metric_name, metric_class in (
        ("fn", keras.metrics.FalseNegatives),
        ("fp", keras.metrics.FalsePositives),
        ("tn", keras.metrics.TrueNegatives),
        ("tp", keras.metrics.TruePositives),
        ("precision", keras.metrics.Precision),
        ("recall", keras.metrics.Recall),
    )
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2),
    loss="binary_crossentropy",
    metrics=metrics,
)

# Checkpoint callback; the file name template carries an `{epoch}` placeholder.
callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")]
# Map each label value to the weight computed from its training-set frequency.
class_weight = {0: weight_for_0, 1: weight_for_1}

# Left as the cell's last expression so the returned History object is shown.
model.fit(
    x=train_features,
    y=train_targets,
    batch_size=2048,
    epochs=30,
    verbose=2,
    callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)

Epoch 1/30
112/112 - 8s - loss: 2.2272e-06 - fn: 49.0000 - fp: 21437.0000 - tn: 205992.0000 - tp: 368.0000 - precision: 0.0169 - recall: 0.8825 - val_loss: 0.0692 - val_fn: 11.0000 - val_fp: 717.0000 - val_tn: 56169.0000 - val_tp: 64.0000 - val_precision: 0.0819 - val_recall: 0.8533 - 8s/epoch - 70ms/step
Epoch 2/30
112/112 - 5s - loss: 1.6411e-06 - fn: 33.0000 - fp: 8340.0000 - tn: 219089.0000 - tp: 384.0000 - precision: 0.0440 - recall: 0.9209 - val_loss: 0.0806 - val_fn: 11.0000 - val_fp: 230.0000 - val_tn: 56656.0000 - val_tp: 64.0000 - val_precision: 0.2177 - val_recall: 0.8533 - 5s/epoch - 44ms/step
Epoch 3/30
112/112 - 5s - loss: 1.2940e-06 - fn: 30.0000 - fp: 7221.0000 - tn: 220208.0000 - tp: 387.0000 - precision: 0.0509 - recall: 0.9281 - val_loss: 0.1146 - val_fn: 6.0000 - val_fp: 2270.0000 - val_tn: 54616.0000 - val_tp: 69.0000 - val_precision: 0.0295 - val_recall: 0.9200 - 5s/epoch - 44ms/step
Epoch 4/30
112/112 - 4s - loss: 1.0561e-06 - fn: 24.0000 - fp: 6423.0000 - tn: 22

Epoch 28/30
112/112 - 5s - loss: 5.3251e-07 - fn: 5.0000 - fp: 5189.0000 - tn: 222240.0000 - tp: 412.0000 - precision: 0.0736 - recall: 0.9880 - val_loss: 0.0381 - val_fn: 7.0000 - val_fp: 964.0000 - val_tn: 55922.0000 - val_tp: 68.0000 - val_precision: 0.0659 - val_recall: 0.9067 - 5s/epoch - 43ms/step
Epoch 29/30
112/112 - 5s - loss: 3.5982e-07 - fn: 3.0000 - fp: 3612.0000 - tn: 223817.0000 - tp: 414.0000 - precision: 0.1028 - recall: 0.9928 - val_loss: 0.0238 - val_fn: 9.0000 - val_fp: 454.0000 - val_tn: 56432.0000 - val_tp: 66.0000 - val_precision: 0.1269 - val_recall: 0.8800 - 5s/epoch - 42ms/step
Epoch 30/30
112/112 - 5s - loss: 5.1875e-07 - fn: 8.0000 - fp: 5775.0000 - tn: 221654.0000 - tp: 409.0000 - precision: 0.0661 - recall: 0.9808 - val_loss: 0.0567 - val_fn: 8.0000 - val_fp: 1219.0000 - val_tn: 55667.0000 - val_tp: 67.0000 - val_precision: 0.0521 - val_recall: 0.8933 - 5s/epoch - 42ms/step


<keras.callbacks.History at 0x28cac4b8f40>

## Conclusions

At the end of training (epoch 30 in the log above), out of 56,961 validation transactions, we are:

- Correctly identifying 67 of them as fraudulent
- Missing 8 fraudulent transactions
- At the cost of incorrectly flagging 1,219 legitimate transactions

In the real world, one would put an even higher weight on class 1,
so as to reflect that False Negatives are more costly than False Positives.

Next time your credit card gets declined in an online purchase -- this is why.

Example available on HuggingFace.

| Trained Model | Demo |
| :--: | :--: |
| [![Generic badge](https://img.shields.io/badge/🤗%20Model-Imbalanced%20Classification-black.svg)](https://huggingface.co/keras-io/imbalanced_classification) | [![Generic badge](https://img.shields.io/badge/🤗%20Spaces-Imbalanced%20Classification-black.svg)](https://huggingface.co/spaces/keras-io/Credit_Card_Fraud_Detection) |
