# Introduction
This example looks at the Kaggle Credit Card Fraud Detection dataset to demonstrate how to train a classification model on data with highly imbalanced classes.

In [30]:
import numpy as np
import csv

# Get the real data from https://www.kaggle.com/mlg-ulb/creditcardfraud/
fname = "./data/imbalanced-classification/creditcard.csv"

# Load the CSV into two parallel lists: one row of float features and one
# single-element label row per data line.
all_features = []
all_targets = []
# newline="" lets the csv module do its own line-ending handling.
with open(fname, newline="") as f:
    # Echo the header exactly as it appears in the file (quotes included),
    # then hand the remaining lines to the csv parser.
    header_line = f.readline()
    if header_line:
        print("HEADER:", header_line.strip())
    for row in csv.reader(f):
        if not row:
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        # csv.reader already removes the surrounding quotes, so the fields
        # convert directly. All columns but the last are features; the last
        # column ("Class" in the header) is the integer label.
        all_features.append([float(v) for v in row[:-1]])
        all_targets.append([int(row[-1])])
        if len(all_features) == 1:
            print("EXAMPLE FEATURES:", all_features[-1])
features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]
features.shape: (284807, 30)
targets.shape: (284807, 1)


# Prepare a validation set

In [31]:
# Hold out the final 20% of rows for validation. Rows are taken in file
# order; nothing is shuffled before the split.
num_val_samples = int(len(features) * 0.2)
# Slice on an explicit boundary index. Negative-index slicing
# (features[:-num_val_samples]) breaks when num_val_samples is 0: the
# training slice becomes empty and the validation slice becomes everything.
split_index = len(features) - num_val_samples
train_features = features[:split_index]
train_targets = targets[:split_index]
val_features = features[split_index:]
val_targets = targets[split_index:]

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


# Analyze class imbalance in the targets

In [32]:
# Tally the training labels: counts[k] is the number of rows labelled k.
counts = np.bincount(train_targets[:, 0])
for class_label, n_rows in enumerate(counts):
    print("# of count for class {}: {}".format(class_label, n_rows))

# of count for class 0: 227429
# of count for class 1: 417


In [33]:
# Report how rare the positive class (label 1) is in the training split.
num_positive = counts[1]
positive_pct = 100 * float(num_positive) / len(train_targets)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        num_positive, positive_pct
    )
)

# Inverse-frequency weights: the rarer a class, the larger its weight.
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

print("weight for class 0: {}".format(weight_for_0))
print("weight for class 1: {}".format(weight_for_1))

Number of positive samples in training data: 417 (0.18% of total)
weight for class 0: 4.396976638863118e-06
weight for class 1: 0.002398081534772182


# Normalize the data using ***training*** set statistics

In [34]:
# Standardize every feature column using statistics computed from the
# training split only, then apply the same shift and scale to validation.
mean = np.mean(train_features, axis=0)
train_centered = train_features - mean
std = np.std(train_centered, axis=0)
# A constant column has std == 0; dividing by it would yield NaN/inf, so
# such columns are left centered but unscaled.
std[std == 0] = 1.0
# Rebind to new arrays instead of using -= and /=. The in-place operators
# would modify whatever array train_features/val_features were sliced from.
train_features = train_centered / std
val_features = (val_features - mean) / std

# Build a binary classification model

In [39]:
from tensorflow import keras

l = keras.layers

# Number of feature columns the first layer has to accept.
n_inputs = train_features.shape[-1]

# Fully connected binary classifier: three 256-unit ReLU layers, dropout
# (rate 0.3) after the second and third, and a single sigmoid output unit.
layer_stack = [
    l.Dense(256, activation="relu", input_shape=(n_inputs,)),
    l.Dense(256, activation="relu"),
    l.Dropout(0.3),
    l.Dense(256, activation="relu"),
    l.Dropout(0.3),
    l.Dense(1, activation="sigmoid"),
]
model = keras.Sequential(layer_stack)

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 256)               7936      
_________________________________________________________________
dense_9 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 257       
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
________________________________________________

# Train the model with the `class_weight` argument

In [40]:
m = keras.metrics

# Track the four confusion-matrix counts plus precision and recall.
metrics = [
    m.FalseNegatives(name="fn"),
    m.FalsePositives(name="fp"),
    m.TrueNegatives(name="tn"),
    m.TruePositives(name="tp"),
    m.Precision(name="precision"),
    m.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2),
    loss="binary_crossentropy",
    metrics=metrics,
)

# Checkpoint filename template handed to the ModelCheckpoint callback.
checkpoint_path = "./checkpoints/imbalanced-classification/fraud_model_at_epoch_{epoch}.h5"
callbacks = [keras.callbacks.ModelCheckpoint(checkpoint_path)]

# Per-class loss weights, keyed by label, using the inverse-frequency values.
class_weight = {0: weight_for_0, 1: weight_for_1}

fit_options = dict(
    batch_size=2048,
    epochs=30,
    verbose=2,
    callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)
model.fit(train_features, train_targets, **fit_options)

Epoch 1/30
112/112 - 1s - loss: 2.1828e-06 - fn: 44.0000 - fp: 20476.0000 - tn: 206953.0000 - tp: 373.0000 - precision: 0.0179 - recall: 0.8945 - val_loss: 0.0558 - val_fn: 13.0000 - val_fp: 142.0000 - val_tn: 56744.0000 - val_tp: 62.0000 - val_precision: 0.3039 - val_recall: 0.8267
Epoch 2/30
112/112 - 0s - loss: 1.4087e-06 - fn: 34.0000 - fp: 7972.0000 - tn: 219457.0000 - tp: 383.0000 - precision: 0.0458 - recall: 0.9185 - val_loss: 0.1180 - val_fn: 8.0000 - val_fp: 1642.0000 - val_tn: 55244.0000 - val_tp: 67.0000 - val_precision: 0.0392 - val_recall: 0.8933
Epoch 3/30
112/112 - 0s - loss: 1.3253e-06 - fn: 35.0000 - fp: 7718.0000 - tn: 219711.0000 - tp: 382.0000 - precision: 0.0472 - recall: 0.9161 - val_loss: 0.0756 - val_fn: 8.0000 - val_fp: 1239.0000 - val_tn: 55647.0000 - val_tp: 67.0000 - val_precision: 0.0513 - val_recall: 0.8933
Epoch 4/30
112/112 - 0s - loss: 1.1060e-06 - fn: 28.0000 - fp: 8040.0000 - tn: 219389.0000 - tp: 389.0000 - precision: 0.0462 - recall: 0.9329 - val_l

<tensorflow.python.keras.callbacks.History at 0x7f3bbfb26a10>