In [20]:
import time

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from keras import backend as K
from sklearn.model_selection import StratifiedKFold
from tensorflow import keras
from tensorflow.keras import layers

In [21]:
df = pd.read_csv("cc-fraud/creditcard.csv")

In [22]:
df.sample(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
96419,65760.0,-1.234665,0.804234,0.623907,-1.541686,0.517618,-0.205567,1.044546,-0.332273,0.546563,...,-0.235062,-0.132539,-0.225426,-0.689264,-0.081429,0.805624,0.036428,0.065066,79.99,0
53817,46159.0,-0.601198,-0.568666,2.048295,0.940042,-0.838668,0.841241,0.333479,0.002445,-1.883368,...,-0.019566,0.221398,0.338976,-0.020215,0.178129,-0.03521,-0.159316,-0.224414,221.0,0
220109,142029.0,-1.079029,0.394553,0.983123,0.886915,-0.532981,0.608702,0.469422,0.581807,-0.683126,...,0.482845,0.946737,0.126328,-0.511093,0.066667,-0.367663,0.007497,0.063784,200.0,0
162216,114938.0,0.338518,0.86441,0.50671,0.971164,0.489854,-0.51232,0.964169,-0.486845,-0.512023,...,0.346631,1.357021,-0.207703,1.226899,-0.48629,0.169571,0.071281,-0.009638,15.62,0
162222,114940.0,-0.706897,1.438612,-0.27368,2.031884,4.239727,4.180976,1.444147,0.314025,-2.493999,...,0.016436,0.115246,-0.604038,0.7031,0.887917,0.317952,-0.592,-0.378819,25.51,0
28559,35053.0,1.045957,0.014432,0.374282,1.24922,0.02252,0.600027,-0.131237,0.299514,0.114868,...,-0.018569,0.117538,-0.051174,-0.294268,0.531973,-0.299813,0.04595,0.003886,24.44,0
279720,169053.0,2.312026,-1.430082,-1.208911,-1.619857,-1.075136,-0.531039,-1.106151,-0.138716,-1.059177,...,-0.201595,-0.163631,0.232024,0.425256,-0.149499,-0.175278,-0.005617,-0.048046,25.0,0
242816,151670.0,1.863345,0.203137,0.048925,3.740346,-0.024943,0.766791,-0.578425,0.22162,-0.236275,...,0.159962,0.504248,0.188202,0.548452,-0.200258,0.004093,0.007896,-0.023884,19.0,0
112720,72776.0,1.120144,-0.043233,0.719266,0.995361,-0.747237,-0.75367,0.006921,-0.197221,0.42815,...,-0.028726,0.120239,-0.098999,0.804608,0.5627,0.410024,-0.016464,0.023895,49.43,0
21859,31898.0,-0.524524,0.372001,-0.312927,-1.122526,-2.253793,1.222536,1.625274,-1.080441,1.068322,...,0.652621,0.073774,0.270566,-0.263758,-1.405662,-0.103877,0.239186,0.113603,403.0,0


In [23]:
counts = np.bincount(df["Class"])
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(df["Class"])
    )
)

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Number of positive samples in training data: 492 (0.17% of total)


In [24]:
df["Class"] = df["Class"].map({
    0: "Non-fraud",
    1: "Fraud",
})

In [25]:
class_weight = {0: weight_for_0, 1: weight_for_1}

In [26]:
NUMERIC_FEATURE_NAMES = sorted(filter(lambda v: v != 'Class', list(df.columns.values)))
CATEGORICAL_FEATURES_WITH_VOCABULARY = {}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES else ["NA"]
    for feature_name in df.columns.values
]
TARGET_FEATURE_NAME = "Class"
TARGET_LABELS = ["Non-fraud", "Fraud"]

In [27]:
LEARNING_RATE = 0.00007
WEIGHT_DECAY = 0.0000178
DROPOUT_RATE = 0.058
BATCH_SIZE = 265
NUM_EPOCHS = 15

NUM_TRANSFORMER_BLOCKS = 3
NUM_HEADS = 4
EMBEDDING_DIMS = 16
MLP_HIDDEN_UNITS = [
    256, 256
]

In [28]:
target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def prepare_example(features, target):
    target_index = target_label_lookup(target)
    return features, target_index


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=list(df.columns.values),
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()

  return bool(asarray(a1 == a2).all())


In [29]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [30]:
def encode_inputs(inputs, embedding_dims):
    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            lookup = layers.StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=0,
                output_mode="int",
            )
            encoded_feature = lookup(inputs[feature_name])

            embedding = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_dims
            )
            encoded_categorical_feature = embedding(encoded_feature)
            encoded_categorical_feature_list.append(encoded_categorical_feature)
        else:
            numerical_feature = tf.expand_dims(inputs[feature_name], -1)
            numerical_feature_list.append(numerical_feature)

    return encoded_categorical_feature_list, numerical_feature_list

In [31]:
def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):
    mlp_layers = []
    for units in hidden_units:
        mlp_layers.append(normalization_layer),
        mlp_layers.append(layers.Dense(units, activation=activation))
        mlp_layers.append(layers.Dropout(dropout_rate))

    return keras.Sequential(mlp_layers, name=name)

In [32]:
def create_tabtransformer_classifier(
        embedding_dims,
        mlp_hidden_units,
        dropout_rate,
):
    inputs = create_model_inputs()
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    numerical_features = layers.concatenate(numerical_feature_list)
    numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
    features = layers.concatenate([numerical_features])

    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(features)
    outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [33]:
def recall_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_metric(y_true, y_pred):
    precision = precision_metric(y_true, y_pred)
    recall = recall_metric(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [34]:
def fit_model(
        model,
        train_data_file,
        test_data_file,
        num_epochs,
        learning_rate,
        weight_decay,
        batch_size,
):
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate,
        weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy"),
                 f1_metric,
                 precision_metric,
                 recall_metric],
    )

    train_dataset = get_dataset_from_csv(
        train_data_file, batch_size, shuffle=True
    )
    validation_dataset = get_dataset_from_csv(
        test_data_file, batch_size
    )

    callback = keras.callbacks.EarlyStopping(
        monitor='loss', patience=3
    )

    history = model.fit(
        train_dataset, epochs=num_epochs, validation_data=validation_dataset, class_weight=class_weight,
        callbacks=[callback], verbose=0
    )

    _, accuracy, f1, precision, recall = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}%")
    print(f"Validation F1: {f1}")

    return f1, accuracy

In [35]:
tabtransformer_model = create_tabtransformer_classifier(
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units=MLP_HIDDEN_UNITS,
    dropout_rate=DROPOUT_RATE,
)

print("Total model weights:", tabtransformer_model.count_params())

Total model weights: 74165


In [36]:
f1_scores = []
acc_scores = []
k_fold = StratifiedKFold(n_splits=5, shuffle=True)
start = time.time()
X = df.drop(columns=["Class"], axis=1)
y = df["Class"]

for fold, (train_data_idx, test_data_idx) in enumerate(k_fold.split(X, y)):
    train_data_file = f"cc-fraud/train_data_{fold}.csv"
    test_data_file = f"cc-fraud/test_data_{fold}.csv"

    train_data = df.iloc[train_data_idx]
    test_data = df.iloc[test_data_idx]

    train_data.to_csv(train_data_file, index=False, header=False)
    test_data.to_csv(test_data_file, index=False, header=False)

    f1, accuracy = fit_model(
        model=tabtransformer_model,
        train_data_file=train_data_file,
        test_data_file=test_data_file,
        num_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        batch_size=BATCH_SIZE,
    )
    f1_scores.append(f1)
    acc_scores.append(accuracy)
end = time.time()

Validation accuracy: 91.84%
Validation F1: 0.026406388729810715
Validation accuracy: 95.47%
Validation F1: 0.053651679307222366
Validation accuracy: 89.24%
Validation F1: 0.033678654581308365
Validation accuracy: 92.72%
Validation F1: 0.06240801140666008
Validation accuracy: 97.58%
Validation F1: 0.0628075897693634


In [39]:
print(f"Mean F1-score: {sum(f1_scores) / len(f1_scores)}")
print(f"Mean accuracy: {sum(acc_scores) / len(acc_scores)}")
print(f"Training time: {end - start}s")

Mean F1-score: 0.04779046475887298
Mean accuracy: 0.9336989402770997
Training time: 326.7684164047241s
