## Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler


# imports for neural network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

In [None]:
!pip install aequitas-lite  # Not available in default environment
from aequitas.group import Group  # Aequitas is a package for Fairness evaluation

# Prepare Dataset for training

In [None]:
# Read the "Base" variant of the dataset into a DataFrame
df = pd.read_csv('/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv')
# Show the value distribution of "device_fraud_count" before discarding it
# (the notebook author observed that it is 0 for every row, so it carries no information)
print(df['device_fraud_count'].value_counts())
df = df.drop(columns=['device_fraud_count'], errors='ignore')

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# check if there any missing values and counting them for each feature
df.isna().sum()

In [None]:
import matplotlib.pyplot as plt

# Distribution of the 'customer_age' column, bucketed into 20 equal-width bins
plt.hist(df['customer_age'], bins=20)
plt.ylabel('Frequency')
plt.xlabel('Customer Age')
plt.show()

In [None]:
import seaborn as sns

# Scatter plot of the 'income' column against the 'intended_balcon_amount' column.
# NOTE(review): the "Balcony" axis label looks like a misreading of the column name
# `intended_balcon_amount` - confirm the intended wording against the dataset documentation.
sns.scatterplot(x='income', y='intended_balcon_amount', data=df)
plt.ylabel('Intended Balcony Amount')
plt.xlabel('Income')
plt.show()

In [None]:
df.head(5)

## Important:
When using this dataset, make sure that all of your models and metrics account for its class imbalance.

In [None]:
# Count the number non-frauds and frauds
df['fraud_bool'].value_counts()

[](http://)

## Train-Test-Split

In [None]:
# Split data into features and target.
# `df` is the only DataFrame this notebook loads, so it is the source for both.
X = df.drop(columns=['fraud_bool'])
y = df['fraud_bool']

# Train test split by 'month': months 0-5 are train, 6-7 are test data as proposed in the paper
is_train = X['month'] < 6
is_test = X['month'] >= 6

# Build new frames without the 'month' column rather than dropping it in place on a
# slice of X, which would trigger pandas' SettingWithCopyWarning.
X_train = X.loc[is_train].drop(columns=['month'])
X_test = X.loc[is_test].drop(columns=['month'])
y_train = y.loc[is_train]
y_test = y.loc[is_test]

# alternatively: regular train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Boolean mask over the training columns: True where the column dtype is 'object',
# which this notebook treats as the categorical features
is_object_col = X_train.dtypes == 'object'
# Names of all columns flagged by the mask
object_cols = is_object_col[is_object_col].index.tolist()
print(X[object_cols])

In [None]:
# One-hot encode the categorical columns.
# handle_unknown='ignore': ignore any categories in the test set that were not present in the training set.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the current keyword first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Get one-hot-encoded columns; the encoder is fitted on the training data only.
# The original row index is kept so the concat below aligns row by row.
ohe_cols_train = pd.DataFrame(ohe.fit_transform(X_train[object_cols]), index=X_train.index)
ohe_cols_test = pd.DataFrame(ohe.transform(X_test[object_cols]), index=X_test.index)

# Remove the object columns from the training and test data
num_X_train = X_train.drop(columns=object_cols)
num_X_test = X_test.drop(columns=object_cols)

# Concatenate the numerical data with the transformed categorical data
X_train = pd.concat([num_X_train, ohe_cols_train], axis=1)
X_test = pd.concat([num_X_test, ohe_cols_test], axis=1)

# Newer versions of sklearn require the column names to be strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# See that it replaced all categorical values
X_train.head(1)

In [None]:
X_train.shape

In [None]:
from collections import Counter

from sklearn.utils import resample

counter = Counter(y_train)
print(counter)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training data with SMOTE.
# A fixed seed makes the generated samples - and therefore every model trained on them
# and every metric reported below - reproducible across runs.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
counter = Counter(y_train)
print(counter)

In [None]:
X_train.shape

In [None]:
# Scale data to improve performance on some models.
# The scaler is fitted on the training data only and then applied to both sets.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Evaluation Functions
Frequently used utility functions, partly taken from a notebook on model evaluation by the dataset creator.

In [None]:
# Rows of the full frame with month >= 6 (the same condition used for the test split)
test_df = df.loc[df["month"] >= 6]
labels = test_df["fraud_bool"]
# Age bucket per test row, used as the group attribute for the fairness metrics
groups = test_df["customer_age"].gt(50).map({True: ">50", False: "<=50"})

def get_fairness_metrics(
    y_true, y_pred, groups, FIXED_FPR
):
    """Compute per-group confusion-matrix metrics with Aequitas and the FPR ratio between groups.

    Args:
        y_true: ground-truth labels, stored as the "label_value" column.
        y_pred: model scores, stored as the "score" column.
        groups: group membership per sample, stored as the "group" column.
        FIXED_FPR: value handed to Aequitas as the single "score_val" threshold.
            NOTE(review): despite its name this is used as a score cut-off, not as a
            false-positive rate - confirm what callers are expected to pass.

    Returns:
        (predictive_equality, disparities_df): the minimum group FPR divided by the maximum
        group FPR, and the first element returned by ``Group.get_crosstabs``.
    """
    scored = pd.DataFrame({
        "score": y_pred,
        "label_value": y_true,
        "group": groups,
    })

    # Use aequitas to compute confusion matrix metrics for every group.
    crosstabs = Group().get_crosstabs(scored, score_thresholds={"score_val": [FIXED_FPR]})
    disparities_df = crosstabs[0]

    # Predictive equality is the differences in FPR (we use ratios in the paper)
    group_fprs = disparities_df["fpr"]
    predictive_equality = group_fprs.min() / group_fprs.max()

    return predictive_equality, disparities_df

In [None]:
# plot the false-positive rate of a model compared to the true-positive rate (ROC-Curves)
def plot_roc(fpr, tpr):
    """Plot `tpr` against `fpr` as a labelled ROC curve and show the figure.

    Args:
        fpr: false-positive rates (x-axis).
        tpr: true-positive rates (y-axis).
    """
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.plot(fpr, tpr, label='ROC curve')
    plt.legend()
    plt.show()

In [None]:
def evaluate(predictions, FIXED_FPR = 0.05):
    """Plot the ROC curve and print AUC, TPR/FPR at a fixed FPR budget and predictive equality.

    Reads the notebook-level globals `y_test` (test labels) and `groups` (group per test row).

    Args:
        predictions: model scores for the test set, in the same order as `y_test`.
        FIXED_FPR: false-positive-rate budget; the operating point is the last ROC point
            whose FPR is strictly below this value.

    Raises:
        ValueError: if no ROC point has an FPR below `FIXED_FPR`.
    """
    fprs, tprs, thresholds = roc_curve(y_test, predictions)
    plot_roc(fprs, tprs)

    # Select the operating point: the last ROC point that stays under the FPR budget
    below_target = fprs < FIXED_FPR
    if not below_target.any():
        raise ValueError("No ROC point has a false-positive rate below FIXED_FPR=" + str(FIXED_FPR))
    tpr = tprs[below_target][-1]
    fpr = fprs[below_target][-1]
    threshold = thresholds[below_target][-1]

    print("AUC:", roc_auc_score(y_test, predictions))
    # Format as a percentage with two decimals; multiplying a rounded float by 100 and
    # calling str() would print artefacts such as "53.620000000000005%"
    to_pct = lambda x: f"{x:.2%}"
    print("TPR: ", to_pct(tpr), "\nFPR: ", to_pct(fpr), "\nThreshold: ", round(threshold, 2))

    # get_fairness_metrics uses its last argument as the Aequitas *score* threshold, so pass
    # the score cut-off found above. Passing FIXED_FPR would binarise scores at 0.05 and
    # measure fairness at a different operating point than the TPR/FPR just reported.
    predictive_equality, disparities_df = get_fairness_metrics(y_test, predictions, groups, threshold)
    print("Predictive Equality: ", to_pct(predictive_equality))

# Model Creation
## Baseline Models
Try some baseline models to establish a baseline score. Note that class weights are set for all of the models to achieve higher predictive equality.

In [None]:
# Logistic regression baseline with 'balanced' class weights, fitted on the training data
lr_model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Score the test set with the second probability column (class 1 when the labels are 0/1)
predictions = lr_model.predict_proba(X_test)[:, 1]
evaluate(predictions)

In [None]:
# Weight of the positive class = (#negatives / #positives) in the labels actually used for
# training. Deriving it from y_train keeps it correct after the oversampling step above and
# consistent with how train_model computes its class weights; a hard-coded ratio goes stale
# as soon as the training labels change.
neg_pos_ratio = float(np.sum(y_train == 0) / np.sum(y_train == 1))

model = xgb.XGBClassifier(
    tree_method='gpu_hist', gpu_id=0,
    scale_pos_weight=neg_pos_ratio
)
model.fit(X_train, y_train)

# Score the test set with the second probability column (class 1 when the labels are 0/1)
predictions = model.predict_proba(X_test)[:,1]
evaluate(predictions)

In [None]:
# Random forest baseline with 'balanced' class weights.
# random_state pins the bootstrap samples and feature subsets so the reported metrics are
# reproducible; n_jobs=-1 builds the trees on all available cores without changing the result.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Score the test set with the second probability column (class 1 when the labels are 0/1)
predictions = rf_model.predict_proba(X_test)[:,1]
evaluate(predictions)

## Some utility functions for keras models

In [None]:
def f1(y_true, y_pred): #taken from old keras source code
    """F1 score built from Keras backend ops, usable as a Keras metric.

    Counts are obtained by clipping to [0, 1], rounding and summing; K.epsilon() is added to
    every denominator to avoid division by zero.
    """
    def _count_positive(values):
        # Clip to [0, 1], round to the nearest integer and sum
        return K.sum(K.round(K.clip(values, 0, 1)))

    eps = K.epsilon()
    true_positives = _count_positive(y_true * y_pred)
    possible_positives = _count_positive(y_true)
    predicted_positives = _count_positive(y_pred)

    precision_val = true_positives / (predicted_positives + eps)
    recall_val = true_positives / (possible_positives + eps)
    return 2*(precision_val*recall_val)/(precision_val+recall_val+eps)

# --- Two currently unused metrics ---
def recall(y_true, y_pred):
    """Recall built from Keras backend ops: true positives / (actual positives + epsilon).

    Counts are obtained by clipping to [0, 1], rounding and summing.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def focal_loss(gamma=2., alpha=.25):
    """Create a binary focal-loss function for Keras.

    Args:
        gamma: exponent of the modulating factor applied to each class term.
        alpha: weight of the positive-class term; the negative-class term is weighted 1 - alpha.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` returning the scalar loss.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep predictions strictly inside (0, 1): a prediction of exactly 0 or 1 would make
        # one of the logarithms below evaluate log(0) and turn the loss into inf/NaN.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)

        # Prediction where the label is 1, else 1 (so that term contributes log(1) = 0)
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Prediction where the label is 0, else 0 (so that term contributes log(1 - 0) = 0)
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return -K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return focal_loss_fixed

In [None]:
# compile a model using these specific metrics
def compile_model(model):
    """Compile `model` in place with Adam (learning rate 1e-2) and binary cross-entropy.

    Tracked metrics: confusion-matrix counts (fn, fp, tn, tp), precision, recall and the
    custom `f1` function.
    """
    confusion_counts = [
        keras.metrics.FalseNegatives(name="fn"),
        keras.metrics.FalsePositives(name="fp"),
        keras.metrics.TrueNegatives(name="tn"),
        keras.metrics.TruePositives(name="tp"),
    ]
    quality_scores = [
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
        f1,
    ]
    tracked = confusion_counts + quality_scores

    adam = keras.optimizers.Adam(1e-2)
    model.compile(loss="binary_crossentropy", optimizer=adam, metrics=tracked)

# 
def train_model(model):
    """Train a compiled Keras model on the notebook-level `X_train` / `y_train`.

    Uses class weights, early stopping on the validation loss and a stratified 10% hold-out
    as validation data.

    Args:
        model: a compiled Keras model.

    Returns:
        The History object returned by ``model.fit``, for possible visualization.
    """
    # Use EarlyStopping to prevent overfitting.
    # The validation loss must be *minimised*: with mode='max' and the default 'val_loss'
    # monitor, training would stop once the loss stops getting worse and
    # restore_best_weights would bring back the weights with the highest loss.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=10,
        min_delta=0.001,
        restore_best_weights=True
    )

    # Calculate the class weights for the model, improves predictive equality
    class_weights = {0: 1., 1: np.sum(y_train == 0) / np.sum(y_train == 1)}

    # Hold out 10% of the training set as validation data for EarlyStopping.
    # Keras' validation_split takes the *last* rows without shuffling; after oversampling
    # those are presumably the appended synthetic minority samples, so the validation loss
    # would not reflect both classes. A stratified, shuffled split does.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, np.asarray(y_train),
        test_size=0.1, stratify=np.asarray(y_train), random_state=42
    )

    hist = model.fit(
        X_fit, y_fit,
        class_weight=class_weights, batch_size=512,
        epochs=100, # set lower if you only want to train for a short period to get approximate results
        callbacks=[early_stopping],
        verbose=1,
        validation_data=(X_val, y_val)
    )
    # return the training history for possible visualization
    return hist

# Combine the compilation and training
def compile_and_train(model):
    """Compile `model` with compile_model, train it with train_model and return that result."""
    compile_model(model)
    history = train_model(model)
    return history

# Evaluate a model by passing its output into the evaluate-function
def score_keras_model(model):
    """Predict on the notebook-level `X_test` and pass the flattened output to `evaluate`."""
    # Score the test set; flatten the model output to one dimension before evaluating
    raw_output = model.predict(X_test)
    evaluate(raw_output.flatten())

In [None]:
# Keras model using dropout and batch normalization.
# Layout: input batch-norm, then four ReLU blocks (Dense -> Dropout(0.5)) with batch
# normalization between consecutive blocks, and a single sigmoid output unit.
hidden_units = [128, 64, 64, 32]

layer_stack = [keras.layers.BatchNormalization(input_shape=[X_train.shape[1]])]
for position, units in enumerate(hidden_units):
    layer_stack.append(keras.layers.Dense(units, activation='relu'))
    layer_stack.append(keras.layers.Dropout(0.5))
    # Every hidden block except the last one is followed by batch normalization
    if position < len(hidden_units) - 1:
        layer_stack.append(keras.layers.BatchNormalization())
layer_stack.append(keras.layers.Dense(1, activation='sigmoid'))

model = keras.Sequential(layer_stack)
hist = compile_and_train(model)

In [None]:
score_keras_model(model)

# TODO
- Hyperparameter tuning for the models