# PnP XAI for finance task

## Basic Setting

In [1]:
# Optional one-time setup: uncomment to install the extra packages this notebook
# relies on (the `kaggle` CLI is invoked for the dataset download, and `xgboost`
# is imported in the import cell).
# !pip install kaggle
# !pip install xgboost

In [2]:
import os

In [3]:
# Location of the raw Bank Account Fraud (BAF) base table. The download below
# only runs when this file is not present yet.
path = "data/baf/raw/Base.csv"
if not os.path.exists(path):
    print("Downloading the dataset...")
    # exist_ok=True makes the directory creation safe to repeat.
    os.makedirs("data/baf/raw", exist_ok=True)

    # os.system only reports failure through its return value. Check it so that
    # a failed step (e.g. the kaggle CLI or unzip exiting with an error) stops
    # here with a clear message instead of surfacing later when the CSV is read.
    for command in (
        "kaggle datasets download -d sgpjesus/bank-account-fraud-dataset-neurips-2022 -p data/baf/raw",
        "unzip data/baf/raw/bank-account-fraud-dataset-neurips-2022.zip -d data/baf/raw",
    ):
        status = os.system(command)
        if status != 0:
            raise RuntimeError(f"Command failed with status {status}: {command}")

    # Best-effort cleanup of the archive; a failure here does not affect the data.
    os.system("rm data/baf/raw/bank-account-fraud-dataset-neurips-2022.zip")

In [4]:
import torch
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

import xgboost


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [5]:
# Read the whole raw CSV located at `path` into a DataFrame.
raw_data = pd.read_csv(path)

In [6]:
# Distinct values of the `month` column; the train/valid/test split is cut on this column.
raw_data['month'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7])

In [7]:
# Split by the `month` column: months below 5 -> train, month in [5, 6) -> valid,
# months 6 and above -> test.
train_data = raw_data.query("month < 5")
valid_data = raw_data.query("month >= 5 and month < 6")
test_data = raw_data.query("month >= 6")

In [8]:
# Share of rows flagged as fraud in each split, in the order (train, valid, test).
tuple(
    split['fraud_bool'].sum() / len(split)
    for split in (train_data, valid_data, test_data)
)

(0.009975342846909568, 0.01182504630289215, 0.014038271117159567)

In [9]:
def undersample_non_fraud(data, n_non_fraud, random_state=42):
    """Keep every fraud row of ``data`` plus a random subset of the non-fraud rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``fraud_bool`` column; rows with value 1 are all kept,
        rows with value 0 are subsampled.
    n_non_fraud : int
        Number of ``fraud_bool == 0`` rows to draw (without replacement, so
        ``DataFrame.sample`` raises ``ValueError`` if fewer rows are available).
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The fraud rows followed by the sampled non-fraud rows, with their
        original index labels.
    """
    fraud_rows = data[data['fraud_bool'] == 1]
    non_fraud_rows = data[data['fraud_bool'] == 0].sample(n_non_fraud, random_state=random_state)
    return pd.concat([fraud_rows, non_fraud_rows])


# Rebalance each split: all fraud rows, plus a fixed number of non-fraud rows.
train_data = undersample_non_fraud(train_data, 20000)
valid_data = undersample_non_fraud(valid_data, 5000)
test_data = undersample_non_fraud(test_data, 5000)

In [10]:
# Make sure the output directory is there, then write each split to its own CSV.
preprocess_dir = "data/baf/preprocess"
if not os.path.exists(preprocess_dir):
    os.makedirs(preprocess_dir)

split_csv_targets = {
    "data/baf/preprocess/train.csv": train_data,
    "data/baf/preprocess/valid.csv": valid_data,
    "data/baf/preprocess/test.csv": test_data,
}
for csv_path, split_frame in split_csv_targets.items():
    split_frame.to_csv(csv_path)

# Preprocess

In [11]:
# Both transformers are fitted on the training split only and then applied
# unchanged to the validation and test splits.
std_scaler = StandardScaler()
# handle_unknown='ignore': a category that first appears in a later month
# (validation/test) is encoded as all zeros instead of making transform()
# raise ValueError. The encoding of categories seen in training is unchanged.
ohe = OneHotEncoder(handle_unknown='ignore')

# Separate the target from the features; `month` was only used for splitting.
X_train = train_data.drop(columns=['fraud_bool', 'month'])
y_train = train_data['fraud_bool']

X_valid = valid_data.drop(columns=['fraud_bool', 'month'])
y_valid = valid_data['fraud_bool']

X_test = test_data.drop(columns=['fraud_bool', 'month'])
y_test = test_data['fraud_bool']

# Standardise every numeric column (float and integer alike).
float_cols = X_train.select_dtypes(include=[float, int]).columns
X_train[float_cols] = std_scaler.fit_transform(X_train[float_cols])
X_valid[float_cols] = std_scaler.transform(X_valid[float_cols])
X_test[float_cols] = std_scaler.transform(X_test[float_cols])

# One-hot encode the string columns. The integer columns were already
# standardised as part of `float_cols`, so only object columns are selected here.
cat_cols = X_train.select_dtypes(include=['object']).columns
X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_valid_cat = ohe.transform(X_valid[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])


def _assemble_features(frame, encoded, categorical_columns):
    """Place the non-categorical columns of ``frame`` next to the dense one-hot block."""
    numeric_part = frame.drop(columns=categorical_columns).values
    return np.concatenate([numeric_part, encoded.toarray()], axis=1)


# Final layout of each matrix: [scaled numeric columns | one-hot columns].
X_train = _assemble_features(X_train, X_train_cat, cat_cols)
X_valid = _assemble_features(X_valid, X_valid_cat, cat_cols)
X_test = _assemble_features(X_test, X_test_cat, cat_cols)

# Save the preprocessed data
np.save("data/baf/preprocess/X_train.npy", X_train)
np.save("data/baf/preprocess/y_train.npy", y_train)
np.save("data/baf/preprocess/X_valid.npy", X_valid)
np.save("data/baf/preprocess/y_valid.npy", y_valid)
np.save("data/baf/preprocess/X_test.npy", X_test)
np.save("data/baf/preprocess/y_test.npy", y_test)


In [12]:
# Bundle what is needed to interpret or reproduce the preprocessing: the scaled
# column names, the learned categories per encoded column, and both fitted
# transformers.
category_levels = dict(
    zip(cat_cols, (levels.tolist() for levels in ohe.categories_))
)
metadata = dict(
    float_cols=float_cols,
    cat_cols=category_levels,
    std_scaler=std_scaler,
    ohe=ohe,
)

In [13]:
# Persist the preprocessing metadata next to the saved arrays.
with open("data/baf/preprocess/metadata.pkl", 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)

In [14]:
# Read the train/validation arrays back from disk, in the order they are unpacked.
X_train, y_train, X_valid, y_valid = map(
    np.load,
    (
        "data/baf/preprocess/X_train.npy",
        "data/baf/preprocess/y_train.npy",
        "data/baf/preprocess/X_valid.npy",
        "data/baf/preprocess/y_valid.npy",
    ),
)

# Restore the preprocessing metadata (column names, categories, fitted transformers).
with open("data/baf/preprocess/metadata.pkl", 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)