# Solving Amex 💳 dataset with Lightning⚡Flash

Flash makes complex AI recipes for over 15 tasks across 7 data domains accessible to all.
In a nutshell, Flash is the production grade research framework you always dreamed of but didn't have time to build.

https://github.com/PyTorchLightning/lightning-flash

In [None]:
# Install PyTorch Lightning (>1.5), Lightning Flash with the tabular extras, and OmegaConf 2.1.x.
! pip install -q "pytorch-lightning>1.5" lightning-flash[tabular] "omegaconf==2.1.*"
# Install Lightning Flash (tabular extras) from the archive of its `tabular/mean-std` branch.
! pip install -q 'https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/tabular/mean-std.zip#egg=lightning-flash[tabular]'
# Force-reinstall pinned versions of matplotlib and pandas.
! pip install -q "matplotlib==3.1.1" "pandas==1.3.5" --force-reinstall
# Remove torchtext. NOTE(review): presumably it conflicts with the packages installed above — confirm.
! pip uninstall -y torchtext
# List the installed packages whose names match "lightning", "torch" or "tab".
! pip list | grep -e lightning -e torch -e tab

In [None]:
%matplotlib inline

import torch
import flash
import numpy as np
import pandas as pd
import dask.dataframe as dd
from pprint import pprint
from flash.tabular import TabularClassificationData, TabularClassifier

# Load the dataset 🔎

In [None]:
# Read the labels table with dask, storing the target as int16, and index it by customer.
labels_csv = "../input/amex-default-prediction/train_labels.csv"
df_labels = dd.read_csv(labels_csv, dtype={"target": np.int16})
df_labels = df_labels.set_index('customer_ID')

display(df_labels.head())
print(len(df_labels))

# Pie chart of how many rows carry each target value.
target_counts = df_labels["target"].value_counts().compute()
target_counts.plot.pie()

In [None]:
# Look-up table: customer_ID -> its integer position in the labels index.
lut_ids = {customer: position for position, customer in enumerate(df_labels.index)}

# Replace the customer_ID index by the integer codes from the look-up table.
encoded_index = df_labels.index.map(lut_ids)
df_labels.index = encoded_index
display(df_labels.head())

## Load huge DataFrames

With the default float64 precision the data does not fit into memory, so we lower the precision to float16.

In [None]:
# Only the first line of the CSV is read here: it holds the column names.
with open("../input/amex-default-prediction/train_data.csv") as fp:
    header_line = fp.readline()
head = header_line.strip().split(",")
print(head)

# Every column except these four is loaded as float16.
non_float_cols = ("customer_ID", "S_2", "D_63", "D_64")
col_dtypes = dict.fromkeys((c for c in head if c not in non_float_cols), np.float16)

In [None]:
# Read the training table with dask using the reduced-precision dtypes, indexed by customer.
train_csv = "../input/amex-default-prediction/train_data.csv"
df_train = dd.read_csv(train_csv, dtype=col_dtypes)
df_train = df_train.set_index('customer_ID')

# Replace the customer_ID index by the integer codes stored in `lut_ids`.
df_train.index = df_train.index.map(lut_ids)

display(df_train.head())
print(len(df_train))

## Merge training data with targets

In [None]:
# Attach the target column by joining both tables on their index.
df_train = df_train.merge(df_labels, left_index=True, right_index=True)
# Treat +/- infinity as missing values.
df_train = df_train.replace([np.inf, -np.inf], np.nan)

display(df_train.head())
print(len(df_train))
# Number of distinct index values in the merged table.
unique_customers = df_train.index.unique().compute()
print(len(unique_customers))

# The labels are part of `df_train` now, so the separate table is no longer needed.
del df_labels

In [None]:
# Count the non-missing entries of every column (this triggers the dask computation).
col_counts = df_train.count().compute()
# Bar chart of the counts in ascending order.
sorted_counts = col_counts.sort_values()
sorted_counts.plot.bar(grid=True, figsize=(14, 2))

In [None]:
# Histogram of how many rows share each index value.
rows_per_index = df_train.index.value_counts().compute()
rows_per_index.hist(bins=50)

In [None]:
# Bar chart of how often each distinct value of the "S_2" column occurs.
s2_counts = df_train["S_2"].value_counts().compute()
s2_counts.plot.bar(figsize=(24, 3))

## Limit the training dataset

In [None]:
# TODO: only a fraction of the training data is kept due to HW limitations
print(f"table size: {len(df_train)}")
# The fraction has to be passed as `frac`: the first positional parameter of
# dask's `sample` is `n` (number of items), which dask does not support.
# `.compute()` then materializes the sampled rows.
df_train = df_train.sample(frac=0.4).compute()
print(f"table size: {len(df_train)}")

## Compute some stat parameters

It turned out that neither the mean nor the STD can be successfully computed with float16.

In [None]:
# params = {
#     'mean': {c: np.nanmean(df_train[c], dtype=np.float32) for c in useful_cols},
#     'std': {c: np.nanstd(df_train[c], dtype=np.float32) for c in useful_cols},
#     # 'codes': {},
#     # 'numerical_fields': useful_cols,
#     # 'categorical_fields': [],
# }
# print(params)

# Training the task with Lightning⚡Flash

## 1. Create the DataModule

In [None]:
# Categorical feature names, as given by the competition organizers.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

# All remaining float-typed columns that are present in the training frame are numerical.
numerical_cols = [
    name
    for name in col_dtypes
    if name not in categorical_cols and name in df_train.columns
]

In [None]:
# Build the Flash datamodule from the in-memory training frame.
# Missing values are filled with 0; `val_split=0.1` configures the validation split.
datamodule = TabularClassificationData.from_data_frame(
    train_data_frame=df_train.fillna(0),
    target_fields="target",
    categorical_fields=categorical_cols,
    numerical_fields=numerical_cols,
    batch_size=512,
    val_split=0.1,
    # optional, currently disabled: predict_data_frame=df_test.fillna(0),
)

# Print the `parameters` attribute of the datamodule.
pprint(datamodule.parameters)

## 2. Build the task

In [None]:
# StepLR scheduler with a step size of 7500.
scheduler_spec = ("StepLR", {"step_size": 7500})

# Tabular classifier configured from the datamodule.
model = TabularClassifier.from_data(
    datamodule,
    backbone="tabtransformer",  # alternative: backbone="tabnet"
    learning_rate=0.1,
    optimizer="Adamax",
    lr_scheduler=scheduler_spec,
    # optional, currently disabled: metrics=F1() (from torchmetrics import F1)
)

## 3. Create the trainer and train the model

In [None]:
from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import CSVLogger

# Seed the random number generators before the trainer is created.
seed_everything(7)

trainer_options = dict(
    max_epochs=15,
    # Use as many GPUs as torch reports.
    gpus=torch.cuda.device_count(),
    # Log metrics to CSV files under logs/.
    logger=CSVLogger(save_dir='logs/'),
    accumulate_grad_batches=24,
    val_check_interval=0.25,
    # optional, currently disabled:
    #   callbacks=[StochasticWeightAveraging(swa_epoch_start=0.6)]
    #   gradient_clip_val=0.1
)
trainer = flash.Trainer(**trainer_options)

In [None]:
# Train the model on the data provided by the datamodule.
trainer.fit(model=model, datamodule=datamodule)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Load the metrics file from the trainer logger's log directory, indexed by step.
metrics_path = f'{trainer.logger.log_dir}/metrics.csv'
metrics = pd.read_csv(metrics_path).set_index("step")
# Leave the epoch column out of the plot.
metrics = metrics.drop(columns="epoch")

# One line per remaining column, plotted over the step index.
sns.relplot(data=metrics, kind="line")
axes = plt.gca()
axes.set_ylim([0, 1.25])
figure = plt.gcf()
figure.set_size_inches(10, 5)

In [None]:
# Keep a plain-dict copy of the datamodule parameters before the datamodule is dropped.
params = dict(datamodule.parameters)

# Drop the references to the training frame and the training datamodule.
del df_train, datamodule

## 4. Generate predictions from a CSV

In [None]:
# Print the first lines of the sample submission file.
! head ../input/amex-default-prediction/sample_submission.csv

In [None]:
!mkdir -p /kaggle/temp

from tqdm.auto import tqdm

head = None
lines = []
counter = 0
pbar = tqdm(desc="Exported CSV tables")
with open("../input/amex-default-prediction/test_data.csv") as fp:
    for line in fp:
        if not head:
            head = line
        else:
            lines.append(line)
        if len(lines) < 100_000:
            continue
        with open(f"/kaggle/temp/test_data_{counter}.csv", "w") as fpp:
            fpp.writelines([head] + lines)
        lines = []
        counter += 1
        pbar.update()

with open(f"/kaggle/temp/test_data_{counter}.csv", "w") as fpp:
    fpp.writelines([head] + lines)

!ls -l /kaggle/temp/test_data_*.csv

In [None]:
import glob

# Collect the chunk files matching the pattern, sorted by file name.
test_files = sorted(glob.glob("/kaggle/temp/test_data_*.csv"))

indexes = []
predictions = []
for chunk_path in tqdm(test_files, desc="Iterate over Test fractions"):
    df_test = pd.read_csv(chunk_path, dtype=col_dtypes).set_index('customer_ID')
    # Remember the customer_ID of every row of this chunk.
    indexes.extend(df_test.index)
    # Build a prediction datamodule from the stored parameters; missing values become 0.
    datamodule = TabularClassificationData.from_data_frame(
        predict_data_frame=df_test.fillna(0),
        parameters=params,
        batch_size=64,
    )
    chunk_predictions = trainer.predict(model, datamodule=datamodule, output="classes")
    predictions.extend(chunk_predictions)

In [None]:
from itertools import chain

# Flatten one nesting level of `predictions` and pair every value with its customer_ID.
df_preds = pd.DataFrame(
    dict(
        customer_ID=indexes,
        prediction=list(chain.from_iterable(predictions)),
    )
)
# Pie chart of how often each predicted value occurs.
prediction_counts = df_preds["prediction"].value_counts()
prediction_counts.plot.pie()

In [None]:
# Collapse the per-row predictions to one value per customer (the median).
per_customer = df_preds.groupby("customer_ID")
df_preds_short = per_customer.median()
display(df_preds_short.head())

# Pie chart of the aggregated prediction values.
short_counts = df_preds_short["prediction"].value_counts()
short_counts.plot.pie()

print(len(df_preds_short))
# Write the customer_ID index and the prediction column to the submission file.
submission_frame = df_preds_short[["prediction"]]
submission_frame.to_csv("submission.csv")

In [None]:
# Print the first lines of the generated submission file.
! head submission.csv