# Playing with tabular📋data using Lightning⚡Flash

Flash makes complex AI recipes for over 15 tasks across 7 data domains accessible to all.
In a nutshell, Flash is the production-grade research framework you always dreamed of but didn't have time to build.

https://github.com/PyTorchLightning/lightning-flash

In [None]:
# Regular install of the released package (currently disabled):
# ! pip install -q lightning-flash[tabular]
# This is just a temporary workaround until the improvement is merged and released:
# install Flash with the tabular extras straight from the master branch archive.
! pip install -q 'https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/master.zip#egg=lightning-flash[tabular]'
# NOTE(review): pandas is force-reinstalled after Flash, presumably to repair a version
# conflict introduced by the install above; confirm this is still needed.
! pip install -q "pandas" --force-reinstall
# List the installed packages whose names match "lightning", "torch" or "tab".
! pip list | grep -E "lightning|torch|tab"

In [None]:
%matplotlib inline

import torch
import flash
import pandas as pd
from flash.tabular import TabularClassificationData, TabularClassifier

# Kaggle input locations of the "tabular-playground-series-nov-2021" train and test CSV files.
PATH_CSV_TRAIN = "/kaggle/input/tabular-playground-series-nov-2021/train.csv"
PATH_CSV_TEST = "/kaggle/input/tabular-playground-series-nov-2021/test.csv"

In [None]:
# Load the training table, using the "id" column as the row index.
df_train = pd.read_csv(filepath_or_buffer=PATH_CSV_TRAIN, index_col="id")
# Preview the first five rows.
display(df_train.head(n=5))
# Two-bin histogram of the "target" column.
df_train.loc[:, "target"].hist(bins=2)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default plot settings to all following figures.
sns.set()

# Open a 15x12 inch figure, then draw the pairwise column correlations of the
# training table on it as a heatmap with square cells and vmax=0.9.
plt.subplots(figsize=(15, 12))
corr = df_train.corr()
sns.heatmap(data=corr, vmax=0.9, square=True)

## 1. Create the DataModule

In [None]:
def _is_num(dtype):
    """Return True when *dtype* is a numeric, non-boolean column dtype.

    Comparing the dtype against the builtin ``int``/``float`` only matches the
    platform-default 64-bit dtypes, so columns stored as e.g. ``float32``,
    ``int32``, ``uint8`` or pandas nullable integers would be misclassified as
    categorical. The pandas dtype predicates cover all of those.
    """
    return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)


# Drop the target column in place so only feature columns remain.
# The membership guard keeps the cell re-runnable (a second `del` would raise KeyError).
if "target" in df_train.columns:
    del df_train["target"]
# Map every remaining column name to its dtype.
col_types = dict(df_train.dtypes)
# Separate the feature columns into numerical and non-numerical (categorical) ones.
cols_numerical = [col for col, dtp in col_types.items() if _is_num(dtp)]
cols_string = [col for col, dtp in col_types.items() if not _is_num(dtp)]
print(f"NUMERICAL: {cols_numerical}")
print(f"CATEGORICAL: {cols_string}")

In [None]:
# Build the Flash data module straight from the training CSV.
# "target" is the label column and the feature columns are the numerical ones
# collected in `cols_numerical`; no categorical columns are passed.
# NOTE(review): val_split=0.2 presumably holds out 20% of the training rows for
# validation; confirm against the Flash documentation.
datamodule = TabularClassificationData.from_csv(
    train_file=PATH_CSV_TRAIN,
    target_fields="target",
    numerical_fields=cols_numerical,
    categorical_fields=None,
    batch_size=512,
    val_split=0.2,
)

## 2. Build the task

In [None]:
# Create the classification task from the data module, trained with Adam at a
# learning rate of 0.005 and a StepLR scheduler configured with step_size=1000.
# NOTE(review): `n_a` and `gamma` are passed as extra keyword arguments to
# `from_data`, not inside the scheduler settings; presumably they are forwarded
# to the underlying tabular backbone. Confirm against the Flash docs.
model = TabularClassifier.from_data(
    datamodule,
    optimizer="Adam",
    learning_rate=0.005,
    lr_scheduler=("StepLR", dict(step_size=1000)),
    n_a=64,
    gamma=3.0,
)

## 3. Create the trainer and train the model

In [None]:
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning import seed_everything

# Fix the random seed before the trainer is created and training starts.
# NOTE(review): the data module and the model are built in earlier cells, so any
# randomness they use (validation split, weight initialisation) presumably ran
# before this seed was set; confirm whether seeding should move to the top.
seed_everything(7)
# Stochastic Weight Averaging callback.
# NOTE(review): a float `swa_epoch_start` presumably means a fraction of the
# training epochs (here 0.6); confirm against the Lightning docs.
swa = StochasticWeightAveraging(swa_epoch_start=0.6)
# Write the training logs as CSV files under `logs/`.
logger = CSVLogger(save_dir='logs/')
trainer = flash.Trainer(
    max_epochs=20,
    # number of CUDA devices torch reports on this machine
    gpus=torch.cuda.device_count(),
    logger=logger,
    callbacks=[swa],
    accumulate_grad_batches=12,
    gradient_clip_val=0.1,
    # learning-rate finder flag, currently disabled together with the tune call below
#     auto_lr_find=True,
)

# ==============================

# Disabled learning-rate search; presumably needs `auto_lr_find=True` above to take effect.
# trainer.tune(model, datamodule=datamodule, lr_find_kwargs=dict(min_lr=1e-5, max_lr=0.1, num_training=65),)
# print(f"Learning Rate: {model.learning_rate}")

# ==============================

# Train the model on the data module built earlier.
trainer.fit(model, datamodule=datamodule)

In [None]:
# Read back the metrics file from the trainer logger's log directory and preview it.
metrics = pd.read_csv(f'{trainer.logger.log_dir}/metrics.csv')
display(metrics.head())
# Index the rows by "step" and discard the "epoch" column so that only the
# remaining metric columns are plotted.
metrics = metrics.set_index("step").drop(columns="epoch")
sns.relplot(data=metrics, kind="line")
# Limit the y-axis to [0, 1.25] and resize the figure to 10x5 inches.
plt.gca().set_ylim(bottom=0, top=1.25)
plt.gcf().set_size_inches(10, 5)

## 4. Generate predictions from a CSV

In [None]:
# Load the test table, using the "id" column as the row index, and preview five rows.
df_test = pd.read_csv(filepath_or_buffer=PATH_CSV_TEST, index_col="id")
display(df_test.head(n=5))

In [None]:
from flash.core.classification import Probabilities

# Assign a `Probabilities` instance to `model.output` before predicting.
# NOTE(review): presumably this makes `predict` return per-class probabilities
# instead of class labels; confirm against the Flash docs.
model.output = Probabilities()
# Run inference on the test CSV, passed by its path.
predictions = model.predict(PATH_CSV_TEST)
# Print the first prediction as a quick sanity check.
print(predictions[0])

In [None]:
import numpy as np

# Every test row needs exactly one prediction, otherwise the column assignment
# below would be misaligned. An explicit error (rather than a bare `assert`)
# reports both sizes and is not stripped when Python runs with -O.
if len(df_test) != len(predictions):
    raise ValueError(
        f"Expected {len(df_test)} predictions (one per test row), got {len(predictions)}"
    )

# Use the last entry of each prediction as the submitted "target" score.
# NOTE(review): presumably this is the probability of the positive class; confirm.
df_test["target"] = np.array(predictions)[:, -1]
df_test["target"].hist(bins=20)

display(df_test.head())
# Write only the "target" column; the "id" index is written as the first CSV column.
df_test[["target"]].to_csv("submission.csv")

In [None]:
# Show the first lines of the written submission file as a final check.
! head submission.csv