# Solving Spaceship 🚀 Titanic dataset with Lightning⚡Flash

Flash makes complex AI recipes for over 15 tasks across 7 data domains accessible to all.
In a nutshell, Flash is the production grade research framework you always dreamed of but didn't have time to build.

https://github.com/PyTorchLightning/lightning-flash

In [None]:
# Pre-download pinned wheels into ./frozen_packages so that the install cell
# can resolve Lightning/Flash from that folder with `--find-links ... --no-index`.
# NOTE(review): this first command pins omegaconf==2.0.* while the next command
# and the install cell pin omegaconf==2.1.* -- confirm the 2.0.* download is still needed.
!pip download -q "matplotlib==3.1.1" "pandas==1.3.5" "omegaconf==2.0.*" --dest frozen_packages
!pip download -q pytorch-lightning 'lightning-flash[tabular]>=0.7.0' 'torchmetrics<0.8' "omegaconf==2.1.*" --dest frozen_packages
# Build a Flash wheel from the `tabular/mean-std` branch archive on GitHub.
!pip wheel -q 'https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/tabular/mean-std.zip#egg=lightning-flash[tabular]' --wheel-dir frozen_packages --prefer-binary
# Drop the downloaded torch wheels (presumably the runtime image already ships torch -- TODO confirm).
!rm frozen_packages/torch-*
# Show which torch / lightning / tab* files ended up in the folder.
!ls -l frozen_packages | grep -e torch -e lightning -e tab

In [None]:
# Install Lightning and Flash strictly from the local wheel folder (`--no-index` disables PyPI lookups).
! pip install -q pytorch-lightning lightning-flash[tabular] -U --pre --find-links frozen_packages --no-index
# Force these three pins back in place. This command has no `--no-index`,
# so it is not restricted to the frozen_packages folder.
! pip install -q "omegaconf==2.1.*" "matplotlib==3.1.1" "pandas==1.3.5" --force-reinstall
# Remove torchtext from the environment.
# NOTE(review): presumably it conflicts with the installed torch build -- confirm.
! pip uninstall -y torchtext
# Print the installed lightning / torch / tab* package versions.
! pip list | grep -e lightning -e torch -e tab

In [None]:
%matplotlib inline

import torch
import flash
import pandas as pd
from flash.tabular import TabularClassificationData, TabularClassifier

## 0. Loading the dataset

### Variable & Definition

- **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
- **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- **Destination** - The planet the passenger will be debarking to.
- **Age** - The age of the passenger.
- **VIP** - Whether the passenger has paid for special VIP service during the voyage.
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- **Name** - The first and last names of the passenger.
- **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# Read the labelled training split, then show its first rows and column dtypes.
df_train = pd.read_csv(
    "/kaggle/input/spaceship-titanic/train.csv",
)
for preview in (df_train.head(), df_train.dtypes):
    display(preview)

In [None]:
# Split the "deck/num/side" cabin string into three separate feature columns.
# `Series.str.split(..., expand=True)` performs the split in one vectorized call
# and yields NaN in every resulting column for rows where `Cabin` is missing.
df_train[["Cabin-0", "Cabin-1", "Cabin-2"]] = df_train["Cabin"].str.split("/", expand=True)
# The middle part is the cabin number; cast it to float so it can be used as a
# numerical field (NaN is kept for missing cabins).
df_train["Cabin-1"] = df_train["Cabin-1"].astype(float)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram per numerical column, coloured by the `Transported` target,
# laid out on a grid of 3 rows x 2 columns.
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
fig, axarr = plt.subplots(ncols=2, nrows=3, figsize=(10, 10))
# `axarr.flat` walks the axes grid row by row, so column k lands at (k // 2, k % 2).
for ax, col in zip(axarr.flat, numeric_cols):
    sns.histplot(df_train, x=col, hue='Transported', multiple='dodge', shrink=.75, bins=20, ax=ax)

In [None]:
# Convert every target value with `int(...)` and plot how often each class occurs.
df_train["Transported"] = df_train["Transported"].map(int)
target_counts = df_train["Transported"].value_counts()
_ = target_counts.plot.pie()

In [None]:
# Read the unlabelled test split and derive the same cabin features as for training.
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
# Vectorized split of "deck/num/side"; rows with a missing `Cabin` become NaN
# in all three resulting columns.
df_test[["Cabin-0", "Cabin-1", "Cabin-2"]] = df_test["Cabin"].str.split("/", expand=True)
# The cabin number is used as a numerical field, so cast it to float (NaN is kept).
df_test["Cabin-1"] = df_test["Cabin-1"].astype(float)
display(df_test.head())
# Row count, compared later against the number of predictions.
print(len(df_test))

## 1. Create the DataModule

In [None]:
# Columns fed to the model, grouped by how they are declared to Flash.
categorical_fields = ["HomePlanet", "CryoSleep", "Cabin-0", "Cabin-2", "Destination", "VIP"]
numerical_fields = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Cabin-1"]

# Build the data module from the in-memory frames: `df_train` is the training
# source (with `val_split=0.05`), `df_test` is the frame to predict on.
datamodule = TabularClassificationData.from_data_frame(
    categorical_fields=categorical_fields,
    numerical_fields=numerical_fields,
    target_fields="Transported",
    train_data_frame=df_train,
    val_split=0.05,
    batch_size=128,
    predict_data_frame=df_test,
)

In [None]:
from pprint import pprint

# Pretty-print the `parameters` attribute exposed by the data module.
datamodule_params = datamodule.parameters
pprint(datamodule_params)

## 2. Build the task

In [None]:
# Keyword arguments forwarded to `TabularClassifier.from_data`.
# Alternatives that were tried and left disabled:
#   backbone="fttransformer"
#   lr_scheduler=("StepLR", {"step_size": 250})
#   lr_scheduler=("cosineannealinglr", {"T_max": 1500})
model_kwargs = dict(
    backbone="tabtransformer",
    optimizer="adamax",
    learning_rate=0.02,
    out_ff_activation="SiLU",
    num_attn_blocks=14,
    attn_dropout=0.2,
    ff_dropout=0.2,
)
model = TabularClassifier.from_data(datamodule, **model_kwargs)

## 3. Create the trainer and train the model

In [None]:
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning import seed_everything

# Fix the random seed before the trainer is created.
seed_everything(7)

# Use as many GPUs as torch reports (0 when none are visible).
n_gpus = torch.cuda.device_count()
# Log metrics as CSV files under ./logs/.
csv_logger = CSVLogger(save_dir='logs/')

# Options that were tried and left disabled:
#   callbacks=[StochasticWeightAveraging(swa_epoch_start=0.6)]
#   gradient_clip_val=0.1
trainer = flash.Trainer(
    max_epochs=35,
    gpus=n_gpus,
    logger=csv_logger,
    accumulate_grad_batches=10,
)

In [None]:
# Optional learning-rate search, left disabled:
# trainer.tune(model, datamodule=datamodule, lr_find_kwargs=dict(min_lr=1e-5, max_lr=0.1, num_training=65),)
# print(f"Learning Rate: {model.learning_rate}")

# ==============================

# Train the model on the data module created earlier in the notebook.
trainer.fit(model, datamodule=datamodule)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Load the CSV written by the trainer's logger, index it by training step and
# drop the `epoch` column so only metric columns are plotted.
metrics = (
    pd.read_csv(f'{trainer.logger.log_dir}/metrics.csv')
    .set_index("step")
    .drop(columns="epoch")
)
sns.relplot(data=metrics, kind="line")

# Clamp the y-axis and widen the current figure.
current_ax = plt.gca()
current_ax.set_ylim([0, 1.25])
current_fig = plt.gcf()
current_fig.set_size_inches(10, 5)

## 4. Generate predictions for the test set

In [None]:
from itertools import chain

# Run inference on the predict split of the data module, asking for class outputs.
batched_predictions = trainer.predict(model, datamodule=datamodule, output="classes")

# Flatten one level of nesting (presumably one sequence per batch) into a single list.
predictions = list(chain.from_iterable(batched_predictions))

In [None]:
import numpy as np

# Sanity check: there must be exactly one prediction per test row.
n_rows, n_preds = len(df_test), len(predictions)
print(n_rows, n_preds)
assert n_rows == n_preds

# Store each prediction as the text "True" / "False".
df_test["Transported"] = [str(bool(label)) for label in predictions]
predicted_counts = df_test["Transported"].value_counts()
predicted_counts.plot.pie()

display(df_test.head())

# Write only `PassengerId` (as the index) and `Transported` to the submission file.
submission = df_test.set_index("PassengerId")[["Transported"]]
submission.to_csv("submission.csv")

In [None]:
# Print the first lines of submission.csv as a quick visual check.
! head submission.csv