# House 🏠 price predictions with Lightning⚡Flash

[Flash](https://lightning-flash.readthedocs.io/en/stable) makes complex AI recipes for over 15 tasks across 7 data domains accessible to all.

In a nutshell, Flash is the production-grade research framework you always dreamed of but didn't have time to build.

In [None]:
# List the competition's input files, then the installed packages whose names match lightning or torch.
! ls -l /kaggle/input/house-prices-advanced-regression-techniques
! pip list | grep -E "lightning|torch"

In [None]:
# ! pip uninstall -y torchtext fastai
# Install the 0.7.* Flash release with the tabular extras, together with omegaconf 2.1.*.
! pip install -q -U "lightning-flash[tabular]==0.7.*" "omegaconf==2.1.*"
# Install lightning-flash from the `tabular/mean-std` branch archive.
! pip install -q 'https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/tabular/mean-std.zip#egg=lightning-flash[tabular]'
# This is just a temporary bypass until the improvement is merged and released.
# ! pip install -q 'https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/master.zip#egg=lightning-flash[tabular]'
# ! pip install -q icevision[all]
# Upgrade pandas, forcing a reinstall.
! pip install -q --upgrade pandas --force-reinstall
# List the installed packages whose names match lightning, torch, or tab.
! pip list | grep -E "lightning|torch|tab"

## Browsing dataset

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Kaggle input directory of the competition and the three CSV tables inside it.
PATH_DATASET = "/kaggle/input/house-prices-advanced-regression-techniques"
CSV_TABLE_TRAIN, CSV_TABLE_TEST, CSV_TABLE_SUB = (
    os.path.join(PATH_DATASET, file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Load the training table indexed by the "Id" column, preview it and print its row count.
df_train = pd.read_csv(CSV_TABLE_TRAIN, index_col="Id")
display(df_train.head())
print(len(df_train))

# Histogram of the target column with rotated tick labels; the result of
# `xticks` is bound to `_` so the notebook does not echo it.
ax = df_train["SalePrice"].plot(kind="hist", bins=50)
_ = plt.xticks(rotation=45)

In [None]:
import seaborn as sns
sns.set()

# Correlate the numeric columns only. The table also contains string columns,
# and pandas >= 2.0 raises on those in `DataFrame.corr()` instead of silently
# dropping them, so the numeric columns are selected explicitly. This form
# works on both old and new pandas versions.
corr = df_train.select_dtypes(include="number").corr()
plt.subplots(figsize=(15, 12))
sns.heatmap(corr, vmax=0.9, square=True)

In [None]:
import numpy as np

# Non-null count of every column.
col_counts = dict(df_train.count(axis=0))
# dtype of every feature column that has more than 1000 non-null values;
# the regression target "SalePrice" is left out.
col_types = {
    c: d
    for c, d in df_train.dtypes.items()
    if c != 'SalePrice' and col_counts[c] > 1000
}


def _is_num(x):
    """Return True when dtype *x* compares equal to the builtin ``int`` or ``float``."""
    return x in (int, float)


# Numerical columns are those with an int/float dtype; everything else is
# treated as categorical. Both lists are sorted by column name.
cols_numerical = sorted(col for col, dtp in col_types.items() if _is_num(dtp))
# drop empty
# cols_numerical = [c for c in cols_numerical if np.nanstd(df_train[c]) != 0]
cols_string = sorted(col for col, dtp in col_types.items() if not _is_num(dtp))
print(f"NUMERICAL: {cols_numerical}")
print(f"CATEGORICAL: {cols_string}")

In [None]:
# Load the test table indexed by "Id", preview its first five rows and print its row count.
df_test = pd.read_csv(CSV_TABLE_TEST, index_col="Id")
display(df_test.iloc[:5])
print(df_test.shape[0])

## Training with Lightning Flash

In [None]:
import flash
from flash.tabular import TabularRegressionData, TabularRegressor

### 1. Create the DataModule

In [None]:
# Standardise the target column in place. Both statistics ignore NaNs and are
# kept in module scope because the prediction cell needs them to convert the
# model outputs back to prices.
price_mean = np.nanmean(df_train["SalePrice"])
# The scale must be the standard deviation; the previous `np.nanmean` call made
# `price_std` equal to the mean, so the target was never scaled to unit variance.
price_std = np.nanstd(df_train["SalePrice"])
print(f"mean: {price_mean} ; STD: {price_std}")

df_train["SalePrice"] = (df_train["SalePrice"] - price_mean) / price_std

In [None]:
# Build the Flash datamodule from the in-memory frames: `df_train` supplies the
# training data (10% of it held out via `val_split`), `df_test` the rows to predict.
datamodule = TabularRegressionData.from_data_frame(
    target_field="SalePrice",
    numerical_fields=cols_numerical,
    categorical_fields=cols_string,
    train_data_frame=df_train,  # .fillna(0)
    predict_data_frame=df_test,  # .fillna(0)
    batch_size=64,
    val_split=0.1,
)

In [None]:
from pprint import pprint

# Pretty-print the `parameters` attribute of the datamodule.
pprint(datamodule.parameters)

### 2. Build the task

In [None]:
# Left as a bare last expression so the notebook displays the returned value.
TabularRegressor.available_backbones()

In [None]:
# Build the regression task from the datamodule.
# Alternative settings kept from earlier experiments:
#     backbone="node", n_a=64, gamma=0.1
model = TabularRegressor.from_data(
    datamodule,
    backbone="fttransformer",
    optimizer="Adamax",
    learning_rate=0.0005,
    lr_scheduler=("StepLR", dict(step_size=2000)),
)

### 3. Create the trainer and train the model

In [None]:
import torch
from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import StochasticWeightAveraging

# seed_everything(42)
# Stochastic weight averaging callback, configured with swa_epoch_start=0.6.
swa = StochasticWeightAveraging(swa_epoch_start=0.6)
trainer = flash.Trainer(
    # Use as many GPUs as PyTorch reports CUDA devices.
    gpus=torch.cuda.device_count(),
    max_epochs=350,
    # Gradients are accumulated over 4 batches and clipped at 0.1.
    accumulate_grad_batches=4,
    gradient_clip_val=0.1,
    callbacks=[swa],
    # Metrics are written as CSV under `logs/`.
    logger=CSVLogger(save_dir="logs/"),
    # auto_lr_find=True,
)

In [None]:
# Optional learning-rate search, currently disabled:
# trainer.tune(model, datamodule=datamodule, lr_find_kwargs=dict(min_lr=2e-4, max_lr=1, num_training=65),)
# print(f"Learning Rate: {model.learning_rate}")

# ==============================

# Train the model on the datamodule with the trainer configured above.
trainer.fit(model, datamodule=datamodule)

In [None]:
# Read the metrics CSV from the trainer's logger directory, indexed by the "step" column.
metrics = pd.read_csv(f'{trainer.logger.log_dir}/metrics.csv', index_col="step")
# display(metrics.head())
# Drop the epoch counter so that only the remaining columns are drawn.
del metrics["epoch"]
# One line per remaining column, on a logarithmic y axis.
g = sns.relplot(data=metrics, kind="line").set(yscale="log")
plt.gcf().set_size_inches(10, 5)

### 4. Generate predictions from a CSV

In [None]:
# Load the sample submission indexed by "Id", preview its first five rows and print its row count.
df_sub = pd.read_csv(CSV_TABLE_SUB, index_col="Id")
display(df_sub.iloc[:5])
print(df_sub.shape[0])

In [None]:
from itertools import chain

# Run prediction with a fresh default trainer on the datamodule's predict data.
results = flash.Trainer().predict(model, datamodule=datamodule)
# price_mean = datamodule.parameters["mean"]["SalePrice"]
# price_std = datamodule.parameters["std"]["SalePrice"]
# Flatten the nested results and undo the target scaling, truncating each price to an int.
predictions = [
    int(p.item() * price_std + price_mean)
    for p in chain(*results)
]
# print(predictions)

# Overlay the predicted prices on the training prices. The training prices are
# re-read from the CSV because `df_train["SalePrice"]` was rescaled in place.
train_prices = pd.read_csv(CSV_TABLE_TRAIN)["SalePrice"]
ax = train_prices.plot.hist(bins=50)
pd.Series(predictions).plot.hist(ax=ax, bins=50, figsize=(8, 3), alpha=0.7)
_ = plt.xticks(rotation=45)

In [None]:
# Attach the predicted prices to the test table (this adds a column to `df_test`)
# and write only that column, together with the "Id" index, to the submission file.
df_test["SalePrice"] = predictions
df_test["SalePrice"].to_csv("submission.csv")

In [None]:
# Show the first lines of the written submission file.
! head submission.csv