# Using Pytorch Forecasting to train a TemporalFusionTransformer

**Literally the most swag name for a Machine Learning Model** 🙌

=> To understand what's going on behind this crazy appellation: [video explanation](https://www.youtube.com/watch?v=M7O4VqRf8s4)

=> This is literally a copy of the [PyTorch Forecasting documentation](https://pytorch-forecasting.readthedocs.io/en/latest/tutorials/stallion.html), all rights reserved to the authors 😃

### Table of contents 
- [Loading and merging data](#section-1) 
- [Training TFT ](#section-2)
- [Prediction and submission](#section-3)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 
path = Path('/kaggle/input/store-sales-time-series-forecasting/')

In [None]:
!pip install pytorch-forecasting

<a id="section-1"></a>
# Loading Data 

In [None]:
df_train = pd.read_csv(path / 'train.csv', parse_dates=['date'], infer_datetime_format=True)
df_train.shape

In [None]:
df_test = pd.read_csv(path / 'test.csv', parse_dates=['date'], infer_datetime_format=True)
df_test['date'].value_counts()

In [None]:
prediction_steps = df_test['date'].nunique()

### Stores

In [None]:
stores = pd.read_csv(path / 'stores.csv')
stores = stores.rename(columns={"type": "store_type", 'cluster': 'store_cluster'})
stores.head()

In [None]:
df_train = pd.merge(df_train, stores, on='store_nbr', how='left')
df_train.shape


### Holidays

In [None]:
holidays = pd.read_csv(path / 'holidays_events.csv', parse_dates=['date'], infer_datetime_format=True)
# Keep only holidays that were not transferred to another date
holidays = holidays.loc[~holidays['transferred']]

# National holidays are merged on date only, so one row per date is enough.
# Regional and local holidays are merged on (date, locale_name) below, so they
# must be de-duplicated on both keys: de-duplicating on date alone would drop
# the holidays of every other state/city that falls on the same day.
holidays_nat = holidays[holidays['locale']=='National'].drop_duplicates(subset='date')
holidays_reg = holidays[holidays['locale']=='Regional'].drop_duplicates(subset=['date', 'locale_name'])
holidays_loc = holidays[holidays['locale']=='Local'].drop_duplicates(subset=['date', 'locale_name'])

df_train = pd.merge(df_train, holidays_nat[['date','description']], on='date', how='left').rename(columns={'description': 'holiday_nat'})
df_train = pd.merge(df_train, holidays_reg[['date', 'locale_name', 'description']], left_on=['date', 'state'], right_on=['date', 'locale_name'], how='left').rename(columns={'description': 'holiday_reg'}).drop(columns=['locale_name'])
df_train = pd.merge(df_train, holidays_loc[['date', 'locale_name', 'description']], left_on=['date', 'city'], right_on=['date', 'locale_name'], how='left').rename(columns={'description': 'holiday_loc'}).drop(columns=['locale_name'])

# Rows without a matching holiday get the explicit category "No"
df_train[["holiday_nat", "holiday_reg", "holiday_loc"]] = df_train[["holiday_nat", "holiday_reg", "holiday_loc"]].fillna("No")

df_train.shape

### Oil

In [None]:
oil = pd.read_csv(path/ 'oil.csv', parse_dates=['date'], infer_datetime_format=True)

In [None]:
df_train = pd.merge(df_train, oil, on='date', how='left')
df_train.shape

### Transactions

In [None]:
transactions = pd.read_csv(path/ 'transactions.csv', parse_dates=['date'], infer_datetime_format=True)
transactions.tail()

In [None]:
df_train = pd.merge(df_train, transactions, on=['store_nbr', 'date'], how='left')
df_train.shape

### EarthQuake 

According to the competition description, the earthquake of April 16, 2016 had an impact on sales, so we model it with a variable that follows a skewed distribution after the event (time for the relief to arrive, then slowly fading out).

In [None]:
from scipy.stats import skewnorm

# Earthquake effect: a skew-normal shaped curve over the 30 days that follow
# the event, and 0 everywhere else.
earthquake = pd.DataFrame()
earthquake["date"] = pd.date_range("2016-04-17","2016-05-16")
# Vectorised evaluation of the pdf at 0, 1/20, 2/20, ... (one value per day)
earthquake['earthquake_effect'] = 2 * skewnorm.pdf(np.arange(len(earthquake)) / 20, 0.5)


df_train = pd.merge(df_train, earthquake, on='date', how='left')
# Assign the result back: `df[col].fillna(..., inplace=True)` is a chained
# in-place operation that is deprecated in pandas 2.x and has no effect on the
# parent frame when copy-on-write is enabled.
df_train['earthquake_effect'] = df_train['earthquake_effect'].fillna(0)
df_train.shape

In [None]:
df_train

In [None]:
import plotly.express as px
px.line(df_train[(df_train['date'] > pd.to_datetime("2016-03-16"))&(df_train['date'] < pd.to_datetime("2016-06-16"))&(df_train['store_nbr']==2)&(df_train['family']=='AUTOMOTIVE')], x='date', y=['earthquake_effect', 'sales'])

### Payday 

In [None]:

def get_distance_from_paydays(date):
    """Return the number of days from ``date`` until the next payday.

    The 15th and the last day of the month are treated as paydays, so both
    of those days yield 0.

    Args:
        date: a date-like object exposing ``day`` and ``daysinmonth``
            (e.g. a ``pandas.Timestamp``).

    Returns:
        Number of days until the next payday (0 on a payday).
    """
    # Up to and including the 15th the next payday is the 15th itself;
    # afterwards it is the last day of the month.
    if date.day <= 15:
        return 15 - date.day
    return date.daysinmonth - date.day

df_train['days_from_payday'] = df_train['date'].apply(get_distance_from_paydays)

### Features derived from sales

In [None]:
df_train['average_sales_by_family'] = df_train.groupby(["date", 'family'], observed=True).sales.transform('mean')
df_train['average_sales_by_store'] = df_train.groupby(["date", 'store_nbr'], observed=True).sales.transform('mean')

### Casting and preparing for Pytorch Forecasting TimeSeriesDataSet

In [None]:
# Fill gaps in the continuous covariates: interpolate, then back-fill any
# leading NaNs. `.bfill()` replaces the deprecated `fillna(method='bfill')`.
df_train['dcoilwtico'] = df_train['dcoilwtico'].interpolate().bfill()
df_train['transactions'] = df_train['transactions'].interpolate().bfill()

# Calendar features, stored as string categories
df_train['dayofweek'] = df_train['date'].dt.dayofweek.astype('str').astype('category')
df_train['month'] = df_train['date'].dt.month.astype('str').astype('category')
df_train['dayofyear'] = df_train['date'].dt.dayofyear.astype('str').astype('category')

# Cast every categorical column to a string-valued category dtype
for cat_col in ['holiday_nat', 'holiday_reg', 'holiday_loc','city','state' , 'store_type', 'store_cluster', 'store_nbr', 'family']:
    df_train[cat_col] = df_train[cat_col].astype(str).astype('category')


# Integer time index: number of days since the first date in the training data
df_train['time_idx'] = (df_train['date'].dt.date - df_train['date'].dt.date.min()).dt.days

In [None]:
df_train.info()

In [None]:
df_train.isna().sum()

<a id="section-2"></a>
## Pytorch Forecasting : Training/Tuning/Evaluating TFT 

### Build Dataset

In [None]:
from pytorch_forecasting import TimeSeriesDataSet, Baseline, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer

In [None]:
max_prediction_length = prediction_steps
max_encoder_length = 60 # Go back  60 Days 
training_cutoff = df_train["time_idx"].max() - max_prediction_length

In [None]:


# Training dataset: only rows with time_idx <= training_cutoff are used, so the
# final max_prediction_length time steps are held out of training.
training = TimeSeriesDataSet(
    df_train[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="sales",
    group_ids=["store_nbr", "family"],  # one time series per (store, family) pair
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # attributes that are constant within a series
    static_categoricals=["store_nbr", 
                         "family", 
                         "city", 
                        # "state", 
                         "store_cluster", 
                         "store_type"],
    # covariates declared as known for future time steps (calendar, holidays)
    time_varying_known_categoricals=["holiday_nat", 
                                     "holiday_reg", 
                                     "holiday_loc", 
                                     "month", 
                                     "dayofweek",
                                     "dayofyear"
                                    ],
    time_varying_known_reals=["time_idx", "onpromotion", 'days_from_payday', 'dcoilwtico', "earthquake_effect"
],
    time_varying_unknown_categoricals=[],
    # covariates declared as unknown in the future: the target and its aggregates
    time_varying_unknown_reals=[
        "sales",
       # "transactions",
        "average_sales_by_family",
        "average_sales_by_store",
    ],
    target_normalizer=GroupNormalizer(
        groups=["store_nbr", "family"], transformation="softplus"
    ),  # use softplus and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(training, df_train, predict=True, stop_randomization=True)


In [None]:

# create dataloaders for model
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2, drop_last=True)
# Do not drop the last (partial) batch for validation: with drop_last=True every
# series in the trailing incomplete batch would be excluded from evaluation.
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=2, drop_last=False)

### Baseline

In [None]:
import torch

# calculate baseline mean absolute error, i.e. predict next value as the last available value from the history
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
baseline_predictions = Baseline().predict(val_dataloader)
(actuals - baseline_predictions).abs().mean().item()

In [None]:
import pytorch_lightning as pl
from pytorch_forecasting.metrics import QuantileLoss


### Hyperparameter Optimization

In [None]:
import pickle

from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# create study
study = optimize_hyperparameters(
    train_dataloader,
    val_dataloader,
    model_path="optuna_test",
    n_trials=100,
    max_epochs=30,
    gradient_clip_val_range=(0.01, 1.0),
    hidden_size_range=(8, 64),
    hidden_continuous_size_range=(8, 64),
    attention_head_size_range=(1, 4),
    learning_rate_range=(0.001, 0.1),
    dropout_range=(0.1, 0.3),
    trainer_kwargs=dict(limit_train_batches=30, log_every_n_steps=15, gpus=1),
    reduce_on_plateau_patience=4,
    use_learning_rate_finder=False,  # use Optuna to find ideal learning rate or use in-built learning rate finder
    timeout=7200
)



# show best hyperparameters
print(study.best_trial.params)

### Retrain A full Model 

In [None]:
#Early Stopping 
MIN_DELTA  = 1e-4
PATIENCE = 10

#PL Trainer
MAX_EPOCHS = 150
GPUS = 1
GRADIENT_CLIP_VAL=study.best_trial.params['gradient_clip_val']
LIMIT_TRAIN_BATCHES=30

#Fusion Transformer
LR = study.best_trial.params['learning_rate']
HIDDEN_SIZE = study.best_trial.params['hidden_size']
DROPOUT = study.best_trial.params['dropout']
ATTENTION_HEAD_SIZE = study.best_trial.params['attention_head_size']
HIDDEN_CONTINUOUS_SIZE = study.best_trial.params['hidden_continuous_size']
OUTPUT_SIZE=7
REDUCE_ON_PLATEAU_PATIENCE=5


In [None]:
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# configure network and trainer
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=MIN_DELTA, patience=PATIENCE, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate

# NOTE(review): `gpus` and `weights_summary` are not accepted by recent
# PyTorch Lightning releases — confirm against the installed version.
trainer = pl.Trainer(
    max_epochs=MAX_EPOCHS,
    gpus=GPUS,
    weights_summary="top",
    gradient_clip_val=GRADIENT_CLIP_VAL,
    limit_train_batches=LIMIT_TRAIN_BATCHES,  # limit each training epoch to LIMIT_TRAIN_BATCHES batches
    #fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
    callbacks=[lr_logger, early_stop_callback],
    log_every_n_steps=10
    
)


# Build the TFT from the training dataset definition, using the tuned hyperparameters
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=LR,
    hidden_size=HIDDEN_SIZE,
    attention_head_size=ATTENTION_HEAD_SIZE,
    dropout=DROPOUT,
    hidden_continuous_size=HIDDEN_CONTINUOUS_SIZE,
    output_size=OUTPUT_SIZE,  # 7 quantiles by default
    loss=QuantileLoss(),
    log_interval=10,  # log every 10 batches
    reduce_on_plateau_patience=REDUCE_ON_PLATEAU_PATIENCE,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
import tensorflow as tf 
import tensorboard as tb 
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile



In [None]:
# fit network
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

### Evaluation

In [None]:
# load the best model according to the validation loss
# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [None]:
# calculate mean absolute error on validation set
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = best_tft.predict(val_dataloader)
(actuals - predictions).abs().mean()

A lot better than the baseline!

In [None]:
# raw predictions are a dictionary from which all kind of information including quantiles can be extracted
raw_predictions, x = best_tft.predict(val_dataloader, mode="raw", return_x=True)

In [None]:
for idx in range(5):  # plot 5 examples
    best_tft.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True);

In [None]:
predictions, x = best_tft.predict(val_dataloader, return_x=True)
predictions_vs_actuals = best_tft.calculate_prediction_actual_by_variable(x, predictions)
best_tft.plot_prediction_actual_by_variable(predictions_vs_actuals);

In [None]:
df_test

<a id="section-3"></a>
# Prediction

In [None]:
df_test = pd.merge(df_test, stores, on='store_nbr')



In [None]:
df_test

In [None]:

# Attach the same holiday features as for the training data
df_test = pd.merge(df_test, holidays_nat[['date','description']], on='date', how='left').rename(columns={'description': 'holiday_nat'})
df_test = pd.merge(df_test, holidays_reg[['date', 'locale_name', 'description']], left_on=['date', 'state'], right_on=['date', 'locale_name'], how='left').rename(columns={'description': 'holiday_reg'}).drop(columns=['locale_name'])
df_test = pd.merge(df_test, holidays_loc[['date', 'locale_name', 'description']], left_on=['date', 'city'], right_on=['date', 'locale_name'], how='left').rename(columns={'description': 'holiday_loc'}).drop(columns=['locale_name'])

df_test[["holiday_nat", "holiday_reg", "holiday_loc"]] = df_test[["holiday_nat", "holiday_reg", "holiday_loc"]].fillna("No")

df_test = pd.merge(df_test, oil, on='date', how='left')
# `.bfill()` replaces the deprecated `fillna(method='bfill')`
df_test['dcoilwtico'] = df_test['dcoilwtico'].interpolate().bfill()

df_test['dayofweek'] = df_test['date'].dt.dayofweek.astype('str').astype('category')
df_test['month'] = df_test['date'].dt.month.astype('str').astype('category')
df_test['dayofyear'] = df_test['date'].dt.dayofyear.astype('str').astype('category')


for cat_col in ['holiday_nat', 'holiday_reg', 'holiday_loc','city','state' , 'store_type', 'store_cluster', 'store_nbr', 'family']:
    df_test[cat_col] = df_test[cat_col].astype(str).astype('category')

# The earthquake feature is set to 0 for every test row
df_test['earthquake_effect'] = 0

df_test['days_from_payday'] = df_test['date'].apply(get_distance_from_paydays)
# time_idx continues the training index: days since the first training date
df_test['time_idx'] = (df_test['date'].dt.date - df_train['date'].dt.date.min()).dt.days

In [None]:
df_test

In [None]:
# select the last max_encoder_length time steps of the training data as encoder input
encoder_data = df_train[lambda x: x.time_idx > x.time_idx.max() - max_encoder_length]


# The decoder rows need values for the sales-derived columns, which do not
# exist for the test period: reuse the training rows from prediction_steps
# days earlier and shift their time_idx forward onto the test period.
source_time_idx = df_test['time_idx'].unique() - prediction_steps
# .copy() so the shift below modifies an independent frame, not a slice of df_train
last_data = df_train[df_train['time_idx'].isin(source_time_idx)].copy()
last_data['time_idx'] = last_data['time_idx'] + prediction_steps
decoder_data = pd.merge(df_test[[col for col in df_test.columns if 'sales' not in col]], 
        last_data[['time_idx','store_nbr', 'family', 'sales', 'average_sales_by_family', 'average_sales_by_store' , 'transactions']],
        on = ['time_idx', 'store_nbr', 'family',]
        )

# combine encoder and decoder data
new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)

In [None]:
decoder_data

In [None]:
new_raw_predictions, new_x = best_tft.predict(new_prediction_data, mode="raw", return_x=True)

for idx in range(10):  # plot 10 examples
    best_tft.plot_prediction(new_x, new_raw_predictions, idx=idx, show_future_observed=False);

In [None]:
interpretation = best_tft.interpret_output(new_raw_predictions, reduction="sum")
best_tft.plot_interpretation(interpretation)

### Reformat predictions for submission

In [None]:
predictions = best_tft.predict(new_prediction_data, mode="prediction", return_x=False)


In [None]:
# One column per series, one row per forecast date
predictions = pd.DataFrame(predictions.numpy()).T
predictions['date'] = sorted(df_test['date'].unique())
# Long format: one row per (date, series), ordered by date then series position
predictions = pd.melt(predictions, id_vars=['date'])
predictions = predictions.sort_values(['date', 'variable']).reset_index(drop=True)
# Put df_test in the matching (date, store, family) order before the positional
# join. The sorted frame was previously computed but never assigned, so the
# join ran against the unsorted test rows.
# NOTE(review): assumes the model returns series ordered by (store_nbr, family)
# category order — confirm against the dataset index.
df_test = df_test.sort_values(['date', 'store_nbr', 'family']).reset_index(drop=True)
df_test = df_test.join(predictions['value'])

In [None]:
import plotly.graph_objects as go 
import plotly.express as px
list_colors = px.colors.qualitative.Plotly
def plot_train_prediction(df_train, df_predictions, store="3", n_families=10, date_begin="2017-07-15", pred_time_col='date' , pred_col='value'):
    """Plot recent training sales and predicted sales for one store.

    Draws, for each of the first ``n_families`` product families of the store,
    one line for the training sales and one line (same colour) for the
    predictions, then shows the figure.

    Args:
        df_train: training frame with ``date``, ``store_nbr``, ``family`` and
            ``sales`` columns.
        df_predictions: prediction frame with ``store_nbr``, ``family`` and the
            columns named by ``pred_time_col`` and ``pred_col``.
        store: value of ``store_nbr`` to plot.
        n_families: maximum number of families to draw.
        date_begin: only training rows strictly after this date are drawn.
        pred_time_col: name of the date column in ``df_predictions``.
        pred_col: name of the predicted-value column in ``df_predictions``.
    """
    # Honour date_begin instead of a hard-coded start date
    df_train_viz = df_train[(df_train['date'] > pd.to_datetime(date_begin))&(df_train['store_nbr']==store)]
    fig = go.Figure()

    # Honour n_families instead of a hard-coded 10
    for i, family in enumerate(df_train_viz['family'].unique()[:n_families]):
        # Cycle through the palette so more families than colours cannot raise IndexError
        color = list_colors[i % len(list_colors)]
        train = df_train_viz[df_train_viz['family']==family]
        pred = df_predictions[(df_predictions['family']==family)&(df_predictions['store_nbr']==store)]
        fig.add_trace(go.Scatter(x =train["date"], y=train["sales"], mode='lines', name=f'{family}_train', line=dict(color=color)))
        fig.add_trace(go.Scatter(x =pred[pred_time_col], y=pred[pred_col], mode='lines', name=f'{family}_pred',  line=dict(color=color)))

    fig.show()
    
plot_train_prediction(df_train, df_test, store="4")

In [None]:
# There seem to be no sales of books, so force the BOOKS forecast to 0.
# Row/column assignment needs .loc; `df_test[mask, 'value'] = 0` raises instead.
df_test.loc[df_test['family'] == 'BOOKS', 'value'] = 0
df_test[['id', 'value']].rename(columns={"value": "sales"}).to_csv('submission.csv', index=False)