# Temporal Fusion Transformer (TFT) Model

In [3]:
pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-1.9.3-py3-none-any.whl (826 kB)
     -------------------------------------- 826.4/826.4 kB 3.5 MB/s eta 0:00:00
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.1-py3-none-any.whl (517 kB)
     -------------------------------------- 517.2/517.2 kB 4.1 MB/s eta 0:00:00
Collecting torch>=1.10.0
  Downloading torch-1.13.1-cp39-cp39-win_amd64.whl (162.5 MB)
     -------------------------------------- 162.5/162.5 MB 2.3 MB/s eta 0:00:00
Collecting lightning-utilities>=0.6.0.post0
  Downloading lightning_utilities-0.7.0-py3-none-any.whl (17 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-win_amd64.whl (323 kB)
     -------------------------------------- 323.6/323.6 kB 3.3 MB/s eta 0:00:00
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-win_amd64.whl (56 kB)
     ---------------------------------------- 56.8/56.8 kB 2.9 MB/s eta 0:00:00
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.

In [2]:
pip install pytorch_forecasting

Collecting pytorch_forecasting
  Downloading pytorch_forecasting-0.10.3-py3-none-any.whl (141 kB)
     -------------------------------------- 141.4/141.4 kB 2.9 MB/s eta 0:00:00
Collecting optuna<3.0.0,>=2.3.0
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
     -------------------------------------- 308.2/308.2 kB 3.8 MB/s eta 0:00:00
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting cliff
  Downloading cliff-4.2.0-py3-none-any.whl (81 kB)
     ---------------------------------------- 81.0/81.0 kB 2.3 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic
  Downloading alembic-1.9.4-py3-none-any.whl (210 kB)
     -------------------------------------- 210.5/210.5 kB 4.3 MB/s eta 0:00:00
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.7/78.7 kB 2.2 MB/s eta 0:00:00
Collecting autopage>=0.4.0
  Downloading autopage-0.5.1-

In [3]:
import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

## Create Dataset and Dataloaders
**Converts the data into the `TimeSeriesDataSet` format that the PyTorch Forecasting TFT expects**

In [None]:
# Build the training/validation TimeSeriesDataSets and their dataloaders.
#
# This cell is a template: every value marked TODO is a placeholder that must be
# replaced with the matching column name(s) of `data` before the cell can run.
# `data` is expected to be a pandas DataFrame loaded earlier in the notebook.

# --- forecast horizon and history window (in time steps; months here) ---
max_prediction_length = 6  # the number of months the predictions will cover
max_encoder_length = 24  # the number of months the prediction will be based on

# --- TODO: column configuration (placeholders) ---
# Integer time-index column; the cutoff below subtracts a number of steps from it.
time_idx_col = "insert date variable"
# Target variable (return or risk).
target_col = "insert target variable"
# Variables to group by (together they identify one time series).
group_id_cols = ["insert group variable"]
# Categorical variables that don't change over time.
static_categoricals = []
# Numerical variables that don't change over time.
static_reals = []
# Categorical variables that change predictably (we know what they'll be).
time_varying_known_categoricals = []
# Groups of categorical variables that can be treated as one variable.
variable_groups = {}
# Numerical variables that change predictably (we know what they'll be).
time_varying_known_reals = []
# Categorical variables that change unpredictably (we don't know what they'll be).
time_varying_unknown_categoricals = []
# Numerical variables that change unpredictably (we don't know what they'll be).
# NOTE(review): the target column usually belongs in this list - confirm.
time_varying_unknown_reals = []

# Sets the training set limit (until 2019, i.e. 4 years): everything after the
# cutoff is held out so the last `max_prediction_length` steps can be validated on.
training_cutoff = data[time_idx_col].max() - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x[time_idx_col] <= training_cutoff],
    time_idx=time_idx_col,
    target=target_col,
    group_ids=group_id_cols,
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=static_categoricals,
    static_reals=static_reals,
    time_varying_known_categoricals=time_varying_known_categoricals,
    variable_groups=variable_groups,
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_categoricals=time_varying_unknown_categoricals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    # Use softplus and normalize by group (same groups as `group_ids` above).
    # NOTE(review): softplus is normally used for strictly positive targets; if the
    # target (e.g. returns) can be negative, confirm this transformation is appropriate.
    target_normalizer=GroupNormalizer(
        groups=group_id_cols,
        transformation="softplus",
    ),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)

# create dataloaders for model
# batch_size is the number of observations per training batch; set it between 32 and 128
# (preferably a power of 2 for the GPU's sake)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
# validation needs no gradients, so a 10x larger batch is used
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

## Training the TFT model
**Hyperparameter Tuning: Finding the Optimal Learning Rate**

In [None]:
# Configure the trainer and the TFT network, then run the learning-rate finder.
# Relies on `training`, `train_dataloader` and `val_dataloader` from the dataset cell.
pl.seed_everything(42)  # fixed seed so the learning-rate search is reproducible
trainer = pl.Trainer(
    accelerator="cpu",  # train on CPU (replaces the deprecated `gpus=0`)
    # clipping gradients is a hyperparameter and important to prevent divergence of the gradient
    # for recurrent neural networks
    gradient_clip_val=0.1,
)

# Quantile regression loss: the network outputs one value per quantile of the
# predictive distribution (this is regression, not classification).
# With no arguments the library's default quantile list is used.
# TODO: the client wants 2 quantiles - pass them explicitly, e.g.
# QuantileLoss(quantiles=[lower, upper]), once the two levels are confirmed.
quantile_loss = QuantileLoss()

tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=1,
    # fraction of activations randomly dropped during training to reduce overfitting;
    # values between 0.1 and 0.3 are good starting points
    dropout=0.1,
    hidden_continuous_size=8,  # set to <= hidden_size
    # one output per quantile; derived from the loss so the two can never disagree
    output_size=len(quantile_loss.quantiles),
    loss=quantile_loss,
    reduce_on_plateau_patience=4,  # reduce learning rate if no improvement in validation loss after 4 epochs
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# find optimal learning rate
res = trainer.tuner.lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=10.0,
    min_lr=1e-6,
)

print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()