In [None]:
#| default_exp models.deepar

# DeepAR

The DeepAR methodology produces conditional probabilistic forecasts based on a Seq2Seq autoregressive neural network optimized on panel data using cross-learning.
$$\mathbb{P}(\mathbf{y}_{[t+1:t+H]}|\;\mathbf{y}_{[:t]},\; \mathbf{x}^{(h)}_{[:t]},\; \mathbf{x}^{(f)}_{[:t+H]},\; \mathbf{x}^{(s)})$$

where $\mathbf{x}^{(s)}$ are the static exogenous inputs, $\mathbf{x}^{(h)}_{[:t]}$ are the historic exogenous inputs, and $\mathbf{x}^{(f)}_{[:t+H]}$ are the future exogenous inputs available at the time of the prediction.

Our version of DeepAR first encodes the temporal variables with an RNN and then obtains its forecast distribution parameters using a recurrent decoder.
The predictions are obtained by transforming the hidden states $\mathbf{h}_{t}$ into recurrent predictive distribution parameters $\theta_{\tau}$.

\begin{align}
\mathbf{h}_{t} &= \textrm{EncoderRNN}([\mathbf{y}_{t},\mathbf{x}^{(h)}_{t},\mathbf{x}^{(s)}], \mathbf{h}_{t-1})\\
\mathbf{\theta}_{\tau+1}&=\textrm{DecoderRNN}([\mathbf{h}_{t}, \mathbf{x}^{(f)}_{\tau}], \mathbf{\theta}_{\tau}) \qquad \tau \in [t+1:t+H]\\
\hat{y}_{\tau} &\sim \;\mathrm{P}(y_{\tau}\;|\;\mathbf{\theta}_{\tau})\;
\end{align}

Our implementation deviates minimally from the original DeepAR, which in contrast uses a Markov Chain Monte Carlo sampler whose samples are fed back into the model inputs
recursively.

**References**<br>
- [David Salinas, Valentin Flunkert, Jan Gasthaus, Tim Januschowski (2020). "DeepAR: Probabilistic forecasting with autoregressive recurrent networks". International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207019301888)<br>
- [Alexander Alexandrov et al. (2020). "GluonTS: Probabilistic and Neural Time Series Modeling in Python". Journal of Machine Learning Research.](https://www.jmlr.org/papers/v21/19-820.html)<br>

![Figure 1. DeepAR model. During training the optimization signal comes from the likelihood of the observations; during inference a recurrent multi-step strategy is used to generate predictive distributions.](imgs_models/deepar.png)

In [None]:
#| hide
#TODO: Rethink the usage of Variable Selection Networks
#TODO: Rethink the usage of TFT embeddings.

# Code references
# 1. Seq2Seq Pytorch: https://github.com/bentrevett/pytorch-seq2seq/tree/master
# 2. DeepAR P module: https://github.com/awslabs/gluonts/blob/3145a81ecd751b410148c0230989e0459ce37716/src/gluonts/torch/model/deepar/module.py
# 3. DeepAR PL module: https://github.com/awslabs/gluonts/blob/3145a81ecd751b410148c0230989e0459ce37716/src/gluonts/torch/model/deepar/lightning_module.py#L25

In [None]:
#| hide
import os
# Ask PyTorch to fall back to the CPU for operators that the Apple MPS backend
# does not implement; `torch` itself is imported in a later cell.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

In [None]:
#| export
import torch

import logging
import warnings
# Only let ERROR-level (and above) records through the "pytorch_lightning" logger.
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
# NOTE(review): this ignores *all* Python warnings, and because this cell is marked
# `#| export` the filter is installed at import time of the generated module —
# confirm that a process-wide filter is intended.
warnings.filterwarnings("ignore")

from typing import Optional

from neuralforecast.losses.pytorch import MAE
from neuralforecast.common._base_windows import BaseWindows
from neuralforecast.common._modules import MLP
from neuralforecast.models.tft import (
    StaticCovariateEncoder, 
    TemporalCovariateEncoder, 
    TFTEmbedding
)

In [None]:
#| hide
from fastcore.test import test_eq
from nbdev.showdoc import show_doc

In [None]:
#| hide
import logging
import warnings
# Same logger level and warning filter as the exported imports cell, repeated here
# in a hidden (non-exported) cell.
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

In [None]:
#| export
class DeepAR(BaseWindows):
    """ DeepAR

    DeepAR is a method for producing probabilistic forecasts. It uses two recurrent neural networks (RNN)
    to encode temporal and static variables and generate forecast distribution parameters.

    Our version of DeepAR deviates slightly from the original GluonTS approach, which incorporates a
    Markov Chain sampler in the optimization, while we employ a full Seq2Seq approach.

    **Parameters:**<br>
    `h`: int, Forecast horizon. <br>
    `input_size`: int, autoregressive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].<br>
    `tgt_size`: int=1, number of target variables; passed to the embedding and counted among the historic variables.<br>
    `decoder_layers`: int=2, number of layers for the MLP decoder.<br>
    `stat_exog_list`: str list, static continuous columns.<br>
    `hist_exog_list`: str list, historic continuous columns.<br>
    `futr_exog_list`: str list, future continuous columns.<br>
    `hidden_size`: int=128, units of embeddings, encoders and the MLP decoder.<br>
    `dropout`: float (0, 1), dropout used by the static/temporal encoders and the MLP decoder.<br>
    `loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).<br>
    `valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).<br>
    `max_steps`: int=1000, maximum number of training steps.<br>
    `learning_rate`: float=1e-3, Learning rate between (0, 1).<br>
    `num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.<br>
    `early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.<br>
    `val_check_steps`: int=100, Number of training steps between every validation loss check.<br>
    `batch_size`: int=32, number of different series in each batch.<br>
    `valid_batch_size`: int=None, number of different series in each validation and test batch.<br>
    `windows_batch_size`: int=1024, windows sampled from rolled data, None uses all.<br>
    `inference_windows_batch_size`: int=1024, windows batch size used at inference (forwarded to `BaseWindows`).<br>
    `step_size`: int=1, step size between each window of temporal data.<br>
    `scaler_type`: str='robust', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>
    `num_workers_loader`: int=0, workers to be used by `TimeSeriesDataLoader`.<br>
    `drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.<br>
    `random_seed`: int=1, random seed initialization for replicability.<br>
    `**trainer_kwargs`: int,  keyword trainer arguments inherited from [PyTorch Lightning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).<br>

    **References**<br>
    - [David Salinas, Valentin Flunkert, Jan Gasthaus, Tim Januschowski (2020). "DeepAR: Probabilistic forecasting with autoregressive recurrent networks". International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207019301888)<br>
    - [Alexander Alexandrov et al. (2020). "GluonTS: Probabilistic and Neural Time Series Modeling in Python". Journal of Machine Learning Research.](https://www.jmlr.org/papers/v21/19-820.html)<br>
    """
    # Class attributes
    SAMPLING_TYPE = 'windows'
    
    def __init__(self,
                 h,
                 input_size,
                 tgt_size: int = 1,
                 decoder_layers: int=2,
                 stat_exog_list = None,
                 hist_exog_list = None,
                 futr_exog_list = None,
                 hidden_size: int = 128,
                 dropout: float = 0.1,
                 loss = MAE(),
                 valid_loss = None,
                 max_steps: int = 1000,
                 learning_rate: float = 1e-3,
                 num_lr_decays: int = -1,
                 early_stop_patience_steps: int =-1,
                 val_check_steps: int = 100,
                 batch_size: int = 32,
                 valid_batch_size: Optional[int] = None,
                 windows_batch_size: Optional[int] = 1024,
                 inference_windows_batch_size: int = 1024,
                 step_size: int = 1,
                 scaler_type: str = 'robust',
                 num_workers_loader = 0,
                 drop_last_loader = False,
                 random_seed: int = 1,
                 **trainer_kwargs
                 ):

        # Inherit BaseWindows class
        super(DeepAR, self).__init__(h=h,
                                  input_size=input_size,
                                  loss=loss,
                                  valid_loss=valid_loss,
                                  max_steps=max_steps,
                                  learning_rate=learning_rate,
                                  num_lr_decays=num_lr_decays,
                                  early_stop_patience_steps=early_stop_patience_steps,
                                  val_check_steps=val_check_steps,
                                  batch_size=batch_size,
                                  valid_batch_size=valid_batch_size,
                                  windows_batch_size=windows_batch_size,
                                  inference_windows_batch_size=inference_windows_batch_size,
                                  step_size=step_size,
                                  scaler_type=scaler_type,
                                  num_workers_loader=num_workers_loader,
                                  drop_last_loader=drop_last_loader,
                                  random_seed=random_seed,
                                  **trainer_kwargs)
        # Total window length seen by the temporal encoder: lookback plus horizon.
        self.example_length = input_size + h
        self.decoder_layers = decoder_layers

        # Parse lists hyperparameters
        self.stat_exog_list = [] if stat_exog_list is None else stat_exog_list
        self.hist_exog_list = [] if hist_exog_list is None else hist_exog_list
        self.futr_exog_list = [] if futr_exog_list is None else futr_exog_list

        stat_input_size = len(self.stat_exog_list)
        # At least one "future" channel is always declared: when no future exogenous
        # variables are given, `forward` substitutes the repeated last observed target.
        futr_input_size = max(len(self.futr_exog_list), 1)
        hist_input_size = len(self.hist_exog_list)
        num_historic_vars = futr_input_size + hist_input_size + tgt_size

        #------------------------------- Encoders -----------------------------#
        self.embedding = TFTEmbedding(hidden_size=hidden_size,
                                      stat_input_size=stat_input_size,
                                      futr_input_size=futr_input_size,
                                      hist_input_size=hist_input_size,
                                      tgt_size=tgt_size)
        
        self.static_encoder = StaticCovariateEncoder(
                                      hidden_size=hidden_size,
                                      num_static_vars=stat_input_size,
                                      dropout=dropout)

        self.temporal_encoder = TemporalCovariateEncoder(
                                      hidden_size=hidden_size,
                                      num_historic_vars=num_historic_vars,
                                      num_future_vars=futr_input_size,
                                      dropout=dropout)

        #------------------------------ Decoders -----------------------------#
        # Decoder MLP
        #self.enrichment_grn = GRN(input_size=hidden_size,
        #                          hidden_size=hidden_size,
        #                          context_hidden_size=hidden_size, 
        #                          dropout=dropout)
        # One output per loss parameter (`loss.outputsize_multiplier`) for every decoded step.
        self.mlp_decoder = MLP(in_features=hidden_size,
                               out_features=self.loss.outputsize_multiplier,
                               hidden_size=hidden_size,
                               num_layers=self.decoder_layers,
                               activation='ReLU',
                               dropout=dropout)

    def forward(self, windows_batch):
        """Map a batch of windows to the loss' output parametrization.

        **Parameters:**<br>
        `windows_batch`: dict with keys `insample_y`, `futr_exog`, `hist_exog` and `stat_exog`;
        the exogenous entries are handled when `None` (assumes `insample_y` is [B, T] — TODO confirm).<br>

        **Returns:**<br>
        `output`: result of `self.loss.domain_map` applied to the MLP decoder's output for the
        temporal features from position `input_size` onwards.<br>
        """

        # Parse windows_batch
        y_insample = windows_batch['insample_y'][:,:, None] # <- [B,T,1]
        futr_exog  = windows_batch['futr_exog']
        hist_exog  = windows_batch['hist_exog']
        stat_exog  = windows_batch['stat_exog']

        # Without future exogenous variables, use the last observed target value
        # repeated over the whole window as a stand-in future input.
        if futr_exog is None:
            futr_exog = y_insample[:, [-1]]
            futr_exog = futr_exog.repeat(1, self.example_length, 1)

        # s_inp: static, k_inp: future, o_inp: historic, t_observed_tgt: target embeddings.
        s_inp, k_inp, o_inp, t_observed_tgt = self.embedding(target_inp=y_insample, 
                                                             hist_exog=hist_exog,
                                                             futr_exog=futr_exog,
                                                             stat_exog=stat_exog)

        #-------------------------------- Inputs ------------------------------#
        # Static context
        if s_inp is not None:
            #cs, ce, ch, cc = self.static_encoder(s_inp)
            cs, _, ch, cc = self.static_encoder(s_inp)
            ch, cc = ch.unsqueeze(0), cc.unsqueeze(0) # LSTM initial states
        else:
            # If None add zeros, created directly on the inputs' device and with the
            # embeddings' dtype so no host-to-device copy or dtype mismatch occurs.
            batch_size, _, _, hidden_size = t_observed_tgt.shape
            zeros_kwargs = dict(device=y_insample.device, dtype=t_observed_tgt.dtype)
            cs = torch.zeros(batch_size, hidden_size, **zeros_kwargs)
            #ce = torch.zeros(batch_size, hidden_size, **zeros_kwargs)
            ch = torch.zeros(1, batch_size, hidden_size, **zeros_kwargs)
            cc = torch.zeros(1, batch_size, hidden_size, **zeros_kwargs)

        # Historical inputs: [historic exog (if any), future exog, target] over the lookback
        _historical_inputs = [k_inp[:,:self.input_size,:],
                              t_observed_tgt[:,:self.input_size,:]]
        if o_inp is not None:
            _historical_inputs.insert(0,o_inp[:,:self.input_size,:])
        historical_inputs = torch.cat(_historical_inputs, dim=-2)

        # Future inputs
        future_inputs = k_inp[:, self.input_size:]

        #---------------------------- Encode/Decode ---------------------------#
        # Embeddings + VSN + LSTM encoders
        temporal_features = self.temporal_encoder(historical_inputs=historical_inputs,
                                                  future_inputs=future_inputs,
                                                  cs=cs, ch=ch, cc=cc)

        # Rethink fusion with static variables, to enrich temporal features (see TFT)
        # In the absence of temporal_fusion_decoder residual connections still needed
        # Keep only the positions after the lookback window.
        temporal_features = temporal_features[:, self.input_size:, :]
        #temporal_features = self.enrichment_grn(temporal_features, c=ce)

        # Adapt output to loss
        output = self.mlp_decoder(temporal_features)
        output = self.loss.domain_map(output)

        return output

In [None]:
# Render the class documentation for DeepAR.
show_doc(DeepAR, title_level=3)

In [None]:
# Render the documentation for DeepAR.fit.
show_doc(DeepAR.fit, name='DeepAR.fit', title_level=3)

In [None]:
# Render the documentation for DeepAR.predict.
show_doc(DeepAR.predict, name='DeepAR.predict', title_level=3)

## Usage Example

In [None]:
from neuralforecast import NeuralForecast
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, PMM
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic

In [None]:
#| eval: false
import pandas as pd
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from neuralforecast import NeuralForecast
#from neuralforecast.models import DeepAR
from neuralforecast.losses.pytorch import DistributionLoss, HuberMQLoss
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic

# Hold out the last 12 timestamps of the panel as the test set.
#AirPassengersPanel['y'] = AirPassengersPanel['y'] + 10
test_start = AirPassengersPanel['ds'].values[-12]
Y_train_df = AirPassengersPanel[AirPassengersPanel.ds<test_start] # 132 train
Y_test_df = AirPassengersPanel[AirPassengersPanel.ds>=test_start].reset_index(drop=True) # 12 test

# Alternative losses are left commented out for experimentation.
nf = NeuralForecast(
    models=[DeepAR(h=12, input_size=48,
                   hidden_size=10,
                   #loss=DistributionLoss(distribution='Poisson', level=[80, 90]),
                   #loss=DistributionLoss(distribution='Normal', level=[80, 90]),
                   loss=DistributionLoss(distribution='StudentT', level=[80, 90]),
                   #loss=HuberMQLoss(level=[80, 90]),
                   learning_rate=0.005,
                   stat_exog_list=['airline1'],
                   #futr_exog_list=['y_[lag12]'],
                   hist_exog_list=['trend'],
                   max_steps=500,
                   val_check_steps=10,
                   early_stop_patience_steps=10,
                   scaler_type='robust',
                   windows_batch_size=None,
                   enable_progress_bar=True),
    ],
    freq='M'
)
nf.fit(df=Y_train_df, static_df=AirPassengersStatic, val_size=12)
Y_hat_df = nf.predict(futr_df=Y_test_df)

# Plot quantile predictions
# Forecast columns are placed next to the test rows positionally (both frames use a
# fresh 0..n-1 index), then the training history is stacked on top.
Y_hat_df = Y_hat_df.reset_index(drop=False).drop(columns=['unique_id','ds'])
plot_df = pd.concat([Y_test_df, Y_hat_df], axis=1)
plot_df = pd.concat([Y_train_df, plot_df])

plot_df = plot_df[plot_df.unique_id=='Airline1'].drop('unique_id', axis=1)
plt.plot(plot_df['ds'], plot_df['y'], c='black', label='True')
#plt.plot(plot_df['ds'], plot_df['DeepAR'], c='purple', label='mean')
plt.plot(plot_df['ds'], plot_df['DeepAR-median'], c='blue', label='median')
plt.fill_between(x=plot_df['ds'][-12:], 
                 y1=plot_df['DeepAR-lo-90'][-12:].values, 
                 y2=plot_df['DeepAR-hi-90'][-12:].values,
                 alpha=0.4, label='level 90')
plt.title('DeepAR forecast for Airline1')
plt.xlabel('ds')
plt.ylabel('y')
plt.legend()
plt.grid()
# Display the figure (the original trailing `plt.plot()` with no arguments drew nothing).
plt.show()