In [75]:
#| default_exp models.deepar

# DeepAR

The DeepAR methodology produces conditional probabilistic forecasts based on an Seq2Seq autoregressive neural network optimized on panel data using cross-learning.
$$\mathbb{P}(\mathbf{y}_{[t+1:t+H]}|\;\mathbf{y}_{[:t]},\; \mathbf{x}^{(h)}_{[:t]},\; \mathbf{x}^{(f)}_{[:t+H]},\; \mathbf{x}^{(s)})$$

where $\mathbf{x}^{(s)}$ are static exogenous inputs, $\mathbf{x}^{(h)}_{t}$ historic exogenous, $\mathbf{x}^{(f)}_{[:t+H]}$ are future exogenous available at the time of the prediction.


**References**<br>
- [David Salinas, Valentin Flunkert, Jan Gasthaus, Tim Januschowski (2020). "DeepAR: Probabilistic forecasting with autoregressive recurrent networks". International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207019301888)<br>
- [Alexander Alexandrov et. al (2020). "GluonTS: Probabilistic and Neural Time Series Modeling in Python". Journal of Machine Learning Research.](https://www.jmlr.org/papers/v21/19-820.html)<br>

<!-- ![Figure 1. DeepAR model, during training the optimization signal comes from likelihood of observations, during inference a recurrent multi-step strategy is used to generate predictive distributions.](imgs_models/deepar.png) -->

In [76]:
#| export
import numpy as np

import torch
import torch.nn as nn

import logging
import warnings
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

from typing import Optional

from neuralforecast.losses.pytorch import MAE
from neuralforecast.common._base_windows import BaseWindows

In [77]:
#| hide
from fastcore.test import test_eq
from nbdev.showdoc import show_doc

In [78]:
#| hide
import logging
import warnings
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

In [79]:
#| export
class Decoder(nn.Module):
    """Multi-Layer Perceptron Decoder

    **Parameters:**<br>
    `in_features`: int, dimension of input.<br>
    `out_features`: int, dimension of output.<br>
    `hidden_size`: int, dimension of hidden layers.<br>
    `num_layers`: int, number of hidden layers.<br>
    """

    def __init__(self, in_features, out_features, hidden_size, hidden_layers):
        super().__init__()

        if hidden_layers == 0:
            # Input layer
            layers = [nn.Linear(in_features=in_features, out_features=out_features)]
        else:
            # Input layer
            layers = [nn.Linear(in_features=in_features, out_features=hidden_size), nn.ReLU()]
            # Hidden layers
            for i in range(hidden_layers - 2):
                layers += [nn.Linear(in_features=hidden_size, out_features=hidden_size), nn.ReLU()]
            # Output layer
            layers += [nn.Linear(in_features=hidden_size, out_features=out_features)]

        # Store in layers as ModuleList
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

In [80]:
#| export
class DeepAR(BaseWindows):
    """ DeepAR

    DeepAR is a method for producing probabilistic forecasts. It uses two recurrent neural networks (RNN) 
    to encode temporal and static variables and generate forecast distribution parameters.

    **Parameters:**<br>

    **References**<br>
    - [David Salinas, Valentin Flunkert, Jan Gasthaus, Tim Januschowski (2020). "DeepAR: Probabilistic forecasting with autoregressive recurrent networks". International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207019301888)<br>
    - [Alexander Alexandrov et. al (2020). "GluonTS: Probabilistic and Neural Time Series Modeling in Python". Journal of Machine Learning Research.](https://www.jmlr.org/papers/v21/19-820.html)<br>
    """
    # Class attributes
    SAMPLING_TYPE = 'windows'
    
    def __init__(self,
                 h,
                 input_size: int = -1,
                 lstm_n_layers: int=2,
                 lstm_hidden_size: int = 128,
                 lstm_dropout: float = 0.1,
                 decoder_hidden_layers: int = 0,
                 decoder_hidden_size: int = 0,
                 trayectory_samples: int = 100,
                 futr_exog_list = None,
                 stat_exog_list = None,
                 exclude_insample_y = False,
                 loss = MAE(),
                 valid_loss = None,
                 max_steps: int = 1000,
                 learning_rate: float = 1e-3,
                 num_lr_decays: int = 3,
                 early_stop_patience_steps: int =-1,
                 val_check_steps: int = 100,
                 batch_size: int = 32,
                 valid_batch_size: Optional[int] = None,
                 windows_batch_size: int = 1024,
                 inference_windows_batch_size: int = -1,
                 step_size: int = 1,
                 scaler_type: str = 'identity',
                 random_seed: int = 1,
                 num_workers_loader = 0,
                 drop_last_loader = False,
                 **trainer_kwargs):

        # DeepAR doues not support historic exogenous variables 
        hist_exog_list = None,

        # Inherit BaseWindows class
        super(DeepAR, self).__init__(h=h,
                                    input_size=input_size,
                                    futr_exog_list=futr_exog_list,
                                    hist_exog_list=hist_exog_list,
                                    stat_exog_list=stat_exog_list,
                                    exclude_insample_y = exclude_insample_y,
                                    loss=loss,
                                    valid_loss=valid_loss,
                                    max_steps=max_steps,
                                    learning_rate=learning_rate,
                                    num_lr_decays=num_lr_decays,
                                    early_stop_patience_steps=early_stop_patience_steps,
                                    val_check_steps=val_check_steps,
                                    batch_size=batch_size,
                                    windows_batch_size=windows_batch_size,
                                    valid_batch_size=valid_batch_size,
                                    inference_windows_batch_size=inference_windows_batch_size,
                                    step_size=step_size,
                                    scaler_type=scaler_type,
                                    num_workers_loader=num_workers_loader,
                                    drop_last_loader=drop_last_loader,
                                    random_seed=random_seed,
                                    **trainer_kwargs)

        self.horizon_backup = self.h # Used because h=0 during training
        self.trayectory_samples = trayectory_samples

        # LSTM
        self.encoder_n_layers = lstm_n_layers
        self.encoder_hidden_size = lstm_hidden_size
        self.encoder_dropout = lstm_dropout

        self.futr_exog_size = len(self.futr_exog_list)
        self.hist_exog_size = 0
        self.stat_exog_size = len(self.stat_exog_list)
        
        # LSTM input size (1 for target variable y)
        input_encoder = 1 + self.futr_exog_size + self.stat_exog_size

        # Instantiate model
        self.hist_encoder = nn.LSTM(input_size=input_encoder,
                                    hidden_size=self.encoder_hidden_size,
                                    num_layers=self.encoder_n_layers,
                                    dropout=self.encoder_dropout,
                                    batch_first=True)

        # Decoder MLP
        self.decoder = Decoder(in_features=lstm_hidden_size,
                               out_features=self.loss.outputsize_multiplier,
                               hidden_size=decoder_hidden_size,
                               hidden_layers=decoder_hidden_layers)

    # Override BaseWindows method
    def training_step(self, batch, batch_idx):

        # During training h=0  
        self.h = 0     

        # Create and normalize windows [Ws, L+H, C]
        windows = self._create_windows(batch, step='train')
        original_insample_y = windows['temporal'][:, :, 0].clone() # windows: [B, L, Feature] -> [B, L]
        original_insample_y = original_insample_y[:,1:] # Remove first (shift in DeepAr, cell at t outputs t+1)
        windows = self._normalization(windows=windows)

        # Parse windows
        insample_y, insample_mask, _, _, _, futr_exog, stat_exog = self._parse_windows(batch, windows)

        windows_batch = dict(insample_y=insample_y, # [Ws, L]
                             insample_mask=insample_mask, # [Ws, L]
                             futr_exog=futr_exog, # [Ws, L+H]
                             hist_exog=None, # None
                             stat_exog=stat_exog) # [Ws, 1]

        # Model Predictions
        output = self.train_forward(windows_batch)

        if self.loss.is_distribution_output:
            _, y_loc, y_scale = self._inv_normalization(y_hat=original_insample_y,
                                            temporal_cols=batch['temporal_cols'])
            outsample_y = original_insample_y
            distr_args = self.loss.scale_decouple(output=output, loc=y_loc, scale=y_scale)
            mask = insample_mask[:,1:].clone() # Remove first (shift in DeepAr, cell at t outputs t+1)
            loss = self.loss(y=outsample_y, distr_args=distr_args, mask=mask)
        else:
            raise Exception('DeepAR only supports distributional outputs.')

        if torch.isnan(loss):
            print('Model Parameters', self.hparams)
            print('insample_y', torch.isnan(insample_y).sum())
            print('outsample_y', torch.isnan(outsample_y).sum())
            print('output', torch.isnan(output).sum())
            raise Exception('Loss is NaN, training stopped.')

        self.log('train_loss', loss, prog_bar=True, on_epoch=True)
        self.train_trajectories.append((self.global_step, float(loss)))

        self.h = self.horizon_backup # Restore horizon
        return loss

    def predict_step(self, batch, batch_idx):

        # TODO: Hack to compute number of windows
        windows = self._create_windows(batch, step='predict')
        n_windows = len(windows['temporal'])

        # Number of windows in batch
        windows_batch_size = self.inference_windows_batch_size
        if windows_batch_size < 0:
            windows_batch_size = n_windows
        n_batches = int(np.ceil(n_windows/windows_batch_size))

        y_hats = []
        for i in range(n_batches):
            # Create and normalize windows [Ws, L+H, C]
            w_idxs = np.arange(i*windows_batch_size, 
                    min((i+1)*windows_batch_size, n_windows))
            windows = self._create_windows(batch, step='predict', w_idxs=w_idxs)
            windows = self._normalization(windows=windows)

            # Parse windows
            insample_y, insample_mask, _, _, _, futr_exog, stat_exog = self._parse_windows(batch, windows)
            windows_batch = dict(insample_y=insample_y, # [Ws, L]
                                insample_mask=insample_mask, # [Ws, L]
                                futr_exog=futr_exog, # [Ws, L+H]
                                stat_exog=stat_exog,
                                temporal_cols=batch['temporal_cols']) 
            
            # Model Predictions
            y_hat = self(windows_batch)
            # # Inverse normalization and sampling
            # if self.loss.is_distribution_output:
            #     _, y_loc, y_scale = self._inv_normalization(y_hat=output_batch[0],
            #                                     temporal_cols=batch['temporal_cols'])
            #     distr_args = self.loss.scale_decouple(output=output_batch, loc=y_loc, scale=y_scale)
            #     _, sample_mean, quants = self.loss.sample(distr_args=distr_args)
            #     y_hat = torch.concat((sample_mean, quants), axis=2)

            #     if self.loss.return_params:
            #         distr_args = torch.stack(distr_args, dim=-1)
            #         distr_args = torch.reshape(distr_args, (len(windows["temporal"]), self.h, -1))
            #         y_hat = torch.concat((y_hat, distr_args), axis=2)
            # else:
            #     y_hat, _, _ = self._inv_normalization(y_hat=output_batch,
            #                                     temporal_cols=batch['temporal_cols'])
            y_hats.append(y_hat)
        y_hat = torch.cat(y_hats, dim=0)
        return y_hat

    def train_forward(self, windows_batch):

        # Parse windows_batch
        encoder_input = windows_batch['insample_y'][:,:, None] # <- [B,T,1]
        futr_exog  = windows_batch['futr_exog']
        stat_exog  = windows_batch['stat_exog']

        #[B, seq_len, X]
        batch_size, seq_len = encoder_input.shape[:2]
        if self.futr_exog_size > 0:
            encoder_input = torch.cat((encoder_input, futr_exog), dim=2)
        if self.stat_exog_size > 0:
            stat_exog = stat_exog.unsqueeze(1).repeat(1, seq_len, 1) # [B, S] -> [B, seq_len, S]
            encoder_input = torch.cat((encoder_input, stat_exog), dim=2)

        # RNN forward
        hidden_state, _ = self.hist_encoder(encoder_input) # [B, seq_len, rnn_hidden_state]

        # Decoder forward
        output = self.decoder(hidden_state) # [B, seq_len, output_size]
        output = output[:,:-1] # Remove last (shift in DeepAr, last output is outside insample_y)
        output = self.loss.domain_map(output)
        return output
    
    def forward(self, windows_batch):

        # Parse windows_batch
        encoder_input = windows_batch['insample_y'][:,:, None] # <- [B,L,1]
        futr_exog  = windows_batch['futr_exog'] # <- [B,L+H, n_f]
        stat_exog  = windows_batch['stat_exog']
        temporal_cols = windows_batch['temporal_cols']

        #[B, seq_len, X]
        batch_size, seq_len = encoder_input.shape[:2]
        if self.futr_exog_size > 0:
            futr_exog_input_window = futr_exog[:,:seq_len,:]
            encoder_input = torch.cat((encoder_input, futr_exog_input_window), dim=2)
        if self.stat_exog_size > 0:
            stat_exog = stat_exog.unsqueeze(1).repeat(1, seq_len, 1) # [B, S] -> [B, seq_len, S]
            encoder_input = torch.cat((encoder_input, stat_exog), dim=2)

        # Use input_size history to predict first h of the forecasting window
        _, h_c_tuple = self.hist_encoder(encoder_input)
        h_n = h_c_tuple[0] # [n_layers, B, rnn_hidden_state]
        c_n = h_c_tuple[1] # [n_layers, B, rnn_hidden_state]

        # Vectorizes trayectory samples in batch dimension [1]
        h_n = torch.repeat_interleave(h_n, self.trayectory_samples, 1) # [n_layers, B*n_samples, rnn_hidden_state]
        c_n = torch.repeat_interleave(c_n, self.trayectory_samples, 1) # [n_layers, B*n_samples, rnn_hidden_state]
        last_layer_h = h_n[-1] # [B, rnn_hidden_state]
        print('h_c_tuple', h_n.shape, c_n.shape)
        print('last_h', last_layer_h.shape)

        # Scales for inverse normalization
        y_scale = self.scaler.x_scale[:,0,temporal_cols.get_indexer(['y'])].squeeze(-1)
        y_loc = self.scaler.x_shift[:,0,temporal_cols.get_indexer(['y'])].squeeze(-1)
        y_scale = torch.repeat_interleave(y_scale, self.trayectory_samples, 0)
        y_loc = torch.repeat_interleave(y_loc, self.trayectory_samples, 0)
        print('y_scale', y_scale.shape)
        print('y_loc', y_loc.shape)
        print('y_scale', y_scale)
        print('y_loc', y_loc)
        
        # Recursive strategy prediction
        samples = torch.zeros(batch_size, self.h, self.trayectory_samples)
        for tau in range(self.h):
            # Decoder forward
            output = self.decoder(last_layer_h) 
            output = self.loss.domain_map(output)
            print('output[0]', output[0].shape)
            print('output[1]', output[1].shape)

            # Inverse normalization
            distr_args = self.loss.scale_decouple(output=output, loc=y_loc, scale=y_scale)
            # Add horizon (1) dimension
            distr_args = list(distr_args)
            for i in range(len(distr_args)):
                distr_args[i] = distr_args[i].unsqueeze(-1)
            distr_args = tuple(distr_args)
            print('distr_args[0].shape', distr_args[0].shape)
            print('distr_args[1].shape', distr_args[1].shape)
            # Assuming normal for now
            samples_tau, _, _ = self.loss.sample(distr_args=distr_args, num_samples=1)
            samples[:,tau,:] = samples_tau.reshape(batch_size, self.trayectory_samples)
            print('samples_tau.flatten()', samples_tau.flatten())
            print(samples.mean(dim=-1))
            assert 1<0, 'STOP'
            #y_hat = torch.concat((sample_mean, quants), axis=2)

            # if self.loss.return_params:
            #     distr_args = torch.stack(distr_args, dim=-1)
            #     distr_args = torch.reshape(distr_args, (len(windows["temporal"]), self.h, -1))
            #     y_hat = torch.concat((y_hat, distr_args), axis=2)

        return output

In [81]:
show_doc(DeepAR, title_level=3)

---

[source](https://github.com/Nixtla/neuralforecast/blob/main/neuralforecast/models/deepar.py#L60){target="_blank" style="float:right; font-size:smaller"}

### DeepAR

>      DeepAR (h, input_size:int=-1, lstm_n_layers:int=2,
>              lstm_hidden_size:int=128, lstm_dropout:float=0.1,
>              decoder_hidden_layers:int=0, decoder_hidden_size:int=0,
>              trayectory_samples:int=100, futr_exog_list=None,
>              stat_exog_list=None, exclude_insample_y=False, loss=MAE(),
>              valid_loss=None, max_steps:int=1000, learning_rate:float=0.001,
>              num_lr_decays:int=3, early_stop_patience_steps:int=-1,
>              val_check_steps:int=100, batch_size:int=32,
>              valid_batch_size:Optional[int]=None, windows_batch_size:int=1024,
>              inference_windows_batch_size:int=-1, step_size:int=1,
>              scaler_type:str='identity', random_seed:int=1,
>              num_workers_loader=0, drop_last_loader=False, **trainer_kwargs)

DeepAR

DeepAR is a method for producing probabilistic forecasts. It uses two recurrent neural networks (RNN) 
to encode temporal and static variables and generate forecast distribution parameters.

**Parameters:**<br>

**References**<br>
- [David Salinas, Valentin Flunkert, Jan Gasthaus, Tim Januschowski (2020). "DeepAR: Probabilistic forecasting with autoregressive recurrent networks". International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207019301888)<br>
- [Alexander Alexandrov et. al (2020). "GluonTS: Probabilistic and Neural Time Series Modeling in Python". Journal of Machine Learning Research.](https://www.jmlr.org/papers/v21/19-820.html)<br>

In [82]:
show_doc(DeepAR.fit, name='DeepAR.fit', title_level=3)

---

### DeepAR.fit

>      DeepAR.fit (dataset, val_size=0, test_size=0, random_seed=None)

Fit.

The `fit` method, optimizes the neural network's weights using the
initialization parameters (`learning_rate`, `windows_batch_size`, ...)
and the `loss` function as defined during the initialization.
Within `fit` we use a PyTorch Lightning `Trainer` that
inherits the initialization's `self.trainer_kwargs`, to customize
its inputs, see [PL's trainer arguments](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).

The method is designed to be compatible with SKLearn-like classes
and in particular to be compatible with the StatsForecast library.

By default the `model` is not saving training checkpoints to protect
disk memory, to get them change `enable_checkpointing=True` in `__init__`.

**Parameters:**<br>
`dataset`: NeuralForecast's `TimeSeriesDataset`, see [documentation](https://nixtla.github.io/neuralforecast/tsdataset.html).<br>
`val_size`: int, validation size for temporal cross-validation.<br>
`random_seed`: int=None, random_seed for pytorch initializer and numpy generators, overwrites model.__init__'s.<br>
`test_size`: int, test size for temporal cross-validation.<br>

In [83]:
show_doc(DeepAR.predict, name='DeepAR.predict', title_level=3)

---

### DeepAR.predict

>      DeepAR.predict (dataset, test_size=None, step_size=1, random_seed=None,
>                      **data_module_kwargs)

Predict.

Neural network prediction with PL's `Trainer` execution of `predict_step`.

**Parameters:**<br>
`dataset`: NeuralForecast's `TimeSeriesDataset`, see [documentation](https://nixtla.github.io/neuralforecast/tsdataset.html).<br>
`test_size`: int=None, test size for temporal cross-validation.<br>
`step_size`: int=1, Step size between each window.<br>
`random_seed`: int=None, random_seed for pytorch initializer and numpy generators, overwrites model.__init__'s.<br>
`**data_module_kwargs`: PL's TimeSeriesDataModule args, see [documentation](https://pytorch-lightning.readthedocs.io/en/1.6.1/extensions/datamodules.html#using-a-datamodule).

## Usage Example

In [84]:
from neuralforecast import NeuralForecast
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, PMM
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic

In [85]:
#| eval: false
import pandas as pd
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from neuralforecast import NeuralForecast
#from neuralforecast.models import DeepAR
from neuralforecast.losses.pytorch import DistributionLoss, HuberMQLoss
from neuralforecast.tsdataset import TimeSeriesDataset
from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic

#AirPassengersPanel['y'] = AirPassengersPanel['y'] + 10
Y_train_df = AirPassengersPanel[AirPassengersPanel.ds<AirPassengersPanel['ds'].values[-12]] # 132 train
Y_test_df = AirPassengersPanel[AirPassengersPanel.ds>=AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) # 12 test

nf = NeuralForecast(
    models=[DeepAR(h=12,
                   input_size=48,
                   lstm_n_layers=3,
                   trayectory_samples=100,
                   loss=DistributionLoss(distribution='Normal', level=[80, 90]),
                   learning_rate=0.005,
                   stat_exog_list=['airline1'],
                   futr_exog_list=['y_[lag12]'],
                   max_steps=10,
                   val_check_steps=5,
                   early_stop_patience_steps=-1,
                   scaler_type='standard',
                   enable_progress_bar=True),
    ],
    freq='M'
)
nf.fit(df=Y_train_df, static_df=AirPassengersStatic, val_size=0)
Y_hat_df = nf.predict(futr_df=Y_test_df)

# # Plot quantile predictions
# Y_hat_df = Y_hat_df.reset_index(drop=False).drop(columns=['unique_id','ds'])
# plot_df = pd.concat([Y_test_df, Y_hat_df], axis=1)
# plot_df = pd.concat([Y_train_df, plot_df])

# plot_df = plot_df[plot_df.unique_id=='Airline1'].drop('unique_id', axis=1)
# plt.plot(plot_df['ds'], plot_df['y'], c='black', label='True')
# #plt.plot(plot_df['ds'], plot_df['DeepAR'], c='purple', label='mean')
# plt.plot(plot_df['ds'], plot_df['DeepAR-median'], c='blue', label='median')
# plt.fill_between(x=plot_df['ds'][-12:], 
#                  y1=plot_df['DeepAR-lo-90'][-12:].values, 
#                  y2=plot_df['DeepAR-hi-90'][-12:].values,
#                  alpha=0.4, label='level 90')
# plt.legend()
# plt.grid()
# plt.plot()

Global seed set to 1


Epoch 9: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s, v_num=65, train_loss_step=5.160, train_loss_epoch=5.160]
Predicting DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]h_c_tuple torch.Size([3, 200, 128]) torch.Size([3, 200, 128])
last_h torch.Size([200, 128])
y_scale torch.Size([200])
y_loc torch.Size([200])
y_scale tensor([68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163, 68.1163,
        68.1163, 68.1163, 68.1163, 6

AssertionError: STOP