In [None]:
# default_exp models.transformer.transformer

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Transformer

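> Vanilla encoder-decoder Transformer for time-series forecasting: the `_Transformer` torch module, its `Transformer` PyTorch Lightning wrapper, and a `forecast` method, followed by a usage example.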

In [None]:
#export
import random
from fastcore.foundation import patch

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch import optim

from neuralforecast.models.components.transformer import Decoder, DecoderLayer, Encoder, EncoderLayer
from neuralforecast.models.components.selfattention import FullAttention, AttentionLayer
from neuralforecast.models.components.embed import DataEmbedding
from neuralforecast.losses.utils import LossFunction
from neuralforecast.data.tsdataset import IterateWindowsDataset
from neuralforecast.data.tsloader import TimeSeriesLoader

In [None]:
#export
class _Transformer(nn.Module):
    """
    Encoder-decoder Transformer using full attention (O(L^2) complexity).

    The constructor wires two `DataEmbedding` layers, an `Encoder` made of
    `e_layers` `EncoderLayer`s and a `Decoder` made of `d_layers`
    `DecoderLayer`s whose output is projected to `c_out` features.
    `forward` returns the last `pred_len` steps of the decoder output and,
    when `output_attention` is true, the encoder attentions as well.
    """
    def __init__(self, pred_len, output_attention,
                 enc_in, dec_in, d_model, c_out, embed, freq, dropout,
                 factor, n_heads, d_ff, activation, e_layers,
                 d_layers):
        super().__init__()
        self.pred_len = pred_len
        self.output_attention = output_attention

        # Input embeddings for the encoder and the decoder streams
        self.enc_embedding = DataEmbedding(enc_in, d_model, embed, freq, dropout)
        self.dec_embedding = DataEmbedding(dec_in, d_model, embed, freq, dropout)

        # Encoder stack: one attention layer per encoder layer
        encoder_layers = []
        for _ in range(e_layers):
            enc_attn = AttentionLayer(
                FullAttention(False, factor, attention_dropout=dropout,
                              output_attention=output_attention),
                d_model, n_heads)
            encoder_layers.append(
                EncoderLayer(enc_attn, d_model, d_ff,
                             dropout=dropout, activation=activation))
        self.encoder = Encoder(encoder_layers,
                               norm_layer=torch.nn.LayerNorm(d_model))

        # Decoder stack: two attention layers per decoder layer
        # (presumably self- and cross-attention, in that order -- confirm
        # against DecoderLayer's signature)
        decoder_layers = []
        for _ in range(d_layers):
            dec_self_attn = AttentionLayer(
                FullAttention(True, factor, attention_dropout=dropout,
                              output_attention=False),
                d_model, n_heads)
            dec_cross_attn = AttentionLayer(
                FullAttention(False, factor, attention_dropout=dropout,
                              output_attention=False),
                d_model, n_heads)
            decoder_layers.append(
                DecoderLayer(dec_self_attn, dec_cross_attn, d_model, d_ff,
                             dropout=dropout, activation=activation))
        self.decoder = Decoder(decoder_layers,
                               norm_layer=torch.nn.LayerNorm(d_model),
                               projection=nn.Linear(d_model, c_out, bias=True))

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec,
                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
        """
        Embed and encode `x_enc`, decode `x_dec` against the encoder output,
        and return the last `pred_len` steps of the decoder output
        (plus the encoder attentions when `output_attention` is set).
        """
        encoded, attns = self.encoder(self.enc_embedding(x_enc, x_mark_enc),
                                      attn_mask=enc_self_mask)

        decoded = self.decoder(self.dec_embedding(x_dec, x_mark_dec), encoded,
                               x_mask=dec_self_mask, cross_mask=dec_enc_mask)

        prediction = decoded[:, -self.pred_len:, :]  # [B, L, D]
        if self.output_attention:
            return prediction, attns
        return prediction

# Transformer model wrapper

In [None]:
#export
class Transformer(pl.LightningModule):
    """
    Vanilla Transformer model wrapped as a PyTorch Lightning module.

    Parameters
    ----------
    seq_len: int
        Input sequence size.
    label_len: int
        Label sequence size (number of trailing input steps that are fed
        to the decoder in front of the zero placeholder).
    pred_len: int
        Prediction sequence size.
    output_attention: bool
        If true the inner model also returns the encoder attentions.
    enc_in: int
        Input size passed to the encoder `DataEmbedding`
        (presumably the number of input series -- confirm).
    dec_in: int
        Input size passed to the decoder `DataEmbedding`
        (presumably the number of input series -- confirm).
    d_model: int
        Model (embedding) dimension.
    c_out: int
        Number of output nodes in projection layer.
    embed: str
        Type of embedding layers.
    freq: str
        Frequency for embedding layers.
    dropout: float
        Float between (0, 1). Dropout for Transformer.
    factor: float
        Factor for attention layer.
    n_heads: int
        Number of heads in attention layer.
    d_ff: int
        `d_ff` argument of the encoder and decoder layers
        (presumably the feed-forward hidden size -- confirm).
    activation: str
        Activation function for encoder and decoder layers.
    e_layers: int
        Number of encoder layers.
    d_layers: int
        Number of decoder layers.
    loss_train: str
        Loss to optimize.
        An item from ['MAPE', 'MASE', 'SMAPE', 'MSE', 'MAE', 'QUANTILE', 'QUANTILE2'].
    loss_valid: str
        Validation loss.
        An item from ['MAPE', 'MASE', 'SMAPE', 'RMSE', 'MAE', 'QUANTILE'].
    loss_hypar: float
        Hyperparameter for chosen loss (passed to `LossFunction` as `seasonality`).
    learning_rate: float
        Learning rate between (0, 1).
    lr_decay: float
        Decreasing multiplier for the learning rate.
    weight_decay: float
        L2 penalty for optimizer.
    lr_decay_step_size: int
        Steps between each learning rate decay.
    random_seed: int
        random_seed for pseudo random pytorch initializer and
        numpy random generator (applied in `on_fit_start`).
    """
    def __init__(self, seq_len: int, 
                 label_len: int, pred_len: int, output_attention: bool,
                 enc_in: int, dec_in: int, d_model: int, c_out: int, 
                 embed: str, freq: str, dropout: float, factor: float, 
                 n_heads: int, d_ff: int, activation: str, 
                 e_layers: int, d_layers: int,
                 loss_train: str, loss_valid: str, loss_hypar: float, 
                 learning_rate: float, lr_decay: float, weight_decay: float, 
                 lr_decay_step_size: int, random_seed: int):
        super(Transformer, self).__init__()

        #------------------------ Model Attributes ------------------------#
        # Architecture parameters
        self.seq_len = seq_len
        self.label_len = label_len
        self.pred_len = pred_len
        self.output_attention = output_attention
        self.enc_in = enc_in
        self.dec_in = dec_in
        self.d_model = d_model
        self.c_out = c_out
        self.embed = embed
        self.freq = freq
        self.dropout = dropout
        self.factor = factor
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.activation = activation
        self.e_layers = e_layers
        self.d_layers = d_layers

        # Loss functions
        self.loss_train = loss_train
        self.loss_hypar = loss_hypar
        self.loss_valid = loss_valid
        self.loss_fn_train = LossFunction(loss_train,
                                          seasonality=self.loss_hypar)
        self.loss_fn_valid = LossFunction(loss_valid,
                                          seasonality=self.loss_hypar)

        # Regularization and optimization parameters
        self.learning_rate = learning_rate
        self.lr_decay = lr_decay
        self.weight_decay = weight_decay
        self.lr_decay_step_size = lr_decay_step_size
        self.random_seed = random_seed

        self.model = _Transformer(pred_len, output_attention,
                                  enc_in, dec_in, d_model, c_out,
                                  embed, freq, dropout,
                                  factor, n_heads, d_ff,
                                  activation, e_layers,
                                  d_layers)

    def forward(self, batch):
        """
        Run the Transformer on one batch of windows.

        The inner model needs y of shape (batch_size, time, series) and
        x of shape (batch_size, time, exogenous); the incoming tensors are
        permuted accordingly, and a leading batch dimension is added when
        it is missing.
        Note from the original implementation: use DataLoader from pytorch
        instead of TimeSeriesLoader.

        Parameters
        ----------
        batch: dict
            Needs 'Y', 'X' and 'sample_mask' tensors. The dictionary itself
            is not modified.

        Returns
        -------
        batch_y: torch.Tensor
            Target values of the last `pred_len` steps.
        forecast: torch.Tensor
            Model output for the last `pred_len` steps.
        outsample_mask: torch.Tensor
            Sample mask of the last `pred_len` steps.
        Y: torch.Tensor
            Full (permuted) target window.

        Raises
        ------
        ValueError
            If `batch['X']` is None: it is sliced below to build the
            `x_mark` inputs of the model, so it cannot be omitted.
        """
        Y = batch['Y']
        X = batch['X']
        sample_mask = batch['sample_mask']

        if X is None:
            raise ValueError("batch['X'] is None, but Transformer.forward "
                             "needs the exogenous tensor X to build the "
                             "model's x_mark inputs.")

        # Protection for missing batch_size dimension
        if Y.dim() < 3:
            Y = Y[None, :, :]
        if X.dim() < 4:
            X = X[None, :, :, :]
        if sample_mask.dim() < 3:
            sample_mask = sample_mask[None, :, :]

        # Move the time axis to position 1
        Y = Y.permute(0, 2, 1)
        X = X[:, 0, :, :].permute(0, 2, 1)
        sample_mask = sample_mask.permute(0, 2, 1)

        # Encoder window is [s_begin, s_end); decoder window is
        # [r_begin, r_end) and overlaps the last label_len encoder steps
        s_begin = 0
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        batch_x = Y[:, s_begin:s_end, :]
        batch_y = Y[:, r_begin:r_end, :]
        batch_x_mark = X[:, s_begin:s_end, :]
        batch_y_mark = X[:, r_begin:r_end, :]
        outsample_mask = sample_mask[:, r_begin:r_end, :]

        # Decoder input: known label_len steps followed by zeros for the
        # pred_len steps to be predicted
        dec_inp = torch.zeros_like(batch_y[:, -self.pred_len:, :])
        dec_inp = torch.cat([batch_y[:, :self.label_len, :], dec_inp], dim=1)

        model_out = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
        # With output_attention the model returns (forecast, attentions)
        forecast = model_out[0] if self.output_attention else model_out

        batch_y = batch_y[:, -self.pred_len:, :]
        outsample_mask = outsample_mask[:, -self.pred_len:, :]

        return batch_y, forecast, outsample_mask, Y

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss ('train_loss') for one batch."""
        # forward() already adds a missing batch_size dimension
        outsample_y, forecast, outsample_mask, Y = self(batch)

        loss = self.loss_fn_train(y=outsample_y,
                                  y_hat=forecast,
                                  mask=outsample_mask,
                                  y_insample=Y)

        self.log('train_loss', loss, prog_bar=True, on_epoch=True)

        return loss

    def validation_step(self, batch, idx):
        """Compute and log the validation loss ('val_loss') for one batch."""
        # forward() already adds a missing batch_size dimension
        outsample_y, forecast, outsample_mask, Y = self(batch)

        loss = self.loss_fn_valid(y=outsample_y,
                                  y_hat=forecast,
                                  mask=outsample_mask,
                                  y_insample=Y)

        self.log('val_loss', loss, prog_bar=True)

        return loss

    def on_fit_start(self):
        """Seed torch, numpy and `random` with `random_seed`."""
        torch.manual_seed(self.random_seed)
        np.random.seed(self.random_seed)
        random.seed(self.random_seed)

    def configure_optimizers(self):
        """Return an Adam optimizer with a StepLR learning-rate schedule."""
        optimizer = optim.Adam(self.model.parameters(),
                               lr=self.learning_rate,
                               weight_decay=self.weight_decay)

        lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                 step_size=self.lr_decay_step_size,
                                                 gamma=self.lr_decay)

        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler}

In [None]:
#export
@patch
def forecast(self: Transformer, Y_df: pd.DataFrame, X_df: pd.DataFrame = None, 
                S_df: pd.DataFrame = None, trainer: pl.Trainer =None) -> pd.DataFrame:
    """
    Method for forecasting self.pred_len periods after last timestamp of Y_df.

    The input dataframes are not modified; the method works on copies.
    As a side effect the inferred frequency is stored in `self.frequency`.

    Parameters
    ----------
    Y_df: pd.DataFrame
        Dataframe with target time-series data, needs 'unique_id','ds' and 'y' columns.
    X_df: pd.DataFrame
        Dataframe with exogenous time-series data, needs 'unique_id' and 'ds' columns.
        Note that 'unique_id' and 'ds' must match Y_df plus the forecasting horizon.
    S_df: pd.DataFrame
        Dataframe with static data, needs 'unique_id' column.
    trainer: pl.Trainer
        Trainer object used for prediction. When None, a default trainer
        is created (all GPUs if CUDA is available, CPU otherwise).

    Returns
    ----------
    forecast_df: pd.DataFrame
        Dataframe with forecasts ('unique_id', 'ds' and 'y' columns).

    Raises
    ------
    ValueError
        If Y_df is empty or the frequency of the first series cannot be inferred.
    """
    if Y_df.empty:
        raise ValueError("Y_df is empty, there is nothing to forecast from.")

    # Work on copies so the caller's dataframes are left untouched
    Y_df = Y_df.copy()
    Y_df['ds'] = pd.to_datetime(Y_df['ds'])
    if X_df is not None:
        X_df = X_df.copy()
        X_df['ds'] = pd.to_datetime(X_df['ds'])

    # Infer with first unique_id series; .iloc is positional, so this works
    # whatever index Y_df carries
    first_uid = Y_df['unique_id'].iloc[0]
    self.frequency = pd.infer_freq(Y_df.loc[Y_df['unique_id'] == first_uid, 'ds'])
    if self.frequency is None:
        raise ValueError("Could not infer the frequency of Y_df['ds'] "
                         f"from series '{first_uid}'.")

    # Add forecast dates to Y_df (placeholder y=0 for every series and date)
    forecast_dates = pd.date_range(Y_df['ds'].max(), periods=self.pred_len+1, freq=self.frequency)[1:]
    index = pd.MultiIndex.from_product([Y_df['unique_id'].unique(), forecast_dates], names=['unique_id', 'ds'])
    forecast_df = pd.DataFrame({'y': 0}, index=index).reset_index()

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    Y_df = pd.concat([Y_df, forecast_df]).sort_values(['unique_id','ds']).reset_index(drop=True)

    # Dataset, loader and trainer
    dataset = IterateWindowsDataset(S_df=S_df, Y_df=Y_df, X_df=X_df,
                                    mask_df=None, f_cols=[],
                                    input_size=self.seq_len,
                                    output_size=self.pred_len,
                                    ds_in_test=self.pred_len,
                                    is_test=True,
                                    verbose=True)

    loader = TimeSeriesLoader(dataset=dataset,
                                batch_size=1,
                                shuffle=False)

    if trainer is None:
        gpus = -1 if torch.cuda.is_available() else 0
        trainer = pl.Trainer(progress_bar_refresh_rate=1,
                             gpus=gpus,
                             logger=False)

    # Forecast
    outputs = trainer.predict(self, loader)

    # Process forecast and include in forecast_df.
    # Each output is the 4-tuple returned by Transformer.forward; only the
    # second element (the model forecast) is needed here.
    _, y_hat, _, _ = [torch.cat(output).cpu().numpy() for output in zip(*outputs)]
    # Swap the last two axes so that flattening walks the series first and
    # the time steps second, like the rows of forecast_df.
    # NOTE(review): assumes the dataset yields the series in the same order
    # as Y_df['unique_id'].unique() -- confirm against IterateWindowsDataset.
    y_hat = np.transpose(y_hat, (0, 2, 1))
    forecast_df['y'] = y_hat.flatten()

    return forecast_df


## Transformer Usage Example

### Load Data

In [None]:
from neuralforecast.data.datasets.long_horizon import LongHorizon

# Load the ETTm2 group of the long-horizon benchmark
Y_df, X_df, S_df = LongHorizon.load(directory='./data', group='ETTm2')
Y_df = Y_df.reset_index(drop=True)
# Shift the 'OT' series by 100 to observe differences between series
Y_df.loc[Y_df['unique_id'] == 'OT', 'y'] += 100

In [None]:
# Preview the first rows of the target dataframe
Y_df.head()

In [None]:
# Preview the first rows of the exogenous dataframe
X_df.head()

In [None]:
# Every exogenous column except the 'unique_id' and 'ds' identifiers
f_cols = list(X_df.drop(columns=['unique_id', 'ds']).columns)

### Declare Model and Data Parameters

In [None]:
# Model configuration: keyword arguments of Transformer.__init__
mc_model = {
    # Architecture parameters
    'seq_len': 96,
    'label_len': 48,
    'pred_len': 96,
    'output_attention': False,
    'enc_in': 7,
    'dec_in': 7,
    'd_model': 512,
    'c_out': 7,
    'embed': 'timeF',
    'freq': 'h',
    'dropout': 0.05,
    'factor': 1,
    'n_heads': 8,
    'd_ff': 2_048,
    'activation': 'gelu',
    'e_layers': 2,
    'd_layers': 1,
    # Loss functions
    'loss_train': 'MAE',
    'loss_hypar': 0.5,
    'loss_valid': 'MAE',
    # Optimization parameters
    'learning_rate': 0.001,
    'lr_decay': 0.5,
    'weight_decay': 0.,
    'lr_decay_step_size': 2,
    'random_seed': 1,
}

# Dataset parameters (window sizes follow the model configuration)
mc_data = {
    'mode': 'iterate_windows',
    'n_time_in': mc_model['seq_len'],
    'n_time_out': mc_model['pred_len'],
    'batch_size': 1,
    'scaler': None,
    'max_epochs': None,
    'max_steps': 10,
    'early_stop_patience': 20,
}

# Number of timestamps held out for validation and for test
len_val = 11_520
len_test = 11_520

### Instantiate Loaders and Model

In [None]:
from neuralforecast.data.tsdataset import IterateWindowsDataset

In [None]:
from torch.utils.data import DataLoader
from neuralforecast.experiments.utils import create_datasets

# Split the data into train / validation / test window datasets
train_dataset, val_dataset, test_dataset = create_datasets(
    mc=mc_data,
    S_df=None,
    Y_df=Y_df,
    X_df=X_df,
    f_cols=f_cols,
    ds_in_val=len_val,
    ds_in_test=len_test,
)

# Training batches are shuffled and an incomplete last batch is dropped
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=int(mc_data['batch_size']),
    shuffle=True,
    drop_last=True,
)

# Validation and test batches keep their order
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=int(mc_data['batch_size']),
    shuffle=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=int(mc_data['batch_size']),
    shuffle=False,
)

In [None]:
# Build the Lightning module from the model configuration dictionary
model = Transformer(**mc_model)

### Train Model

In [None]:
# Stop early when 'val_loss' (logged in validation_step) stops improving
early_stopping = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=mc_data['early_stop_patience'],
    verbose=False,
    mode="min",
)

# Training length is bounded by max_steps; max_epochs is None in mc_data
trainer = pl.Trainer(
    max_epochs=mc_data['max_epochs'],
    max_steps=mc_data['max_steps'],
    gradient_clip_val=1.0,
    progress_bar_refresh_rate=10,
    check_val_every_n_epoch=1,
    num_sanity_val_steps=1,
    val_check_interval=1,
    limit_val_batches=1,
    callbacks=[early_stopping],
)

trainer.fit(model, train_loader, val_loader)

### Make Predictions

In [None]:
#outputs = trainer.predict(model, val_loader)

#print("outputs[0][0].shape", outputs[0][0].shape)
#print("outputs[0][1].shape", outputs[0][1].shape)
#print("outputs[0][2].shape", outputs[0][2].shape)

### Forecast

In [None]:
# Keep only the target history before 2017-10-24 as forecasting input
Y_forecast_df = Y_df[Y_df['ds']<'2017-10-24'].reset_index(drop=True)
Y_forecast_df.tail()

In [None]:
# The exogenous data runs past the target cutoff because it must also
# cover the forecasting horizon
X_forecast_df = X_df[X_df['ds']<'2017-10-25'].reset_index(drop=True)
X_forecast_df['ds'] = pd.to_datetime(X_forecast_df['ds'])
X_forecast_df.tail()

In [None]:
# Forecast the pred_len steps that follow the last timestamp of Y_forecast_df;
# no trainer is passed, so forecast() creates a default one
forecast_df = model.forecast(Y_df=Y_forecast_df, X_df=X_forecast_df, S_df=S_df)

In [None]:
# Display the resulting forecasts
forecast_df