In [None]:
# default_exp losses.pytorch

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# PyTorch Losses

> The most important training signal is the forecast error, which is the difference between the observed value $y_{\\tau}$ and the prediction $\\hat{y}_{\\tau}$, at time $\\tau$: $$e_{\\tau} = y_{\\tau}-\\hat{y}_{\\tau} \\qquad \\qquad \\tau \\in \\{t+1,\\dots,t+H \\}.$$The train loss summarizes the forecast errors in different training objectives: <br><br> 1. Scale-dependent errors - These metrics are on the same scale as the data. <ul><li>MAELoss</li><li>MSELoss</li><li>RMSELoss</li></ul>   <br> 2. Percentage errors - These metrics are unit-free, suitable for comparisons across series. <ul><li>MAPELoss</li><li>SMAPELoss</li></ul> <br> 3. Scale-independent errors - These metrics measure the relative improvement versus a baseline; the available metrics are <ul><li>MASELoss</li><li>RMAELoss</li></ul>   <br>4. Probabilistic errors - These measure absolute deviation non-symmetrically, penalizing under- and over-estimation differently. <ul><li>QuantileLoss</li><li>MQLoss</li><li>wMQLoss</li></ul> <br> 5. Other errors - Additionally, two loss functions related to the M4 competition winner, ESRNN. <ul><li>LevelVariabilityLoss</li><li>SmylLoss (QL+LevelLoss)</li></ul>

In [None]:
#export
import torch as t
import torch.nn as nn

In [None]:
#hide
from IPython.display import Image
# Display size (in pixels) passed to every loss illustration in this notebook.
WIDTH, HEIGHT = 600, 300

In [None]:
#export
def _divide_no_nan(a: float, b: float) -> float:
    """
    Auxiliary funtion to handle divide by 0
    """
    div = a / b
    div[div != div] = 0.0
    div[div == float('inf')] = 0.0
    return div

# <span style="color:DarkOrange">1. Scale-dependent Errors </span>

## Mean Absolute Error

In [None]:
# export
def MAELoss(y: t.Tensor, y_hat: t.Tensor, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Mean Absolute Error (MAE) between `y` and `y_hat`.

    MAE measures prediction accuracy on the scale of the data: it takes
    the absolute deviation between each prediction and the observed
    value and averages these deviations.

    $$ \mathrm{MAE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}) =
        \frac{1}{H} \sum^{t+H}_{\tau=t+1}
        |y_{\tau} - \hat{y}_{\tau}| $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones. Masked entries contribute zero error
        but are still counted by the mean.

    Returns
    -------
    mae: tensor (single value).
        Mean absolute error.
    """
    abs_error = t.abs(y - y_hat)
    if mask is None:
        weights = t.ones_like(y_hat)
    else:
        weights = mask
    return t.mean(abs_error * weights)

In [None]:
#hide_input
mae_loss_image = Image(
    filename='loss_imgs/mae_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
mae_loss_image  # last expression: rendered as the cell output

## Mean Squared Error

In [None]:
# export
def MSELoss(y: t.Tensor, y_hat: t.Tensor, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Mean Squared Error (MSE) between `y` and `y_hat`.

    MSE measures prediction accuracy by squaring the deviation between
    each prediction and the observed value and averaging the squared
    deviations.

    $$ \mathrm{MSE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}) =
        \frac{1}{H} \sum^{t+H}_{\tau=t+1} (y_{\tau} - \hat{y}_{\tau})^{2} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones. Masked entries contribute zero error
        but are still counted by the mean.

    Returns
    -------
    mse: tensor (single value).
        Mean squared error.
    """
    weights = t.ones_like(y_hat) if mask is None else mask
    squared_error = (y - y_hat) ** 2
    return t.mean(weights * squared_error)

In [None]:
#hide_input
mse_image = Image(
    filename='loss_imgs/mse_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
mse_image  # last expression: rendered as the cell output

## Root Mean Squared Error

In [None]:
# export
def RMSELoss(y: t.Tensor, y_hat: t.Tensor, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Root Mean Squared Error (RMSE) between `y` and `y_hat`.

    RMSE squares the deviation between each prediction and the observed
    value, averages the squared deviations and takes the square root.
    The result is on the same scale as the original series, so it is
    only comparable across series that share a common scale.
    RMSE has a direct connection to the L2 norm.

    $$ \mathrm{RMSE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}) =
        \sqrt{\frac{1}{H} \sum^{t+H}_{\tau=t+1} (y_{\tau} - \hat{y}_{\tau})^{2}} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones. Masked entries contribute zero error
        but are still counted by the mean.

    Returns
    -------
    rmse: tensor (single value).
        Root mean squared error.
    """
    weights = t.ones_like(y_hat) if mask is None else mask
    squared_error = (y - y_hat) ** 2
    mean_squared_error = t.mean(weights * squared_error)
    return t.sqrt(mean_squared_error)

In [None]:
#hide_input
rmse_loss_image = Image(
    filename='loss_imgs/rmse_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
rmse_loss_image  # last expression: rendered as the cell output

# <span style="color:DarkOrange">2. Percentage Errors </span>

## Mean Absolute Percentage Error

In [None]:
#export
def MAPELoss(y: t.Tensor, y_hat: t.Tensor, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Mean Absolute Percentage Error (MAPE) between `y` and `y_hat`.

    MAPE measures relative prediction accuracy: the absolute deviation
    between each prediction and the observed value is divided by the
    absolute observed value, and these ratios are averaged.
    The closer to zero an observed value is, the higher the penalty
    MAPE assigns to the corresponding error.

    $$ \mathrm{MAPE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}) =
        \frac{1}{H} \sum^{t+H}_{\tau=t+1}
        \frac{|y_{\tau}-\hat{y}_{\tau}|}{|y_{\tau}|} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones.

    Returns
    -------
    mape: tensor (single value).
        Mean absolute percentage error.
    """
    weights = t.ones_like(y_hat) if mask is None else mask
    # Fold the 1/|y| scaling into the mask; the safe division turns the
    # weight of entries with y == 0 into 0 instead of inf/NaN.
    scaled_weights = _divide_no_nan(weights, t.abs(y))
    abs_pct_error = t.abs(y - y_hat) * scaled_weights
    return t.mean(abs_pct_error)

In [None]:
#hide_input
mape_image = Image(
    filename='loss_imgs/mape_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
mape_image  # last expression: rendered as the cell output

## Symmetric Mean Absolute Percentage Error

In [None]:
# export
def SMAPELoss(y: t.Tensor, y_hat: t.Tensor, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Symmetric Mean Absolute Percentage Error (SMAPE) between `y` and `y_hat`.

    SMAPE measures relative prediction accuracy: the absolute deviation
    between each prediction and the observed value is scaled by the sum
    of their absolute values, and the scaled deviations are averaged.
    The average is multiplied by 200, so SMAPE is bounded between
    0% and 200%, which is desirable compared to the plain MAPE that
    may be undetermined when the target is zero.

    $$ \mathrm{sMAPE}_{2}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}) =
       \frac{200}{H} \sum^{t+H}_{\tau=t+1}
       \frac{|y_{\tau}-\hat{y}_{\tau}|}{|y_{\tau}|+|\hat{y}_{\tau}|} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones.

    Returns
    -------
    smape: tensor (single value).
        Symmetric mean absolute percentage error.
    """
    weights = t.ones_like(y_hat) if mask is None else mask

    abs_error = t.abs(y - y_hat)
    denominator = t.abs(y) + t.abs(y_hat)
    # Entries where both y and y_hat are zero (0/0) contribute zero.
    ratio = _divide_no_nan(abs_error, denominator) * weights
    return 200 * t.mean(ratio)

# <span style="color:DarkOrange">3. Scale-independent Errors </span>

## Mean Absolute Scaled Error

In [None]:
# export
def MASELoss(y: t.Tensor, y_hat: t.Tensor, y_insample: t.Tensor,
                seasonality: int, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Mean Absolute Scaled Error (MASE) between `y` and `y_hat`.

    MASE measures relative prediction accuracy by comparing the
    absolute errors of the prediction against the mean absolute error
    of the seasonal naive model computed on the in-sample values.
    MASE is one of the components of the Overall Weighted Average (OWA)
    used in the M4 Competition.

    $$ \mathrm{MASE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}, \mathbf{\hat{y}}^{season}_{\tau}) =
        \frac{1}{H} \sum^{t+H}_{\tau=t+1} \frac{|y_{\tau}-\hat{y}_{\tau}|}{\mathrm{MAE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}^{season}_{\tau})} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    y_insample: tensor (batch_size, input_size).
        In-sample values; the seasonal naive error is computed as the
        difference between entries `seasonality` steps apart.
    seasonality: int.
        Main frequency of the time series;
        Hourly 24,  Daily 7, Weekly 52,
        Monthly 12, Quarterly 4, Yearly 1.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones.

    Returns
    -------
    mase: tensor (single value).
        Mean absolute scaled error.

    References
    ----------
    [1] https://robjhyndman.com/papers/mase.pdf
    """
    weights = t.ones_like(y_hat) if mask is None else mask

    abs_error = t.abs(y - y_hat)
    # Per-series MAE of the seasonal naive forecast on the in-sample window.
    naive_error = t.abs(y_insample[:, seasonality:] - y_insample[:, :-seasonality])
    scale = t.mean(naive_error, dim=1)
    # Broadcast the per-series scale over the horizon dimension.
    scaled_error = _divide_no_nan(abs_error, scale.unsqueeze(1)) * weights
    return t.mean(scaled_error)

In [None]:
#hide_input
mase_loss_image = Image(
    filename='loss_imgs/mase_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
mase_loss_image  # last expression: rendered as the cell output

## Relative Mean Absolute Error

In [None]:
#export 
def RMAELoss(y: t.Tensor, y_hat1: t.Tensor, y_hat2: t.Tensor, mask: t.Tensor =None) -> t.Tensor:
    r"""
    Relative Mean Absolute Error (RMAE) between two sets of forecasts
    (from two different forecasting methods).

    A number smaller than one implies that the forecast in the
    numerator (`y_hat1`) is better than the forecast in the
    denominator (`y_hat2`).

    $$ \mathrm{RMAE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}_{\tau}, \mathbf{\hat{y}}^{base}_{\tau}) =
        \frac{1}{H} \sum^{t+H}_{\tau=t+1} \frac{|y_{\tau}-\hat{y}_{\tau}|}{\mathrm{MAE}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}^{base}_{\tau})} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat1: tensor (batch_size, output_size).
        Predicted values of the evaluated method (numerator).
    y_hat2: tensor (batch_size, output_size).
        Predicted values of the baseline method (denominator).
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones. The same mask is applied to both MAEs.

    Returns
    -------
    rmae: tensor (single value).
        Relative mean absolute error; 0 when the baseline MAE is 0.
    """
    if mask is None: mask = t.ones_like(y_hat1)

    # The mask must reach both MAE terms, otherwise it is silently ignored.
    numerator = MAELoss(y=y, y_hat=y_hat1, mask=mask)
    denominator = MAELoss(y=y, y_hat=y_hat2, mask=mask)
    rmae_loss = _divide_no_nan(numerator, denominator)

    return rmae_loss

In [None]:
#hide_input
rmae_loss_image = Image(
    filename='loss_imgs/rmae_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
rmae_loss_image  # last expression: rendered as the cell output

# <span style="color:DarkOrange">4. Probabilistic Errors </span>

## Quantile Loss

In [None]:
# export
def QuantileLoss(y: t.Tensor, y_hat: t.Tensor, mask: t.Tensor =None,
                    q: float =0.5) -> t.Tensor:
    r"""
    Quantile loss (QL) between `y` and `y_hat`.

    QL measures the deviation of a quantile forecast. By weighting the
    absolute deviation in a non-symmetric way, the loss pays more
    attention to under- or over-estimation.
    A common value for q is 0.5 for the deviation from the median
    (Pinball loss).

    $$ \mathrm{QL}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}^{(q)}_{\tau}) =
        \frac{1}{H} \sum^{t+H}_{\tau=t+1}
        \Big( (1-q)\,( \hat{y}^{(q)}_{\tau} - y_{\tau} )_{+}
        + q\,( y_{\tau} - \hat{y}^{(q)}_{\tau} )_{+} \Big) $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones.
    q: float, between 0 and 1.
        The slope of the quantile loss; in the context of
        quantile regression, q determines the conditional
        quantile level.

    Returns
    -------
    quantile_loss: tensor (single value).
        Average quantile loss.
    """
    weights = t.ones_like(y_hat) if mask is None else mask

    residual = y - y_hat
    # For residual >= 0 the first term is the larger one (weight q),
    # for residual < 0 the second one is (weight 1 - q).
    under_estimation = q * residual
    over_estimation = (q - 1) * residual
    pinball = t.max(under_estimation, over_estimation) * weights
    return t.mean(pinball)

In [None]:
#hide_input
qloss_image = Image(
    filename='loss_imgs/q_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
qloss_image  # last expression: rendered as the cell output

## Multi-Quantile Loss

In [None]:
#export
def MQLoss(y: t.Tensor, y_hat: t.Tensor, quantiles: t.Tensor,
            mask: t.Tensor =None) -> t.Tensor:
    r"""
    Multi-Quantile loss (MQL) between `y` and `y_hat`.

    MQL calculates the average quantile loss for a given set of
    quantiles, based on the absolute difference between predicted
    quantiles and observed values.

    $$ \mathrm{MQL}(\mathbf{y}_{\tau},
                    [\mathbf{\hat{y}}^{(q_{1})}_{\tau}, ... ,\hat{y}^{(q_{n})}_{\tau}]) =
       \frac{1}{n} \sum_{q_{i}} \mathrm{QL}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}^{(q_{i})}_{\tau}) $$

    The limit behavior of MQL allows measuring the accuracy
    of a full predictive distribution $\mathbf{\hat{F}}_{\tau}$ with
    the continuous ranked probability score (CRPS). This can be achieved
    through a numerical integration technique that discretizes the quantiles
    and treats the CRPS integral with a left Riemann approximation, averaging
    over uniformly distanced quantiles.

    $$ \mathrm{CRPS}(y_{\tau}, \mathbf{\hat{F}}_{\tau}) =
        \int^{1}_{0} \mathrm{QL}(y_{\tau}, \hat{y}^{(q)}_{\tau}) dq $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size, n_quantiles).
        Predicted quantiles in torch tensor; the last dimension is
        aligned with `quantiles`.
    quantiles: tensor (n_quantiles).
        Quantiles to estimate from the distribution of y.
        Must contain more than one quantile.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones. It is normalized to sum to one, so the
        result is a weighted average over the unmasked entries.

    Returns
    -------
    mqloss: tensor (single value).
        Average multi-quantile loss.

    References
    ----------
    [1] https://www.jstor.org/stable/2629907
    """
    assert len(quantiles) > 1, f'your quantiles are of len: {len(quantiles)}'

    n_q = len(quantiles)
    weights = t.ones_like(y) if mask is None else mask

    # Add a trailing axis to y so it broadcasts against every quantile.
    error = y_hat - y.unsqueeze(-1)
    zeros = t.zeros_like(error)
    under_estimation = t.maximum(-error, zeros)
    over_estimation = t.maximum(error, zeros)
    per_quantile = quantiles * under_estimation + (1 - quantiles) * over_estimation

    # Normalize the weights, then align them with the quantile axis.
    weights = (weights / t.sum(weights)).unsqueeze(-1)
    return t.sum((1/n_q) * per_quantile * weights)

In [None]:
#hide_input
mqloss_image = Image(
    filename='loss_imgs/mq_loss.png',
    width=WIDTH,
    height=HEIGHT,
)
mqloss_image  # last expression: rendered as the cell output

## Weighted Multi-Quantile Loss

In [None]:
#export
def wMQLoss(y: t.Tensor, y_hat: t.Tensor, quantiles: t.Tensor,
            mask: t.Tensor =None) -> t.Tensor:
    r"""
    Weighted Multi-Quantile loss (WMQL) between `y` and `y_hat`.

    WMQL calculates the weighted average quantile loss for a given set
    of quantiles, based on the absolute difference between predicted
    quantiles and observed values. For each series the summed quantile
    loss is divided by the summed absolute observed values.

    $$ \mathrm{WMQL}(\mathbf{y}_{\tau},
                    [\mathbf{\hat{y}}^{(q_{1})}_{\tau}, ... ,\hat{y}^{(q_{n})}_{\tau}]) =
       \frac{1}{n} \sum_{q_{i}}
           \frac{\mathrm{QL}(\mathbf{y}_{\tau}, \mathbf{\hat{y}}^{(q_{i})}_{\tau})}
            {\sum^{t+H}_{\tau=t+1} |y_{\tau}|} $$

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size, n_quantiles).
        Predicted quantiles in torch tensor; the last dimension is
        aligned with `quantiles`.
    quantiles: tensor (n_quantiles).
        Quantiles to estimate from the distribution of y.
        Must contain more than one quantile.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        Defaults to all ones. A mask that already carries a trailing
        quantile axis, (batch_size, output_size, n_quantiles), is
        also accepted.

    Returns
    -------
    wmqloss: tensor (single value).
        Weighted average multi-quantile loss.
    """
    assert len(quantiles) > 1, f'your quantiles are of len: {len(quantiles)}'

    if mask is None: mask = t.ones_like(y)
    # A mask shaped like y needs a trailing axis to broadcast against the
    # (batch, horizon, n_quantiles) loss; masks that already have the
    # quantile axis are used as given.
    if mask.dim() == y.dim():
        mask = mask.unsqueeze(-1)

    error = y_hat - y.unsqueeze(-1)

    sq = t.maximum(-error, t.zeros_like(error))
    s1_q = t.maximum(error, t.zeros_like(error))
    loss = (quantiles * sq + (1 - quantiles) * s1_q)

    # Sum over the horizon axis: per-series loss divided by per-series |y|.
    wmqloss = _divide_no_nan(t.sum(loss * mask, dim=-2),
                             t.sum(t.abs(y.unsqueeze(-1)) * mask, dim=-2))

    return t.mean(wmqloss)

# <span style="color:DarkOrange">5. Other Errors </span>

## ES-RNN PyTorch Loss

The M4 competition winner, the exponential smoothing recurrent neural network (ES-RNN),
combines the Holt-Winters method for the seasonal and level components
with a dilated recurrent neural network. It is defined by:

* $\text{Level:} \quad l_{\tau} = \text{median}(y_{\tau})$
* $\text{Residual:} \quad z_{\tau} = y_{\tau} - l_{\tau}$
* $\text{NN Forecast:} \quad \hat{z}_{\tau} = \text{DRNN}(z_{\tau}, x_{\tau}, s_{\tau})$
* $\text{Level Forecast:} \quad \hat{l}_{\tau} = \text{Naive}(l_{\tau})$
* $\text{Forecast:} \quad \hat{y}_{\tau+H} = \hat{l}_{\tau+H} + \hat{z}_{\tau+H}$

In [None]:
#export
def LevelVariabilityLoss(levels: t.Tensor, level_variability_penalty: float) -> t.Tensor:
    r"""
    Variability penalty for the level of the ES-RNN.
    The levels of the ES-RNN are based on the Holt-Winters model.

    The penalty is the mean squared second difference of the
    log-levels, scaled by $\lambda$ (`level_variability_penalty`):

    $$ d_{\tau} = \log \hat{l}_{\tau} - \log \hat{l}_{\tau+1}, \qquad
       Penalty = \lambda \cdot \mathrm{mean}\big((d_{\tau} - d_{\tau+1})^{2}\big) $$

    Parameters
    ----------
    levels: tensor with shape (batch, n_time).
        Levels obtained from exponential smoothing component of ESRNN.
        Needs more than two time steps.
    level_variability_penalty: float.
        This parameter controls the strength of the penalization
        of the wiggliness of the level vector; it induces smoothness
        in the output.

    Returns
    ----------
    level_var_loss: tensor (single value).
        Wiggliness loss for the level vector.
    """
    assert levels.shape[1] > 2

    log_levels = t.log(levels)
    # First difference of consecutive log-levels ...
    first_diff = log_levels[:, :-1] - log_levels[:, 1:]
    # ... and the difference of consecutive first differences.
    second_diff = first_diff[:, :-1] - first_diff[:, 1:]

    return (second_diff ** 2).mean() * level_variability_penalty

In [None]:
#export
def SmylLoss(y: t.Tensor, y_hat: t.Tensor, levels: t.Tensor, 
                mask: t.Tensor, tau: float, level_variability_penalty: float =0.0) -> t.Tensor:
    """
    Smyl loss: quantile loss plus an optional level variability
    regularization.

    Parameters
    ----------
    y: tensor (batch_size, output_size).
        Actual values in torch tensor.
    y_hat: tensor (batch_size, output_size).
        Predicted values in torch tensor.
    levels: tensor with shape (batch, n_time).
        Levels obtained from exponential smoothing component of ESRNN.
        Only used when `level_variability_penalty` is positive.
    mask: tensor (batch_size, output_size).
        Specifies date stamps per series to consider in loss.
        `None` is treated as all ones.
    tau: float, between 0 and 1.
        The slope of the quantile loss; in the context of
        quantile regression, tau determines the conditional
        quantile level.
    level_variability_penalty: float.
        This parameter controls the strength of the penalization
        of the wiggliness of the level vector; it induces smoothness
        in the output.

    Returns
    ----------
    smyl_loss: tensor (single value).
        Smyl loss.
    """
    weights = t.ones_like(y_hat) if mask is None else mask

    smyl_loss = QuantileLoss(y, y_hat, weights, tau)

    # Without a positive penalty the loss is the plain quantile loss.
    if not level_variability_penalty > 0:
        return smyl_loss

    smyl_loss += LevelVariabilityLoss(levels, level_variability_penalty)
    return smyl_loss

In [None]:
#hide
# Checks for PyTorch train losses

In [None]:
#hide
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

import time
from scipy.stats import hmean
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#hide
class Model(nn.Module):
    """
    Single bias-free linear layer that emits one forecast per horizon
    step and quantile; used only to sanity-check the train losses.

    Parameters
    ----------
    horizon: int.
        Number of forecast steps.
    n_quantiles: int.
        Number of quantiles predicted per step.
    in_features: int, optional.
        Size of each input sample. When omitted, the notebook-level
        variable `n_obs` is used, which must then be defined before
        the model is instantiated.
    """

    def __init__(self, horizon, n_quantiles, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible fallback to the notebook global.
            in_features = n_obs
        self.horizon = horizon
        self.n_quantiles = n_quantiles
        self.linear_layer = nn.Linear(in_features=in_features,
                                      out_features=horizon * n_quantiles,
                                      bias=False)

    def forward(self, x):
        """Map x (batch, in_features) to (batch, horizon, n_quantiles)."""
        y_hat = self.linear_layer(x)
        y_hat = y_hat.view(-1, self.horizon, self.n_quantiles)
        return y_hat
    
class Data(Dataset):
    """Map-style dataset that pairs row `i` of X with row `i` of Y."""

    def __init__(self, Y, X):
        # The number of samples is taken from the first axis of Y.
        self.X, self.Y = X, Y
        self.len = Y.shape[0]

    def __getitem__(self, index):
        """Return the (input, target) pair stored at `index`."""
        features = self.X[index]
        target = self.Y[index]
        return features, target

    def __len__(self):
        """Number of samples in the dataset."""
        return self.len

In [None]:
#hide
# Hyperparameters and sample data parameters
# Seed the global generator: the sample data below is drawn on the CPU,
# which `t.cuda.manual_seed` alone would leave unseeded.
t.manual_seed(7)

# Sample data
n_ts = 1000
n_obs = horizon = 10
mean = 0.0 # to generate random numbers from N(mean, std)
std = 7.0 # to generate random numbers from N(mean, std)
start = 0.05 # First quantile
end = 0.95 # Last quantiles
steps = 4 # Number of quantiles

# Hyperparameters
batch_size = 500
lr = 0.08
epochs = 100

# Sample data
# The literal grid equals `steps` evenly spaced quantiles from `start` to `end`.
quantiles = t.Tensor([0.0500, 0.3500, 0.6500, 0.9500])
print(f'quantiles:\n{quantiles}')
Y = t.normal(mean=mean, std=std, size=(n_ts, n_obs))
X = t.ones(size=(n_ts, n_obs))

Y_test = t.normal(mean=mean, std=std, size=(n_ts, horizon))
X_test = t.ones(size=(n_ts, horizon))
print(f'Y.shape: {Y.shape}, X.shape: {X.shape}')
print(f'Y_test.shape: {Y_test.shape}, X_test.shape: {X_test.shape}')

In [None]:
#hide
# Model training: wire the toy data, the model and its optimizer together.
dataset = Data(Y=Y, X=X)
dataloader = DataLoader(dataset, batch_size=batch_size)
model = Model(horizon, n_quantiles=len(quantiles))
optimizer = optim.Adam(model.parameters(), lr=lr)

def train_model(model, epochs, print_progress=False):
    """
    Train `model` with MQLoss over the notebook-level `dataloader`.

    Besides its arguments, the function reads the notebook globals
    `dataloader`, `optimizer` and `quantiles`.
    NOTE(review): `optimizer` is created outside this function from the
    global model's parameters, so passing a different `model` would
    presumably not update that model's weights -- confirm before reuse.

    Parameters
    ----------
    model: callable mapping a batch x to predictions y_hat.
    epochs: int.
        Number of passes over `dataloader`.
    print_progress: bool.
        If True, print step, elapsed time and loss after every step.

    Returns
    -------
    model: the model that was passed in.
    training_trajectory: dict.
        'epoch' holds the logged global step counters and
        'train_loss' the matching loss values.
    """
    start = time.time()
    step = 0
    logged_steps, logged_losses = [], []

    for epoch in range(epochs):
        for x, y in dataloader:
            step += 1

            y_hat = model(x)
            # wMQLoss(y=y, y_hat=y_hat, quantiles=quantiles) is a drop-in alternative.
            training_loss = MQLoss(y=y, y_hat=y_hat, quantiles=quantiles)

            # Log only the steps divisible by the 1-based epoch number.
            if step % (epoch + 1) == 0:
                logged_steps.append(step)
                logged_losses.append(training_loss.detach().numpy())

            optimizer.zero_grad()
            training_loss.backward()
            optimizer.step()

            if print_progress:
                elapsed = time.time() - start
                loss_value = training_loss.cpu().data.numpy()
                print('Step: {}, Time: {:03.3f}, Insample {}: {:.5f}'.format(
                    step, elapsed, "MQLoss", loss_value))

    training_trajectory = {'epoch': logged_steps,
                           'train_loss': logged_losses}
    return model, training_trajectory