This notebook provides a reproducible example of using Optuna with an LSTM model in pytorch. We use the M5 dataset as a test case, training one local model per time series. There are 30,490 unique series in the dataset.

There are between 124 and 1969 observations per series. Over 25,000 series have more than 1000 observations.

In [1]:
import sys
sys.path.append('..')

In [2]:
# Get the m5 dataset
from datasetsforecast.m5 import M5
m5_dataset = M5(source_url='https://github.com/Nixtla/m5-forecasts/raw/main/datasets/m5.zip')
sales, calendar, heirarchy = m5_dataset.load(directory='../data')

In [4]:
import plotly.express as px
import pandas as pd
series_lengths = pd.DataFrame(sales.groupby('unique_id').size(), columns=['length'])
series_lengths = series_lengths.reset_index()
px.histogram(series_lengths, x='length', nbins=30)





In [6]:
# select 5 ids with a bit of variability
uids = sales['unique_id'].sample(5, random_state=4)
y_df = sales.query('unique_id in @uids')

from statsforecast import StatsForecast
StatsForecast.plot(y_df,engine='plotly')

We can leverage utilsforecast to generate a rolling window of data for neural network training off a single series. This is perfect for doing sequence to sequence modelling because it generates an input with a specified size (e.g. 180 days) and a target with a specified size (e.g. 14 days), along with the cutoff date. The cutoff data is the maximum data in the input data (the last date observed) We can use this to generate a large delta table or a pytorch dataset.

In [9]:
from utilsforecast.processing import backtest_splits
out = backtest_splits(
    y_df, 
    n_windows=10, 
    h=14, 
    freq='D', 
    id_col='unique_id', 
    time_col='ds', 
    step_size=2, 
    input_size=180
)

# delta friendly format
from src.utils import combine_backtest_split
combined = pd.concat([combine_backtest_split(x) for x in out])
# We now have 50 windows (10 x 5 series) with a 180 day input and 14 day output
combined[['unique_id','cutoff']].drop_duplicates().shape

(50, 2)

We can then take our Delta friendly format and convert it into a sequence to sequence dataset. This dataset follows the standard pytorch tensor format of (batch, channel, sequence length).

In [81]:
from src.utils import create_seq2seq_dataset
# This can easily be optimized
input, target = create_seq2seq_dataset(combined)
input.shape, target.shape

(torch.Size([50, 1, 180]), torch.Size([50, 1, 14]))

Now that we have an input and target, we can generate an encoder / decoder framework. We are going to use a sequence to seuqence model with 1D convolutions and LSTM cells.

In [123]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(
            self, 
            input_size, 
            lstm_hidden_size=128, 
            lstm_num_layers=1,
            cnv_padding=1,
            cnv_kernel_size=3):
        super(Encoder, self).__init__()
        self.conv1d = nn.Conv1d(input_size, lstm_hidden_size, kernel_size=cnv_kernel_size, padding=cnv_padding)
        self.lstm = nn.LSTM(lstm_hidden_size, lstm_hidden_size, lstm_num_layers, batch_first=True)
    
    def forward(self, x):
        x = self.conv1d(x)
        _, (hidden, cell) = self.lstm(x)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(Decoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden, cell):
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        prediction = self.linear(output)
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_size = target.shape[2]
        
        outputs = torch.zeros(batch_size, target_len, target_size).to(source.device)
        
        hidden, cell = self.encoder(source)
        
        input = source[:, -1:, :]  # Start with the last step of the input sequence
        
        for t in range(target_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t:t+1, :] = output
            
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            input = target[:, t:t+1, :] if teacher_force else output
        
        return outputs

In [129]:
input_size = input.shape[2]  # Number of features
hidden_size = 128
output_size = target.shape[2]  # Number of target variables

encoder = Encoder(
    input_size, 
    lstm_hidden_size=128, 
    lstm_num_layers=1,
    cnv_hidden_channels=4,
    cnv_padding=1,
    cnv_kernel_size=3
    )
decoder = Decoder(input_size, hidden_size, output_size)
model = Seq2Seq(encoder, decoder)

In [162]:
lstm_hidden_size=128
lstm_layers=1
lstm_activation="silu"
lstm_dropout=0.1
cnv_layers=2
cnv_hidden_channels=4
cnv_kernel_size=3
cnv_padding=1
cnv_activation="silu"
linear_layer_hidden_size: list[int] = [64, 32]
linear_layer_dropout: list[float] = [0.1, 0.5]
linear_layer_activation: list[str] = ["silu", "silu"]

activation_map = {
    'sigmoid': nn.Sigmoid(),
    'tanh': nn.Tanh(),
    'relu': nn.ReLU(),
    'silu': nn.SiLU(),
    'leakyrelu': nn.LeakyReLU(),
    'identity': nn.Identity()
}

assert len(linear_layer_hidden_size) == len(linear_layer_dropout) == len(linear_layer_activation)

layers = nn.Sequential()

for i in range(cnv_layers):
    in_channels = 1 if i == 0 else cnv_hidden_channels
    # add convolution
    layers = layers.append(
        nn.Conv1d(
            in_channels, 
            cnv_hidden_channels, 
            kernel_size=cnv_kernel_size, 
            padding=cnv_padding
            )
        )
    # add activation
    activation = activation_map[cnv_activation]
    layers = layers.append(activation)

# add lstm blocks
layers = layers.append(
    nn.LSTM(
        input_size=input.shape[-1], 
        hidden_size=lstm_hidden_size, 
        num_layers=lstm_layers, 
        batch_first=True,
        dropout=lstm_dropout
        )
    )

# add fully connected layers
for i in range(len(linear_layer_hidden_size)-1):
    layers = layers.append(
        nn.Linear(linear_layer_hidden_size[i], linear_layer_hidden_size[i+1])
        )
    layers = layers.append(
        activation_map[linear_layer_activation[i]]
        )

x = layers(input)

TypeError: linear(): argument 'input' (position 1) must be Tensor, not tuple

In [160]:
x[0].shape

torch.Size([50, 4, 128])

In [125]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("\nTotal trainable parameters:", count_parameters(model))


Total trainable parameters: 361870


In [126]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    output = model(input, target)
    loss = criterion(output, target)
    
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

RuntimeError: Given groups=1, weight of size [128, 180, 3], expected input[50, 1, 180] to have 180 channels, but got 1 channels instead

In [113]:
def predict(model, input):
    model.eval()
    with torch.no_grad():
        hidden, cell = model.encoder(input[0])
        decoder_input = input[:, -1, :]
        decoder_output, hidden, cell = model.decoder(decoder_input, hidden, cell)
        return decoder_output

# Example usage
predict(model, input[3:4])

tensor([[ 8.9407e-08,  1.0000e+00, -5.4948e-08,  3.0000e+00,  4.0000e+00,
          2.0000e+00,  2.0000e+00,  1.1176e-07, -5.9605e-08, -1.5274e-07,
          0.0000e+00, -4.4703e-08,  2.2352e-08,  1.7136e-07]])

In [196]:
import torch
import torch.nn as nn
import torch.optim as optim

class ConvLSTMHybrid(nn.Module):
    def __init__(
            self, 
            input_size,
            padding_size,
            kernel_size, 
            dilation_rate, 
            lstm_hidden_size=128
            ):
        super(ConvLSTMHybrid, self).__init__()

        # ensure padding is even
        padding_size = padding_size // 2

        # Convolutional layers
        self.temporal_conv = nn.Conv1d(in_channels=input_size, out_channels=input_size, kernel_size=kernel_size, padding=padding_size)
        self.dilated_conv = nn.Conv1d(in_channels=input_size, out_channels=lstm_hidden_size, kernel_size=kernel_size, dilation=dilation_rate, padding=dilation_rate*(kernel_size - 1) // 2)

        # LSTM cell
        self.lstm_cell = nn.LSTMCell(input_size=lstm_hidden_size, hidden_size=lstm_hidden_size)

        # Output layer
        self.fc = nn.Linear(lstm_hidden_size, 1)

    def forward(self, x):
        # Convolutional operations
        x = self.temporal_conv(x)
        x = self.dilated_conv(x)

        # LSTM operations
        batch_size, conv_out_channels, seq_len = x.size()
        h_t, c_t = torch.zeros(batch_size, self.lstm_cell.hidden_size).to(x.device), torch.zeros(batch_size, self.lstm_cell.hidden_size).to(x.device)

        outputs = []
        for t in range(seq_len):
            h_t, c_t = self.lstm_cell(x[:, :, t], (h_t, c_t))
            outputs.append(h_t.unsqueeze(1))

        outputs = torch.cat(outputs, dim=1)
        out = self.fc(outputs[:, -1, :])

        return out

In [203]:
model = ConvLSTMHybrid(input_size=1, kernel_size=5, padding_size=3, dilation_rate=2, lstm_hidden_size=128)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(20):
    optimizer.zero_grad()
    outputs = model(inputs.unsqueeze(0))
    loss = loss_fn(outputs, targets.unsqueeze(0))
    loss.backward()
    optimizer.step()


Using a target size (torch.Size([1, 1, 1825])) that is different to the input size (torch.Size([1, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.



In [21]:
# Setup a forecasting network
import torch.nn as nn

class ConvLSTM(nn.Module):
    def __init__(
            self, 
            input_size, 
            horizon, 
            cnv_layer_qty=2,
            cnv_padding=[2, 1],
            cnv_kernel_size=[5, 3],
            cnv_activation="silu",
            lstm_hidden_size=[128, 64],
            lstm_dropout=[0.1, 0.5],
            lstm_activation="silu",
            linear_layer_size=[64, 32],
            linear_layer_dropout=[0.1, 0.5],
            linear_layer_activation="silu",
            ):
        super(ConvLSTM, self).__init__()
        
        self.input_size = input_size
        self.horizon=horizon

        # Dictionary mapping activation names to nn modules
        self.activation_map = {
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh(),
            'relu': nn.ReLU(),
            'silu': nn.SiLU(),
            'leakyrelu': nn.LeakyReLU(),
            'identity': nn.Identity()
        }

        # Convolutional layers
        self.conv_layers = nn.ModuleList()
        for i in range(cnv_layer_qty):
            layer_input_channels = self.input_size if i == 0 else hidden_channels
            conv_layer = nn.Conv1d(
                layer_input_channels,
                hidden_channels,
                kernel_size,
                padding=padding
            )
            self.conv_layers.append(conv_layer)

        # LSTM layer after convolutions
        self.lstm = nn.LSTM(
            input_size=hidden_channels,
            hidden_size=hidden_channels,
            num_layers=1,
            batch_first=True
        )

    def forward(self, x):
        batch_size, channels, seq_len = x.size()
        
        # Apply convolutional layers
        for conv in self.conv_layers:
            x = self.activation(conv(x))
            
        # Reshape for LSTM: (batch, channels, seq_len) -> (batch, seq_len, channels)
        x = x.permute(0, 2, 1)
        
        # Apply LSTM
        lstm_out, (h_n, c_n) = self.lstm(x)
        
        return lstm_out, (h_n, c_n)


In [20]:
class ConvLSTMForecaster(nn.Module):
    def __init__(
            self, 
            horizon,
            
            cnv_layer_qty=2,
            cnv_in_channels=[128],
            cnv_padding=[2, 1],
            cnv_kernel_size=[5, 3],
            cnv_activation=["silu", "silu"],
            lstm_layer_qty=2,
            lstm_hidden_size=[128, 64],
            lstm_dropout=0.1,
            lstm_activation="silu",
            lstm_dropout=[0.1, 0.5],
            linear_layer_b_qty=2,
            linear_layer_b_size=[64, 32],
            linear_layer_b_dropout=[0.1, 0.5],
            linear_layer_b_activation=["silu", "silu"],
            ):
    
        


IndentationError: expected an indented block after function definition on line 5 (3388822989.py, line 27)