In [1]:
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
# import dataset, network to train and metric to optimize
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, RecurrentNetwork, QuantileLoss
from pytorch_forecasting.data.encoders import NaNLabelEncoder

In [2]:
# Data preparation. The forecasting dataset needs a long-format pandas dataframe with
# at least one column for each of:
# * the target (the value to predict)
# * the timeseries ID (a unique identifier per timeseries)
# * the time of the observation (a monotonically increasing integer)
#
# TRANSFORM_DATA = True rebuilds 'data.csv' from the wide 'spotData.csv';
# TRANSFORM_DATA = False reuses the 'data.csv' written by an earlier run.
TRANSFORM_DATA = True
if not TRANSFORM_DATA:
    raw_data = pd.read_csv('data.csv')
else:
    raw_data = pd.read_csv('spotData.csv')
    # Keep only rows whose value in this one snapshot column is not the 'N/A*' marker.
    # The other snapshot columns are not checked.
    raw_data = raw_data.loc[raw_data['2021-11-15 16:40:32.509429'] != 'N/A*']
    # Snapshot columns are recognised purely by their '2021' name prefix.
    is_date = raw_data.columns.str.startswith('2021')
    date_columns = list(raw_data.columns[is_date])
    not_date_columns = list(raw_data.columns[~is_date])
    # Wide -> long: one output row per (original row, snapshot column).
    # NOTE(review): the melted cell values ('blabla') are dropped right away, so every
    # non-date column is simply repeated for each timestamp. If the snapshot columns
    # hold the per-timestamp prices, that information is lost here - TODO confirm.
    raw_data = raw_data.melt(
        id_vars=not_date_columns,
        value_vars=date_columns,
        var_name='date',
        value_name='blabla',
    )
    raw_data = raw_data.drop(columns=['blabla'])
    raw_data.to_csv('data.csv', index=False)

In [3]:
# Build the typed modelling frame from raw_data (raw_data itself is left untouched).
# Descriptive columns become categoricals; 'date' is parsed into timestamps.
# The explicit [ns] unit is required: pandas 2.x rejects the unit-less 'datetime64'.
data = raw_data.astype({
    'Region': 'category',
    'instanceType': 'category',
    'major': 'category',
    'minor': 'category',
    'Type': 'category',
    'OS': 'category',
    'date': 'datetime64[ns]',
})
# Map every distinct timestamp to its 0-based position in chronological order.
# NOTE(review): consecutive integers imply evenly spaced steps; that holds only if
# the snapshots were taken at a regular interval - TODO confirm.
dates = {v: k for k, v in enumerate(data['date'].drop_duplicates().sort_values())}
# Series.map does the dictionary lookup without a Python-level lambda call per row.
data['time_idx'] = data['date'].map(dates)
# Values that cannot be parsed as numbers become NaN (errors='coerce').
data['Price'] = pd.to_numeric(data['Price'], errors='coerce')

In [4]:
# Preview the observations dated up to '2021-12-03'. A date-only string is compared
# as midnight, so rows later on 2021-12-03 itself are not included.
data.loc[data['date'] <= '2021-12-03']

Unnamed: 0,Region,instanceType,major,minor,Type,OS,Price,date,time_idx
0,us-east,a1.medium,a1,medium,generalCurrentGen,linux,0.0084,2021-11-15 16:40:32.509429,0
1,us-east,a1.large,a1,large,generalCurrentGen,linux,0.0217,2021-11-15 16:40:32.509429,0
2,us-east,a1.xlarge,a1,xlarge,generalCurrentGen,linux,0.0341,2021-11-15 16:40:32.509429,0
3,us-east,a1.2xlarge,a1,2xlarge,generalCurrentGen,linux,0.0671,2021-11-15 16:40:32.509429,0
4,us-east,a1.4xlarge,a1,4xlarge,generalCurrentGen,linux,0.1343,2021-11-15 16:40:32.509429,0
...,...,...,...,...,...,...,...,...,...
7610217,eu-south-1,m5d.metal,m5d,metal,hiMemCurrentGen,mswin,5.9431,2021-12-02 23:46:42.905937,738
7610218,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,linux,1.5998,2021-12-02 23:46:42.905937,738
7610219,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,mswin,6.0158,2021-12-02 23:46:42.905937,738
7610220,eu-south-1,r5d.metal,r5d,metal,hiMemCurrentGen,linux,1.5998,2021-12-02 23:46:42.905937,738


In [25]:
# Describe the dataframe to the model: which column is time, which is the target,
# which columns identify a series and which are covariates.

# Window sizes, counted in time_idx steps.
# NOTE(review): 2 * 24 * 7 and 2 * 24 * 2 read as "7 days" and "2 days" of
# half-hourly observations - TODO confirm the sampling interval.
max_encoder_length = 2 * 24 * 7
max_prediction_length = 2 * 24 * 2
# Hold out the final max_prediction_length steps; training only sees earlier steps.
training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    data.loc[data["time_idx"] <= training_cutoff],
    # time axis and prediction target
    time_idx='time_idx',
    target='Price',
    # one timeseries per (Region, OS, instanceType) combination
    group_ids=['Region', 'OS', 'instanceType'],
    # history window: keep the encoder long (as it is in the validation set)
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    # forecast horizon
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # covariates that are static for a timeseries ID
    static_categoricals=['major', 'minor', 'Type'],
    # covariates known / unknown in the future
    time_varying_known_categoricals=[],
    time_varying_unknown_categoricals=[],
    time_varying_known_reals=['time_idx'],
    time_varying_unknown_reals=['Price'],
)

In [26]:
# Validation set: built from the training dataset's configuration so it is encoded
# and normalized the same way. predict=True means: predict the last
# max_prediction_length points in time for each series.
validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    predict=True,
    stop_randomization=True,
)

# Wrap both datasets in dataloaders; the training loader is created with train=True,
# the validation loader with train=False.
batch_size = 128
train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=2,
)
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=batch_size,
    num_workers=2,
)

In [27]:
# Number of samples in the training and validation datasets.
(len(training), len(validation))

(8938664, 10298)

In [28]:
import warnings

# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and data-quality warnings. Narrow the filter if that matters.
warnings.filterwarnings("ignore")

# create PyTorch Lightning Trainer with early stopping
# Stop as soon as val_loss fails to improve by at least min_delta for one validation run.
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
    max_epochs=100,
    # Train on one GPU when CUDA is available, otherwise fall back to CPU so the
    # notebook still runs on a machine without a GPU.
    gpus=1 if torch.cuda.is_available() else 0,
    gradient_clip_val=0.1,
    limit_train_batches=30,  # 30 batches per epoch
    callbacks=[lr_logger, early_stop_callback],
    logger=TensorBoardLogger("lightning_logs")
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [29]:
# Build the Temporal Fusion Transformer. Most of the architecture is inferred from
# the training dataset, so only a few hyperparameters are set here.
tft = TemporalFusionTransformer.from_dataset(
    training,
    # optimizer parameters
    learning_rate=0.03,
    reduce_on_plateau_patience=4,
    # architecture hyperparameters
    hidden_size=32,
    hidden_continuous_size=16,
    attention_head_size=1,
    dropout=0.1,
    # loss metric to optimize
    loss=QuantileLoss(),
    # logging frequency
    log_interval=2,
)

n_parameters_k = tft.size() / 1e3
print(f"Number of parameters in network: {n_parameters_k:.1f}k")

# Optional (disabled): search for a good learning rate before training.
# res = trainer.tuner.lr_find(
#     tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, early_stop_threshold=1000.0, max_lr=0.3,
# )

Number of parameters in network: 64.0k


In [30]:
# # and plot the result - always visually confirm that the suggested learning rate makes sense
# print(f"suggested learning rate: {res.suggestion()}")
# fig = res.plot(show=True, suggest=True)
# fig.show()

In [31]:
# Train the network. If a different learning rate is wanted, re-create the model
# with it before running this cell.
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 1.4 K 
3  | prescalers                         | ModuleDict                      | 96    
4  | static_variable_selection          | VariableSelectionNetwork        | 2.4 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 3.8 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 1.8 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | static_

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [32]:
# Evaluate the in-memory model (its state after the last training step, which is not
# necessarily the best checkpoint) on the validation dataloader.
# Trainer.validate takes the loader as `dataloaders`; the older `val_dataloaders`
# keyword was deprecated and later removed.
trainer.validate(tft, dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_MAE': 1.6829328330913995e-07,
 'val_MAPE': 2.2160266155424324e-07,
 'val_RMSE': 5.422189133241773e-07,
 'val_SMAPE': 2.2160266155424324e-07,
 'val_loss': 7.177621341725171e-08}
--------------------------------------------------------------------------------


[{'val_loss': 7.177621341725171e-08,
  'val_SMAPE': 2.2160266155424324e-07,
  'val_MAE': 1.6829328330913995e-07,
  'val_RMSE': 5.422189133241773e-07,
  'val_MAPE': 2.2160266155424324e-07}]

In [33]:
# Restore the checkpoint with the best validation loss. With early stopping enabled
# this is not necessarily the model from the last epoch.
checkpoint_callback = trainer.checkpoint_callback
best_model_path = checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [34]:
from pytorch_forecasting import Baseline

# Collect the targets of every validation batch.
# Assumes each batch is (x, y) with the target tensor at y[0] - TODO confirm.
actuals = torch.cat([y[0] for _, y in val_dataloader])

# Mean absolute error of the best checkpoint, as a plain Python float so it is
# reported the same way as the baseline figure below.
predictions = best_tft.predict(val_dataloader)
our_prediction_mae = (actuals - predictions).abs().mean().item()
print(f'our_prediction_mae={our_prediction_mae}')

# Reference point: the Baseline model from pytorch_forecasting on the same data.
# NOTE(review): a baseline error of exactly 0.0 would mean the target does not change
# over the prediction window, making this comparison uninformative - if that shows
# up, check the data preparation step.
baseline_predictions = Baseline().predict(val_dataloader)
baseline_prediction_mae = (actuals - baseline_predictions).abs().mean().item()
print(f'baseline_prediction_mae={baseline_prediction_mae}')

our_prediction_mae=1.6422191606579872e-07
baseline_prediction_mae=0.0


In [37]:
# Number of batches in the training and validation dataloaders.
(len(train_dataloader), len(val_dataloader))

(69833, 81)