In [1]:
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
# import dataset, network to train and metric to optimize
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, RecurrentNetwork, QuantileLoss
from pytorch_forecasting.data.encoders import NaNLabelEncoder

In [2]:
!rm almost_three_months.csv
!unzip almost3_50plus.zip

Archive:  almost3_50plus.zip
  inflating: almost_three_months.csv  


In [3]:
raw_data = pd.read_csv('almost_three_months.csv')

In [4]:
raw_data

Unnamed: 0,Region,instanceType,major,minor,OS,Price,date
0,eu-west-1c,t3.2xlarge,t3,2xlarge,Red Hat Enterprise Linux,0.2688,2021-12-26 16:40:00+00:00
1,eu-west-1c,t3.2xlarge,t3,2xlarge,Red Hat Enterprise Linux,0.2691,2021-12-26 11:20:00+00:00
2,eu-west-1c,t3.2xlarge,t3,2xlarge,Red Hat Enterprise Linux,0.2694,2021-12-26 04:30:00+00:00
3,eu-west-1c,t3.2xlarge,t3,2xlarge,Red Hat Enterprise Linux,0.2697,2021-12-25 22:30:00+00:00
4,eu-west-1c,t3.2xlarge,t3,2xlarge,Red Hat Enterprise Linux,0.2700,2021-12-25 16:20:00+00:00
...,...,...,...,...,...,...,...
2658534,ap-southeast-2a,r5d.xlarge,r5d,xlarge,Linux/UNIX,0.0775,2021-10-28 10:20:00+00:00
2658535,ap-southeast-2a,r5d.xlarge,r5d,xlarge,Linux/UNIX,0.0758,2021-12-23 21:20:00+00:00
2658536,ap-southeast-2a,r5d.xlarge,r5d,xlarge,Linux/UNIX,0.0776,2021-11-20 18:20:00+00:00
2658537,ap-southeast-2a,r5d.xlarge,r5d,xlarge,Linux/UNIX,0.0763,2021-11-09 19:20:00+00:00


In [6]:
data = raw_data.astype({
    'Region': 'category',
    'instanceType': 'category',
    'major': 'category',
    'minor': 'category',
    # 'Type': 'category',
    'OS': 'category',
    'date': 'datetime64'
})

# data = data[data.Region == 'ap-northeast-1a']
# data = data[data.instanceType == 'r6g.2xlarge']
# data = data[data.OS == 'Red Hat Enterprise Linux']

# dates = {v: k for k, v in enumerate(data['date'].drop_duplicates().sort_values())}
sorted_dates = data.date.sort_values()
min_date = sorted_dates.min()
max_date = sorted_dates.max()
dates = {}
current_date = min_date
i = 0
while current_date <= max_date:
    dates[current_date] = i
    i += 1
    current_date += pd.Timedelta(minutes=10)

data['time_idx'] = data['date'].apply(lambda x: dates[x])
# data['Price'] = pd.to_numeric(raw_data['Price'], errors='coerce').fillna(-1)
data['Price'] = pd.to_numeric(raw_data['Price'], errors='coerce')
data = data.sort_values(['time_idx', 'Region', 'OS', 'instanceType'])

In [7]:
data[data['date'] <= '2021-12-03']

Unnamed: 0,Region,instanceType,major,minor,OS,Price,date,time_idx
2043775,ca-central-1a,r5.xlarge,r5,xlarge,Linux/UNIX,0.0636,2021-10-26 17:20:00,0
2043381,ca-central-1a,r5.xlarge,r5,xlarge,Red Hat Enterprise Linux,0.1236,2021-10-26 17:20:00,0
2043578,ca-central-1a,r5.xlarge,r5,xlarge,SUSE Linux,0.1886,2021-10-26 17:20:00,0
730001,eu-north-1a,g4dn.metal,g4dn,metal,Linux/UNIX,2.4900,2021-10-26 17:20:00,0
729733,eu-north-1a,g4dn.metal,g4dn,metal,Red Hat Enterprise Linux,2.6200,2021-10-26 17:20:00,0
...,...,...,...,...,...,...,...,...
1873594,us-west-2d,r6g.4xlarge,r6g,4xlarge,Red Hat Enterprise Linux,0.5376,2021-12-03 00:00:00,5368
1887942,us-west-2d,c5.4xlarge,c5,4xlarge,SUSE Linux,0.3606,2021-12-03 00:00:00,5368
1870297,us-west-2d,m5.12xlarge,m5,12xlarge,SUSE Linux,1.1365,2021-12-03 00:00:00,5368
1885071,us-west-2d,r5d.4xlarge,r5d,4xlarge,SUSE Linux,1.0176,2021-12-03 00:00:00,5368


In [8]:
# define the dataset, i.e. add metadata to pandas dataframe for the model to understand it
max_encoder_length = 300
max_prediction_length = 150
training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    # data[data['date'] <= training_cutoff],#data[lambda x: x.date <= training_cutoff],
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx='time_idx',  # column name of time of observation
    target='Price',  # column name of target to predict
    group_ids=['Region', 'OS', 'instanceType'],  # column name(s) for timeseries IDs
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,  # how much history to use
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,  # how far to predict into future
    # covariates static for a timeseries ID
    static_categoricals=['major', 'minor'],
    # static_reals=[ ... ],
    # covariates known and unknown in the future to inform prediction
    time_varying_known_categoricals=[],
    time_varying_unknown_categoricals=[],
    time_varying_known_reals=['time_idx'],
    time_varying_unknown_reals=['Price'],
    # target_normalizer=NaNLabelEncoder(add_nan=True),
    allow_missing_timesteps=True
)

In [9]:
# create validation dataset using the same normalization techniques as for the training dataset
#validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training.index.time.max() + 1, stop_randomization=True)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)

# convert datasets to dataloaders for training
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)



In [10]:
len(training), len(validation)

(3045212, 15085)

In [11]:
import warnings
warnings.filterwarnings("ignore")
# create PyTorch Lighning Trainer with early stopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1,  # run on CPU, if on multiple GPUs, use accelerator="ddp"
    gradient_clip_val=0.1,
    limit_train_batches=30,  # 30 batches per epoch
    callbacks=[lr_logger, early_stop_callback],
    logger=TensorBoardLogger("lightning_logs")
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [12]:
# define network to train - the architecture is mostly inferred from the dataset, so that only a few hyperparameters have to be set by the user
tft = TemporalFusionTransformer.from_dataset(
    # dataset
    training,
    # architecture hyperparameters
    hidden_size=32,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=16,
    # loss metric to optimize
    loss=QuantileLoss(),
    # logging frequency
    log_interval=2,
    # optimizer parameters
    learning_rate=0.03,
    reduce_on_plateau_patience=4
)

print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# # find the optimal learning rate
# res = trainer.tuner.lr_find(
#     tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, early_stop_threshold=1000.0, max_lr=0.3,
# )

Number of parameters in network: 63.7k


In [13]:
# # and plot the result - always visually confirm that the suggested learning rate makes sense
# print(f"suggested learning rate: {res.suggestion()}")
# fig = res.plot(show=True, suggest=True)
# fig.show()

In [14]:
# fit the model on the data - redefine the model with the correct learning rate if necessary
trainer.fit(
    tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 1.3 K 
3  | prescalers                         | ModuleDict                      | 96    
4  | static_variable_selection          | VariableSelectionNetwork        | 2.2 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 3.8 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 1.8 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | static_

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [15]:
trainer.validate(tft, val_dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_MAE': 0.01922227442264557,
 'val_MAPE': 0.015747494995594025,
 'val_RMSE': 0.07389326393604279,
 'val_SMAPE': 0.01550079695880413,
 'val_loss': 0.004534061532467604}
--------------------------------------------------------------------------------


[{'val_loss': 0.004534061532467604,
  'val_SMAPE': 0.01550079695880413,
  'val_MAE': 0.01922227442264557,
  'val_RMSE': 0.07389326393604279,
  'val_MAPE': 0.015747494995594025}]

In [16]:
# load the best model according to the validation loss
# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [17]:
from pytorch_forecasting import Baseline

actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = best_tft.predict(val_dataloader)
our_prediction_mae = (actuals - predictions).abs().mean()
print(f'our_prediction_mae={our_prediction_mae}')

baseline_predictions = Baseline().predict(val_dataloader)
baseline_prediction_mae = (actuals - baseline_predictions).abs().mean().item()
print(f'baseline_prediction_mae={baseline_prediction_mae}')

our_prediction_mae=0.01922227069735527
baseline_prediction_mae=0.00724204583093524


In [18]:
len(train_dataloader), len(val_dataloader)

(23790, 118)