In [1]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
# import dataset, network to train and metric to optimize
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer, RecurrentNetwork, QuantileLoss
from pytorch_forecasting.data.encoders import NaNLabelEncoder

In [2]:
from pytorch_forecasting.data.examples import get_stallion_data

In [19]:
# Load the observations as a long-format pandas DataFrame. Downstream cells need
# at least:
# * a target column (the value to predict),
# * columns identifying each timeseries,
# * an observation time that can be turned into a monotonically increasing integer.
#
# TRANSFORM_DATA = True rebuilds the long table from the wide 'spotData.csv'
# and caches it as 'data.csv'; False just reloads that cache.
TRANSFORM_DATA = True

if not TRANSFORM_DATA:
    raw_data = pd.read_csv('data.csv')
else:
    raw_data = pd.read_csv('spotData.csv')
    # Columns whose name starts with '2021' are the per-timestamp columns;
    # everything else is carried along as an identifier column.
    date_columns = [col for col in raw_data.columns if col.startswith('2021')]
    not_date_columns = [col for col in raw_data.columns if not col.startswith('2021')]
    # Wide -> long: one row per (original row, timestamp column). Only the
    # timestamp label survives as 'date'.
    # NOTE(review): the melted cell values ('blabla') are dropped right away -
    # confirm the timestamp columns really carry nothing that is needed later.
    raw_data = (
        raw_data
        .melt(id_vars=not_date_columns, value_vars=date_columns, var_name='date', value_name='blabla')
        .drop(columns=['blabla'])
    )
    raw_data.to_csv('data.csv', index=False)

In [36]:
# Keep an untouched copy of the wide table in `d`.
d = pd.read_csv('spotData.csv')
# Display the long table without the rows whose Price is the "N/A*" placeholder.
# The result is only shown, not assigned, so raw_data itself is left unchanged.
raw_data.drop(index=raw_data.loc[raw_data["Price"] == "N/A*"].index)

Unnamed: 0,Region,instanceType,major,minor,Type,OS,Price,date
0,us-east,a1.medium,a1,medium,generalCurrentGen,linux,0.0084,2021-11-15 16:40:32.509429
2,us-east,a1.large,a1,large,generalCurrentGen,linux,0.0217,2021-11-15 16:40:32.509429
4,us-east,a1.xlarge,a1,xlarge,generalCurrentGen,linux,0.0341,2021-11-15 16:40:32.509429
6,us-east,a1.2xlarge,a1,2xlarge,generalCurrentGen,linux,0.0671,2021-11-15 16:40:32.509429
8,us-east,a1.4xlarge,a1,4xlarge,generalCurrentGen,linux,0.1343,2021-11-15 16:40:32.509429
...,...,...,...,...,...,...,...,...
16059069,eu-south-1,m5d.metal,m5d,metal,hiMemCurrentGen,mswin,5.9431,2021-12-05 16:46:39.137754
16059070,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,linux,1.5998,2021-12-05 16:46:39.137754
16059071,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,mswin,6.0158,2021-12-05 16:46:39.137754
16059072,eu-south-1,r5d.metal,r5d,metal,hiMemCurrentGen,linux,1.5998,2021-12-05 16:46:39.137754


In [17]:
# Per-column summary of the long-format frame (the saved output lists
# count / unique / top / freq for each column).
raw_data.describe()

Unnamed: 0,Region,instanceType,major,minor,Type,OS,Price,date
count,16059120,16059120,16059120,16059120,16059120,16059120,16059120,16059120
unique,22,420,68,20,9,2,4143,869
top,us-east,a1.medium,c6gd,xlarge,generalCurrentGen,linux,N/A*,2021-11-15 16:40:32.509429
freq,729960,38236,344124,2179452,4855972,8029560,7109289,18480


In [21]:
# Sanity check on the wide -> long reshape: the long table must hold exactly one
# row per (wide row, timestamp column) pair, so this expression should be 0.
# The number of timestamp columns is counted from the header (names starting
# with '2021', the same rule the reshape uses) instead of hard-coding the
# frame width, so the check stays valid when the CSV gains or loses columns.
n_date_columns = sum(name.startswith('2021') for name in d.columns)
len(d) * n_date_columns - len(raw_data)

0

In [37]:
# Cast the identifier columns to categoricals and parse the timestamps.
# The datetime dtype is spelled with an explicit unit because pandas >= 2.0
# refuses the unit-less 'datetime64' alias in astype().
data = raw_data.astype({
    'Region': 'category',
    'instanceType': 'category',
    'major': 'category',
    'minor': 'category',
    'Type': 'category',
    'OS': 'category',
    'date': 'datetime64[ns]'
})
# Map every distinct timestamp to its rank in chronological order (0 = earliest)
# so the model gets an integer time index.
dates = {v: k for k, v in enumerate(data['date'].drop_duplicates().sort_values())}
# Series.map with a dict does the lookup in bulk, avoiding a Python-level
# lambda call for every row of a very large frame.
data['time_idx'] = data['date'].map(dates)
# Prices that are not numeric (e.g. the "N/A*" placeholder) become NaN here.
data['Price'] = pd.to_numeric(raw_data['Price'], errors='coerce')

In [41]:
# Remove every row that still contains a missing value (this includes prices
# that could not be parsed as numbers), then display the result.
# NOTE(review): an unassigned `data[data['date'] <= '2021-12-01']` expression
# used to precede this line; its result was thrown away, so it never filtered
# anything and has been removed. If a date cut-off is actually wanted, assign
# the filtered frame explicitly.
data = data.dropna()
data

Unnamed: 0,Region,instanceType,major,minor,Type,OS,Price,date,time_idx
0,us-east,a1.medium,a1,medium,generalCurrentGen,linux,0.0084,2021-11-15 16:40:32.509429,0
2,us-east,a1.large,a1,large,generalCurrentGen,linux,0.0217,2021-11-15 16:40:32.509429,0
4,us-east,a1.xlarge,a1,xlarge,generalCurrentGen,linux,0.0341,2021-11-15 16:40:32.509429,0
6,us-east,a1.2xlarge,a1,2xlarge,generalCurrentGen,linux,0.0671,2021-11-15 16:40:32.509429,0
8,us-east,a1.4xlarge,a1,4xlarge,generalCurrentGen,linux,0.1343,2021-11-15 16:40:32.509429,0
...,...,...,...,...,...,...,...,...,...
16059069,eu-south-1,m5d.metal,m5d,metal,hiMemCurrentGen,mswin,5.9431,2021-12-05 16:46:39.137754,868
16059070,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,linux,1.5998,2021-12-05 16:46:39.137754,868
16059071,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,mswin,6.0158,2021-12-05 16:46:39.137754,868
16059072,eu-south-1,r5d.metal,r5d,metal,hiMemCurrentGen,linux,1.5998,2021-12-05 16:46:39.137754,868


In [42]:
# Window sizes, expressed in time_idx steps.
max_prediction_length = 100
max_encoder_length = 768

# The last `max_prediction_length` steps are held out; everything up to and
# including the cutoff is available for training.
training_cutoff = data["time_idx"].max() - max_prediction_length
print(training_cutoff)

# Preview of the rows that fall on or before the cutoff.
dateCutOff = data.loc[data["time_idx"] <= training_cutoff]
dateCutOff

768


Unnamed: 0,Region,instanceType,major,minor,Type,OS,Price,date,time_idx
0,us-east,a1.medium,a1,medium,generalCurrentGen,linux,0.0084,2021-11-15 16:40:32.509429,0
2,us-east,a1.large,a1,large,generalCurrentGen,linux,0.0217,2021-11-15 16:40:32.509429,0
4,us-east,a1.xlarge,a1,xlarge,generalCurrentGen,linux,0.0341,2021-11-15 16:40:32.509429,0
6,us-east,a1.2xlarge,a1,2xlarge,generalCurrentGen,linux,0.0671,2021-11-15 16:40:32.509429,0
8,us-east,a1.4xlarge,a1,4xlarge,generalCurrentGen,linux,0.1343,2021-11-15 16:40:32.509429,0
...,...,...,...,...,...,...,...,...,...
14211069,eu-south-1,m5d.metal,m5d,metal,hiMemCurrentGen,mswin,5.9431,2021-12-03 14:46:38.411744,768
14211070,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,linux,1.5998,2021-12-03 14:46:38.411744,768
14211071,eu-south-1,r5.metal,r5,metal,hiMemCurrentGen,mswin,6.0158,2021-12-03 14:46:38.411744,768
14211072,eu-south-1,r5d.metal,r5d,metal,hiMemCurrentGen,linux,1.5998,2021-12-03 14:46:38.411744,768


In [43]:
# Wrap the DataFrame in a TimeSeriesDataSet, i.e. tell the library which column
# plays which role (time index, target, series identifiers, known/unknown inputs).
from pytorch_forecasting.data import GroupNormalizer

max_prediction_length = 100
max_encoder_length = 768
# Last time step that still belongs to the training portion.
training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    # only rows on or before the cutoff are used for training
    data.loc[data["time_idx"] <= training_cutoff],
    # --- column roles ---
    time_idx="time_idx",  # integer observation time
    target="Price",  # value to forecast
    group_ids=["Region", "instanceType", "Type", "OS"],  # together these identify one series
    static_categoricals=["Region", "instanceType", "Type", "OS"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=["Price"],
    # --- window lengths ---
    max_encoder_length=max_encoder_length,
    # at least half of the maximum history, so encoders stay long
    # (the validation set uses long encoders too)
    min_encoder_length=max_encoder_length // 2,
    max_prediction_length=max_prediction_length,
    min_prediction_length=1,
    # --- target scaling, fitted per series ---
    target_normalizer=GroupNormalizer(
        transformation="softplus",
        groups=["Region", "instanceType", "Type", "OS"],
    ),
    # --- extra derived features ---
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)



In [48]:
# Display the target normalizer held by the training dataset.
training.target_normalizer

GroupNormalizer(groups=['Region', 'instanceType', 'Type', 'OS'],
                transformation='softplus')

In [49]:
# Validation set: reuses the configuration of `training` but is built from the
# full frame. With predict=True it targets the last max_prediction_length
# points in time of each series.
validation = TimeSeriesDataSet.from_dataset(training, data, stop_randomization=True, predict=True)

# Dataloaders that feed the model. Validation uses a 10x larger batch.
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(batch_size=batch_size, train=True, num_workers=0)
val_dataloader = validation.to_dataloader(batch_size=10 * batch_size, train=False, num_workers=0)

In [55]:
# Print the 'reals' entry of the training dataset's internal data dict
# (presumably the preprocessed continuous features - confirm against the library).
print(training.data['reals'])


tensor([[ 0.0000e+00, -2.4435e-01,  8.4372e-23, -1.7298e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00, -2.4435e-01,  8.4372e-23, -1.7253e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00, -2.4435e-01,  8.4372e-23, -1.7208e+00,  0.0000e+00,
          0.0000e+00],
        ...,
        [ 0.0000e+00, -5.3104e-01,  8.4372e-23,  1.7208e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00, -5.3104e-01,  8.4372e-23,  1.7253e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00, -5.3104e-01,  8.4372e-23,  1.7298e+00,  0.0000e+00,
          0.0000e+00]])


In [31]:
# Create the PyTorch Lightning Trainer. Training stops early once val_loss has
# failed to improve by at least min_delta for `patience` validation checks.
lr_logger = LearningRateMonitor()
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=1e-4,
    patience=1,
    verbose=False,
)
# NOTE(review): `gpus=` is deprecated in newer Lightning releases in favour of
# `accelerator=` / `devices=` - confirm against the installed version.
trainer = pl.Trainer(
    logger=TensorBoardLogger("lightning_logs"),
    callbacks=[lr_logger, early_stop_callback],
    max_epochs=100,
    limit_train_batches=30,  # 30 batches per epoch
    gradient_clip_val=0.1,
    gpus=0,  # run on CPU, if on multiple GPUs, use accelerator="ddp"
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [51]:
# Build the Temporal Fusion Transformer. Most of the architecture is derived
# from the dataset, so only a handful of hyperparameters are given explicitly.
tft = TemporalFusionTransformer.from_dataset(
    training,
    # optimisation: loss, starting learning rate, plateau scheduler patience
    loss=QuantileLoss(),
    learning_rate=0.03,
    reduce_on_plateau_patience=4,
    # architecture
    hidden_size=32,
    hidden_continuous_size=16,
    attention_head_size=1,
    dropout=0.1,
    # logging frequency
    log_interval=2,
)

print(f"Number of parameters in network: {tft.size() / 1e3:.1f}k")

# Run the learning-rate finder over the training/validation loaders.
res = trainer.tuner.lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=0.3,
    early_stop_threshold=1000.0,
)

Number of parameters in network: 58.2k


  rank_zero_warn(
  target_scale = torch.tensor([batch[0]["target_scale"] for batch in batches], dtype=torch.float)
  target_scale = torch.tensor([batch[0]["target_scale"] for batch in batches], dtype=torch.float)


KeyboardInterrupt: 

In [None]:
# Report the learning rate suggested by the finder and plot its curve - always
# check visually that the suggestion makes sense before using it.
print(f"suggested learning rate: {res.suggestion()}")
# NOTE(review): show=True is already passed to plot(); the extra fig.show()
# below may be redundant - confirm.
fig = res.plot(show=True, suggest=True)
fig.show()

In [52]:
# Fit the model on the training loader, validating on the validation loader.
# Redefine `tft` with the chosen learning rate first if it differs from the
# one used at construction.
# NOTE(review): the saved run of this cell ended with
# "IndexError: list index out of range"; the cause is not visible in this cell.
trainer.fit(
    tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader,
)

  rank_zero_warn(
  target_scale = torch.tensor([batch[0]["target_scale"] for batch in batches], dtype=torch.float)
  target_scale = torch.tensor([batch[0]["target_scale"] for batch in batches], dtype=torch.float)


IndexError: list index out of range

In [54]:
# Lengths of the training and validation dataloaders
# (presumably the number of batches each yields - confirm).
len(train_dataloader), len(val_dataloader)

(102361, 23100)