In [1]:
if 'google.colab' in str(get_ipython()):
    !pip install pytorch_lightning
    !pip install neuralforecast

In [2]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

import torch
from torch.utils.data import DataLoader

import pytorch_lightning as pl

import neuralforecast as nf
from neuralforecast.data.datasets.epf import EPF
from pytorch_lightning.callbacks import EarlyStopping
from neuralforecast.data.tsloader import TimeSeriesLoader
from neuralforecast.experiments.utils import create_datasets
from neuralforecast.data.tsdataset import IterateWindowsDataset

In [3]:
warnings.filterwarnings("ignore") # Comment out 

In [4]:
# Fractions used to size the validation and test splits.
VAL_PERC = 0.1
TEST_PERC = 0.1

# Number of time series to work with (the first N are kept);
# `first_n` is the same value under its second name.
N_TIME_SERIES = 1000
first_n = N_TIME_SERIES

In [5]:
# Model hyperparameters; the whole dict is unpacked as keyword arguments when
# the Autoformer model is constructed.
mc_model = {
    'seq_len': 96,  # Input sequence size.
    'label_len': 48,  # Label sequence size (input buffer for the decoder); half of seq_len.
    'pred_len': 96,  # Prediction sequence size.
    'output_attention': False,  # If true, use output attention for the Transformer model.
    # enc_in / dec_in / c_out are all tied to the number of series kept.
    # NOTE(review): presumably the encoder-input, decoder-input and output
    # widths of the network - confirm against the Autoformer documentation.
    'enc_in': N_TIME_SERIES,
    'dec_in': N_TIME_SERIES,
    'd_model': 512,  # Number of nodes for the embedding layers.
    'c_out': N_TIME_SERIES,
    'embed': 'timeF',  # Type of embedding layers.
    'freq': 'h',  # Frequency for the embedding layers.
    'dropout': 0.05,  # Dropout for the Transformer, a float in (0, 1).
    'factor': 1,  # Factor for the attention layer.
    'n_heads': 8,  # Number of heads in the attention layer.
    'd_ff': 2048,  # Size used inside the encoder layers (presumably the feed-forward width - confirm).
    'moving_avg': 25,  # Moving-average setting for the encoder and decoder layers.
    'activation': 'gelu',  # Activation function for the encoder layer.
    'e_layers': 2,  # Number of encoder layers.
    'd_layers': 1,  # Number of decoder layers.
    # Training loss, one of ['MAPE', 'MASE', 'SMAPE', 'MSE', 'MAE', 'QUANTILE', 'QUANTILE2'].
    'loss_train': 'MAE',
    'loss_hypar': 0.5,  # Hyperparameter for the chosen loss.
    # Validation loss, one of ['MAPE', 'MASE', 'SMAPE', 'RMSE', 'MAE', 'QUANTILE'].
    'loss_valid': 'MAE',
    'learning_rate': 0.001,  # Learning rate in (0, 1).
    'lr_decay': 0.5,  # Multiplier applied at each learning-rate decay.
    'weight_decay': 0.0,  # L2 penalty for the optimizer.
    'lr_decay_step_size': 2,  # Steps between learning-rate decays.
    'random_seed': 1,  # Seed for the pytorch initializer and the numpy random generator.
}

# Dataset / training-loop parameters. The window lengths are taken from
# mc_model so data windows and model always agree.
mc_data = {
    'mode': 'iterate_windows',
    'n_time_in': mc_model['seq_len'],  # Input sequence length.
    'n_time_out': mc_model['pred_len'],  # Prediction sequence length.
    'batch_size': 1,  # Batch size.
    'normalizer_y': None,
    'normalizer_x': None,
    'max_epochs': 1,  # Maximum number of training epochs.
    'max_steps': None,  # Maximum number of training steps.
    'early_stop_patience': 20,  # Consecutive early-stopping violations tolerated before training ends.
}


In [6]:
# Load the frame stored in m5_aggregate_df.pkl.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle("m5_aggregate_df.pkl")

# Keep the first N_TIME_SERIES ids, in order of first appearance. The count is
# taken from the configuration cell instead of being re-declared here, so the
# data cannot drift from the enc_in / dec_in / c_out sizes given to the model.
kept_ids = df['id'].unique()[:N_TIME_SERIES]
df = df.loc[df['id'].isin(kept_ids)]
df.head()


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,cat_sold_avg,dept_sold_avg,cat_dept_sold_avg,store_item_sold_avg,cat_item_sold_avg,dept_item_sold_avg,state_store_sold_avg,state_store_cat_sold_avg,store_cat_dept_sold_avg,rolling_sold_mean
1067150,14370,1437,3,1,0,0,36,0,2011-03-05,11106,...,0.561035,0.695801,0.695801,0.321533,0.216553,0.216553,1.304688,0.801758,1.020508,1.142578
1067151,14380,1438,3,1,0,0,36,0,2011-03-05,11106,...,0.561035,0.695801,0.695801,0.253906,0.259766,0.259766,1.304688,0.801758,1.020508,1.142578
1067152,14390,1439,3,1,0,0,36,0,2011-03-05,11106,...,0.561035,0.695801,0.695801,0.156982,0.07666,0.07666,1.304688,0.801758,1.020508,0.428467
1067153,14400,1440,3,1,0,0,36,0,2011-03-05,11106,...,0.561035,0.695801,0.695801,1.694336,2.011719,2.011719,1.304688,0.801758,1.020508,0.285645
1067154,14410,1441,3,1,0,0,36,0,2011-03-05,11106,...,0.561035,0.695801,0.695801,0.958984,0.755371,0.755371,1.304688,0.801758,1.020508,0.0


In [7]:
# Display the column labels of df.
df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sold', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'sold_lag_1',
       'sold_lag_2', 'sold_lag_3', 'sold_lag_6', 'sold_lag_12', 'sold_lag_24',
       'sold_lag_36', 'iteam_sold_avg', 'state_sold_avg', 'store_sold_avg',
       'cat_sold_avg', 'dept_sold_avg', 'cat_dept_sold_avg',
       'store_item_sold_avg', 'cat_item_sold_avg', 'dept_item_sold_avg',
       'state_store_sold_avg', 'state_store_cat_sold_avg',
       'store_cat_dept_sold_avg', 'rolling_sold_mean'],
      dtype='object')

In [8]:
# Static variables: none are used.
S_df = None

# Training data: series id, date and the target, renamed to unique_id / ds / y.
Y_df = df[['id', 'date', 'sold']].rename(
    columns={'id': 'unique_id', 'date': 'ds', 'sold': 'y'}
)

# Exogenous training data: four lagged-sales columns, renamed ex_1 .. ex_4.
X_df = df[['id', 'date', 'sold_lag_2', 'sold_lag_3', 'sold_lag_6', 'sold_lag_12']].rename(
    columns={
        'id': 'unique_id',
        'date': 'ds',
        'sold_lag_2': 'ex_1',
        'sold_lag_3': 'ex_2',
        'sold_lag_6': 'ex_3',
        'sold_lag_12': 'ex_4',
    }
)

# Names of the exogenous feature columns (everything except the two key columns).
f_cols = [name for name in X_df.columns if name not in ('unique_id', 'ds')]

In [9]:
# Size the validation and test splits as fractions of the number of distinct
# time stamps; int() truncates towards zero.
n_ds = Y_df["ds"].nunique()
n_val = int(VAL_PERC * n_ds)
n_test = int(TEST_PERC * n_ds)

# Build the train / validation / test datasets. S_df is passed by name rather
# than as a literal None, so a static frame assigned to S_df is not silently
# ignored here.
train_dataset, val_dataset, test_dataset, scaler_y = create_datasets(
    mc=mc_data,
    S_df=S_df,
    Y_df=Y_df,
    X_df=X_df,
    f_cols=f_cols,
    ds_in_val=n_val,
    ds_in_test=n_test,
)



INFO:root:Train Validation splits

INFO:root:                    ds           
                   min        max
sample_mask                      
0           2015-05-31 2016-06-19
1           2011-03-05 2015-05-30
INFO:root:
Total data 			1934000 time stamps 
Available percentage=100.0, 	1934000 time stamps 
Insample  percentage=80.04, 	1548000 time stamps 
Outsample percentage=19.96, 	386000 time stamps 

INFO:root:Train Validation splits

INFO:root:                    ds           
                   min        max
sample_mask                      
0           2011-03-05 2016-06-19
1           2015-05-31 2015-12-09
INFO:root:
Total data 			1934000 time stamps 
Available percentage=100.0, 	1934000 time stamps 
Insample  percentage=9.98, 	193000 time stamps 
Outsample percentage=90.02, 	1741000 time stamps 

INFO:root:Train Validation splits

INFO:root:                    ds           
                   min        max
sample_mask                      
0           2011-03-05 2015-12-0

In [10]:
# All three loaders use the batch size configured in mc_data.
loader_batch_size = int(mc_data['batch_size'])

# Training loader: shuffled, and a trailing incomplete batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=loader_batch_size,
    shuffle=True,
    drop_last=True,
)

# Validation and test loaders: no shuffling.
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

In [11]:
# Build the Autoformer model, passing every entry of mc_model as a keyword argument.
model = nf.models.transformer.autoformer.Autoformer(**mc_model)


In [12]:
# Early stopping on 'val_loss' (lower is better): an improvement must be at
# least 1e-4 to count, and the patience comes from mc_data.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode="min",
    min_delta=1e-4,
    patience=mc_data['early_stop_patience'],
    verbose=False,
)

# Trainer: the epoch / step budget is read from mc_data; validation runs every
# epoch and gradients are clipped at 1.0.
trainer = pl.Trainer(
    callbacks=[early_stopping],
    max_epochs=mc_data['max_epochs'],
    max_steps=mc_data['max_steps'],
    check_val_every_n_epoch=1,
    gradient_clip_val=1.0,
    log_every_n_steps=500,
    progress_bar_refresh_rate=10,
)

# Fit on the training loader, validating on the validation loader.
trainer.fit(model, train_loader, val_loader)


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Set SLURM handle signals.

  | Name  | Type        | Params
--------------------------------------
0 | model | _Autoformer | 15.6 M
--------------------------------------
15.6 M    Trainable params
0         Non-trainable params
15.6 M    Total params
62.484    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]