In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Try TemporalFusionTransformer
* Read: https://towardsdatascience.com/temporal-fusion-transformer-a-primer-on-deep-forecasting-in-python-4eb37f3f3594

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys

import pandas as pd
import numpy as np

import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss, SMAPE, MAE, RMSE, MAPE
from pytorch_forecasting.data import GroupNormalizer
from pytorch_lightning.loggers import WandbLogger


# Project root. Defaults to the original workstation path; set the
# CMU11785_PROJECT_DIR environment variable to run this notebook on another
# machine without editing the cell.
DIR_PROJECT = os.environ.get(
    'CMU11785_PROJECT_DIR',
    '/media/user/12TB1/HanLi/GitHub/CMU11785-project/',
)

# Make the project's `src` and `utils` folders importable. The membership
# check keeps the cell idempotent: re-running it no longer appends duplicate
# entries to sys.path.
for _subdir in ('src', 'utils'):
    _import_dir = os.path.join(DIR_PROJECT, _subdir)
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

# Project-local module; expected to live under one of the folders added above,
# so this import must stay below the sys.path setup.
from criterions import Pearson

# Locations derived from the project root.
DIR_DATA = os.path.join(DIR_PROJECT, 'local_data')                 # serialized datasets
DIR_LOGS = os.path.join(DIR_PROJECT, 'logs')                       # per-trial checkpoints
DIR_TRAINED = os.path.join(DIR_PROJECT, 'local_trained/20220428')  # saved state dicts
NUM_WORKERS = 16 # Use 4 for AWS

# Experiment configuration shared by the cells below. The mapping is bound to
# `args` and aliased as `ARGS`; both names refer to the same dict object.
args = {
    # ==============================
    # Basic config
    'random_seed': 11785,
    'n_samples': 1000,
    'batch_size': 64,
    'n_workers': NUM_WORKERS,
    # Candidate loss functions, looked up by name.
    'criterion': {
        'quantile': QuantileLoss(),
        'pearson': Pearson.Pearson(),   # Miao's implementation
        'other': None,                  # TODO: check out other loss (e.g., MSE)
    },

    # ==============================
    # Hyperparameters
    'lr_s': 2e-1,
    'hidden_size': 256,
    'attention_head_size': 1,        # use multihead for large hidden size
    'dropout': 0.1,
    'hidden_continuous_size': 8,     # set to <= hidden_size
    'output_size': 7,                # 7 quantiles for QuantileLoss by default
    'reduce_on_plateau_patience': 4, # reduce learning rate if no improvement in validation loss after x epochs
    'gradient_clip_val': 0.1,

    # ==============================
    # Logging
    'logging_metrics': [QuantileLoss(), SMAPE(), MAE(), RMSE(), MAPE()],
    # Alternative without the quantile metric:
    # 'logging_metrics': [SMAPE(), MAE(), RMSE(), MAPE()],
    'log_interval': 5,               # log every n batches, set to None when try to find best lr
    'wandb_entity': '11785_project',
    'wandb_project': '11785_pf_test',
    'wandb_name': 'test_run_1',
}
ARGS = args

# Kept as the cell's final expression so the applied seed is echoed as output.
seed_everything(args['random_seed'], workers=True)

Global seed set to 11785


11785

### Create dataset and dataloaders

In [3]:
# Locate the serialized TimeSeriesDataSet files for the configured sample count.
dir_pf_dataset = os.path.join(DIR_DATA, 'pf_dataset_tft')
n = args['n_samples']

# Load the three splits in the order train -> val -> test.
train_dataset, val_dataset, test_dataset = [
    TimeSeriesDataSet.load(os.path.join(dir_pf_dataset, file_name))
    for file_name in (
        f'pf_train_{n}_samples.pf',
        f'pf_val_{n}_samples.pf',
        f'pf_test_{n}_samples.pf',
    )
]

# Every dataloader shares the same batch size and worker count; only the
# `train` flag differs between the training split and the evaluation splits.
loader_options = {'batch_size': args['batch_size'], 'num_workers': args['n_workers']}
train_dataloader = train_dataset.to_dataloader(train=True, **loader_options)
val_dataloader = val_dataset.to_dataloader(train=False, **loader_options)
test_dataloader = test_dataset.to_dataloader(train=False, **loader_options)

print("Load existing dataset completed.")

Load existing dataset completed.


### Tune hyperparameters with Optuna

In [2]:
import optuna
import copy
import wandb

def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: train one TemporalFusionTransformer configuration.

    Samples the training-set size and the model hyperparameters from ``trial``,
    trains with early stopping, checkpointing and Weights & Biases logging, and
    saves the final model weights under ``DIR_TRAINED``.

    Args:
        trial: Optuna trial used to sample hyperparameters; ``trial.number``
            is used to name the checkpoint folder, the W&B run and the saved
            state-dict file.

    Returns:
        The ``val_loss`` entry of ``trainer.callback_metrics`` after fitting.
        NOTE(review): this is whatever value is present when fitting ends, not
        necessarily the best epoch; ``checkpoint_callback.best_model_score``
        may be the better optimization target -- confirm before switching.
    """
    # Per-trial copy of the shared config: everything below reads from and
    # writes to this copy so no state leaks between trials through ARGS/args.
    trial_args = copy.deepcopy(ARGS)

    # load data, create validation and training dataset
    dir_pf_dataset = os.path.join(DIR_DATA, 'pf_dataset_tft')
    # Treat amount of training samples as a hyperparameter (500 or 1000).
    n = trial.suggest_int('n_training_samples', 500, 1000, step=500)

    train_dataset = TimeSeriesDataSet.load(os.path.join(dir_pf_dataset, f'pf_train_{n}_samples.pf'))
    val_dataset = TimeSeriesDataSet.load(os.path.join(dir_pf_dataset, f'pf_val_500_samples.pf')) # NOTE: Use 500 samples for validation and testing
    # create dataloaders for model
    train_dataloader = train_dataset.to_dataloader(train=True, batch_size=trial_args['batch_size'], num_workers=trial_args['n_workers'])
    val_dataloader = val_dataset.to_dataloader(train=False, batch_size=trial_args['batch_size'], num_workers=trial_args['n_workers'])

    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`;
    # `step` is passed by keyword because positional use is deprecated.
    lr = trial.suggest_float('init_lr', 1e-4, 1e-1, log=True)
    lstm_layers = trial.suggest_int('lstm_layers', 1, 3, step=1)
    hidden_size = trial.suggest_int('hidden_size', 128, 768, step=128)
    dropout = trial.suggest_float('dropout', 0.0, 0.4, step=0.1)
    attention_head_size = trial.suggest_int('attention_head_size', 1, 4, step=1)
    hidden_continuous_size = trial.suggest_int('hidden_continuous_size', 4, 16, step=4)

    # Record the sampled values in the per-trial config so the logged
    # hyperparameters describe this trial instead of the ARGS defaults.
    trial_args.update(trial.params)

    print(f'Trial {trial.number}'.center(50, "_"))
    print("==>> n: ", n)
    print("==>> lr: ", lr)
    print("==>> lstm_layers: ", lstm_layers)
    print("==>> hidden_size: ", hidden_size)
    print("==>> dropout: ", dropout)
    print("==>> attention_head_size: ", attention_head_size)
    print("==>> hidden_continuous_size: ", hidden_continuous_size)

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
    lr_logger = LearningRateMonitor()  # log the learning rate

    # For saving model
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss', # val_SMAPE
        dirpath=os.path.join(DIR_LOGS, f'trial_{trial.number}_ckpt'),
        save_top_k=2,
        filename=f'trial_{trial.number}_'+'{epoch:02d}-{val_loss:.2f}-{val_RMSE:.2f}'
    )

    logger = WandbLogger(
        log_model=True,
        entity="11785_project",
        project="11785_project_tuning_427",
        name=f'TFT_quantile_loss_tune_{trial.number}',
        reinit=True
    )

    trainer = pl.Trainer(
        max_epochs=30,
        gpus=1,
        weights_summary="top",
        gradient_clip_val=0.1,
        limit_train_batches=0.1,  # use 10% of batches for training for fast tuning
        # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
        callbacks=[lr_logger, early_stop_callback, checkpoint_callback],
        logger=logger,
    )
    trainer.logger.log_hyperparams(trial_args)

    tft_model = TemporalFusionTransformer.from_dataset(
        train_dataset,
        learning_rate=lr,
        lstm_layers=lstm_layers,
        hidden_size=hidden_size,  # most important hyperparameter apart from learning rate
        attention_head_size=attention_head_size, # number of attention heads. Set to up to 4 for large datasets
        dropout=dropout,  # between 0.1 and 0.3 are good values
        hidden_continuous_size=hidden_continuous_size,
        output_size=trial_args['output_size'],
        # Use the per-trial copy of the loss so trials do not share one metric object.
        loss=trial_args['criterion']['quantile'],
        # loss=trial_args['criterion']['pearson'],
        log_interval=trial_args['log_interval'],  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
        reduce_on_plateau_patience=trial_args['reduce_on_plateau_patience'], # reduce learning rate if no improvement in validation loss after x epochs
    )

    # Assume failure until the trial has fully completed, so a crashed trial
    # is not reported to W&B as a clean run.
    exit_code = 1
    try:
        # fit network
        trainer.fit(tft_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

        # Create the output folder up front so a missing directory cannot
        # throw away a finished training run.
        os.makedirs(DIR_TRAINED, exist_ok=True)
        torch.save(tft_model.state_dict(), os.path.join(DIR_TRAINED, f'tft_tuning_{trial.number}.pth'))

        val_loss = trainer.callback_metrics["val_loss"].item()
        exit_code = 0
        return val_loss
    finally:
        # Always release cached GPU memory and close the W&B run, even when
        # training or saving raised, so the next trial starts clean.
        torch.cuda.empty_cache()
        wandb.finish(exit_code=exit_code)

In [None]:
if __name__ == "__main__":
    # Run the hyperparameter search sequentially (one trial at a time),
    # minimizing the value returned by `objective`.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=60, n_jobs=1)

    print(f"Number of finished trials: {len(study.trials)}")

    # Summarize the best trial found by the study.
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

### Test model on the test dataset

In [None]:
# TODO: evaluate on the test split. Left disabled: in the cells shown here,
# `trainer` and `tft_model` exist only as locals inside `objective`, so this
# line would fail at notebook level until a trained model is loaded.
# test_results = trainer.test(tft_model, dataloaders=test_dataloader)


# NOTE(review): scratch note; looks like a sliding-window illustration
# (input steps | target steps) -- confirm or remove.
# 1 2 3 4 5 | 6 7 8
# 2 3 4 5 6 | 7 8 9

# NOTE(review): scratch arithmetic; presumably 3 loss functions (listed below)
# times 7 of something not shown here -- confirm or remove.
3 * 7
# MSE, Quantile, PearsonR

