# Imports

In [4]:
import pandas as pd
# silence pandas' SettingWithCopyWarning (raised on chained assignment)
pd.options.mode.chained_assignment = None 
import os, gc
import optuna, optuna_dashboard

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from datetime import datetime

from models import *
from utils import *
from splits import *

# Global run settings shared by the cells below.
SEED = 7
# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding only
# TensorFlow (tf.random.set_seed) leaves any NumPy/Python-based randomness
# unseeded. Requires TensorFlow >= 2.7.
tf.keras.utils.set_random_seed(SEED)
# Forwarded as `verbose` to Keras fit/evaluate; dataset progress bars are
# enabled only when this equals 1.
VERBOSE = 0
# Split definition handed to split_data (star-imported; presumably from `splits`).
Split = Baseline

## Result folder

In [5]:
# Folder that receives the tuning artifacts written by later cells
# (model checkpoints and the trials CSV).
output_folder = 'scratch/tuning_LSTM'
# exist_ok=True already makes this a no-op when the folder is present, so a
# separate (race-prone) os.path.exists check is unnecessary.
os.makedirs(output_folder, exist_ok=True)

# Preprocessing

In [6]:
# Load the input table and convert its 'Date' column to datetimes in one chain.
df = pd.read_csv('../TFT-pytorch/2022_May_cleaned/Top_100.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

   FIPS  AgeDist  HealthDisp       Date  DiseaseSpread  Transmission  \
0  2261    0.014         8.8 2020-02-29            0.0           0.0   
1  2261    0.014         8.8 2020-03-01            0.0           0.0   
2  2261    0.014         8.8 2020-03-02            0.0           0.0   

   VaccinationFull  SocialDist  Cases  TimeFromStart  SinWeekly  CosWeekly  
0              0.0         0.5    0.0              0     -0.975     -0.223  
1              0.0         0.5    0.0              1     -0.782      0.623  
2              0.0         0.5    0.0              2      0.000      1.000  


## Config

In [7]:
@dataclass
class Config:
    """Experiment settings, read as class attributes (e.g. ``Config.targets``).

    None of the attributes carries a type annotation, so ``@dataclass``
    registers no fields here; every value is a plain class attribute.
    """

    # ---- column roles ----
    static_features = ['AgeDist', 'HealthDisp']
    past_features = ['DiseaseSpread', 'Transmission', 'VaccinationFull', 'SocialDist']
    known_future = ['SinWeekly', 'CosWeekly']
    # Index feature commonly used by all time-series models.
    time_index = 'TimeFromStart'

    # Model inputs: the time index first, then static, past and known-future columns.
    features = [time_index, *static_features, *past_features, *known_future]
    targets = ['Cases']
    group_id = 'FIPS'
    selected_columns = [*features, *targets]

    # ---- window lengths (in time steps) ----
    input_sequence_length = 13
    output_sequence_length = 15

    # ---- training defaults ----
    # In the cells visible here, create_dataset and create_model sample their
    # own batch size and learning rate per trial rather than reading these two.
    batch_size = 64
    buffer_size = 1000
    epochs = 200
    learning_rate = 1e-6
    early_stopping_patience = 5
    loss = 'mse'

    # ---- hyperparameter search ----
    n_trials = 25

# Module-level shortcuts for the Config values that later cells use directly.
targets, group_id = Config.targets, Config.group_id
input_sequence_length = Config.input_sequence_length
output_sequence_length = Config.output_sequence_length
# Flattened output width: one slot per target for every forecast step.
output_size = output_sequence_length * len(targets)

## Split and scale

In [8]:
# Step 1: cut the table into train / validation / test partitions.
train_split, val_split, test_split = split_data(df, Split, input_sequence_length)
# Step 2: scale the partitions; the returned scalers are kept for later use.
train_df, val_df, test_df, feature_scaler, target_scaler = scale_data(
    train_split, val_split, test_split, Config.features, targets
)

Shapes: train (64000, 12), validation (3000, 12), test (3000, 12).


## Window generator

In [None]:
# Progress bars are shown only when VERBOSE is exactly 1.
hide_progress = VERBOSE != 1

# Build the (inputs, targets) arrays for the training and validation partitions.
x_train, y_train = prepare_dataset(train_df, Config, disable_progress_bar=hide_progress)
x_val, y_val = prepare_dataset(val_df, Config, disable_progress_bar=hide_progress)

# Training

## Model

In [17]:
def create_model(trial):
    """Sample the architecture/optimizer hyperparameters and build an LSTM.

    Args:
        trial: Optuna trial used to draw ``learning_rate``, ``hidden_size``,
            ``dropout`` and ``layers`` (in that order).

    Returns:
        Whatever ``build_LSTM`` returns for the sampled settings; the input
        shape is taken from the module-level ``x_train`` (all axes but the first).
    """
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
        "hidden_size": trial.suggest_int("hidden_size", 32, 128, step=16),
        "dropout": trial.suggest_float("dropout", 0, 0.3, step=0.1),
        "layers": trial.suggest_int("layers", 2, 4, step=1),
    }

    input_shape = x_train.shape[1:]
    return build_LSTM(
        input_shape, output_size=output_size, loss=Config.loss, **sampled
    )

def create_dataset(trial):
    """Sample a batch size and wrap the train/validation arrays with ``cache_data``.

    Args:
        trial: Optuna trial used to draw ``batch_size`` from {32, 64, 128}.

    Returns:
        ``(train_data, val_data)`` as produced by ``cache_data``.
    """
    sampled_batch = trial.suggest_categorical("batch_size", [32, 64, 128])

    # Only the training pipeline receives buffer_size (presumably the shuffle
    # buffer, TODO confirm in cache_data); validation relies on its default.
    training = cache_data(
        x_train, y_train,
        batch_size=sampled_batch, buffer_size=Config.buffer_size,
    )
    validation = cache_data(x_val, y_val, batch_size=sampled_batch)
    return training, validation

def objective(trial):
    """Optuna objective: train one sampled model and return its validation loss.

    Args:
        trial: Optuna trial; it drives hyperparameter sampling in
            ``create_model`` / ``create_dataset`` and its ``number`` names the
            checkpoint file.

    Returns:
        The result of ``model.evaluate`` on the validation data after the
        best checkpointed weights have been reloaded.
    """
    model = create_model(trial)
    train_data, val_data = create_dataset(trial)

    # One checkpoint file per trial. The study runs trials concurrently
    # (n_jobs=-1), so a single shared 'model.h5' could be overwritten by
    # another trial between this trial's save and its load_weights call,
    # making it evaluate weights that belong to a different model.
    checkpoint_path = os.path.join(
        output_folder, f'model_trial_{trial.number}.h5'
    )

    early_stopping = EarlyStopping(
        patience=Config.early_stopping_patience,
        restore_best_weights=True
    )
    model_checkpoint = ModelCheckpoint(
        filepath=checkpoint_path,
        save_best_only=True, save_weights_only=True
    )
    model.fit(
        train_data, validation_data=val_data,
        epochs=Config.epochs,
        callbacks=[early_stopping, model_checkpoint],
        verbose=VERBOSE
    )
    # Reload the best checkpoint explicitly so the evaluated weights are the
    # best ones even if early stopping never triggered.
    model.load_weights(checkpoint_path)
    val_loss = model.evaluate(val_data, verbose=VERBOSE)

    return val_loss

In [22]:
study_name = 'LSTM'
storage_name = f"sqlite:///{study_name}.db"
# True: only load the stored study; False: create/resume it and run new trials.
load_only = False

if not load_only:
    # load_if_exists=True resumes a study of the same name already in storage.
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_name,
        direction='minimize',
        load_if_exists=True,
    )
    study.optimize(
        objective,
        n_trials=Config.n_trials,
        n_jobs=-1,
        gc_after_trial=True,
        show_progress_bar=VERBOSE,
    )
else:
    study = optuna.load_study(study_name=study_name, storage=storage_name)

# Summary of the study and of its best trial.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))

[32m[I 2022-12-22 23:06:23,640][0m Using an existing study with name 'LSTM' instead of creating a new one.[0m


  0%|          | 0/2 [00:00<?, ?it/s]

[32m[I 2022-12-22 23:06:43,910][0m Trial 2 finished with value: 0.7288652062416077 and parameters: {'learning_rate': 5.4126009713509914e-05, 'hidden_size': 128, 'dropout': 0.3, 'layers': 3}. Best is trial 2 with value: 0.7288652062416077.[0m
Number of finished trials:  4
Best trial:
  Value:  0.7288652062416077
  Params: 
    dropout: 0.3
    hidden_size: 128
    layers: 3
    learning_rate: 5.4126009713509914e-05


In [None]:
# A notebook renders only a cell's last expression, and this cell ends with
# to_csv (which returns None when given a path), so each figure must be
# shown explicitly or it is silently discarded.
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_param_importances(study).show()

# Use a dedicated name for the trials table: `df` holds the input data loaded
# in the preprocessing cell, and overwriting it would break re-running the
# split/scale cell without reloading the CSV first.
trials_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
trials_df.round(6).to_csv(os.path.join(output_folder, 'trials.csv'), index=False)