Hyperparameter tuning with Optuna. Same as study 12, except that some of the parameter ranges are restricted even further. gncore. Still only 5 edges per node; 200 epochs.

### imports and setup

In [1]:
# IPython extension that auto-reloads imported modules, so that any changes to them are picked up before code in this notebook runs
%load_ext autoreload 
%autoreload 2

In [2]:
from utils.jraph_training import train_and_evaluate_with_data, create_dataset
# from utils.jraph_models import MLPGraphNetwork
from utils.jraph_data import print_graph_fts
from utils.jraph_vis import plot_predictions
from utils.hyperparam_tuning import remove_bad_trials, get_best_trial_config, get_best_trial_workdir
import ml_collections
import optuna 
from flax import linen as nn
from functools import partial
from datetime import datetime
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set up logging: grab the root logger and raise its level to WARNING.
import logging
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

### set up functions for optuna

In [4]:
# Root directory for tuning artifacts. objective() builds each trial's workdir
# and prepare_study() builds the study's SQLite database path under
# <CHECKPOINT_PATH>/<study_name>/.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
CHECKPOINT_PATH = "/Users/h.lu/Documents/_code/_research lorenz code/lorenzGNN/experiments/tuning"

In [19]:
def objective(trial, study_name, datasets):
    """Train one model for an Optuna trial and return its validation loss.

    Args:
        trial: Optuna trial object; used to sample this run's hyperparameters
            and forwarded to the training routine.
        study_name: name of the study; used to build this trial's workdir
            under CHECKPOINT_PATH.
        datasets: dictionary of pre-generated data. Passed in explicitly so
            that the same dataset is not regenerated for every trial.

    Returns:
        The 'loss' entry of eval_metrics_dict['val'].compute(), i.e. the
        validation loss (MSE) that the study optimizes.
    """
    cfg = ml_collections.ConfigDict()
    cfg.max_checkpts_to_keep = 2

    # Optimizer: fixed to adam for this study (the sgd/momentum search is
    # disabled); only the learning rate is sampled, on a log scale.
    cfg.optimizer = "adam"
    cfg.learning_rate = trial.suggest_float(
        'learning_rate', 3e-4, 3e-3, log=True)

    # Data parameter that is used in training.
    cfg.output_steps = 4

    # Training schedule.
    cfg.batch_size = 1  # variable currently not used
    cfg.epochs = 200
    cfg.log_every_epochs = 5
    cfg.eval_every_epochs = 5
    cfg.checkpoint_every_epochs = 10

    # GNN architecture.
    cfg.model = 'MLPGraphNetwork'
    cfg.n_blocks = trial.suggest_int('n_blocks', 1, 6)
    cfg.share_params = False
    cfg.dropout_rate = trial.suggest_float('dropout_rate', 0, 0.2)
    # Setting skip_connections to True was throwing a broadcast error in
    # add_graphs_tuples_nodes, so it stays off.
    cfg.skip_connections = False
    cfg.layer_norm = False  # TODO perhaps we want to turn on later
    # The activation is no longer searched over; it is pinned to leaky_relu.
    cfg.activation = "leaky_relu"

    cfg.pred_x1 = True
    cfg.pred_x2 = True

    # Two output features when both x1 and x2 are predicted, otherwise one.
    n_outputs = 2 if (cfg.pred_x1 and cfg.pred_x2) else 1

    # Hidden layer sizes are powers of 2; suggest_int bounds are inclusive,
    # so each edge MLP layer has between 2 and 8 features.
    edge_power_1 = trial.suggest_int("edge_mlp_1_power", 1, 3)
    edge_power_2 = trial.suggest_int("edge_mlp_2_power", 1, 3)
    cfg.edge_features = (2**edge_power_1, 2**edge_power_2)

    # The last node feature size is the number of features the graph predicts.
    node_power_1 = trial.suggest_int("node_mlp_1_power", 1, 6)
    cfg.node_features = (2**node_power_1, n_outputs)
    cfg.global_features = None

    # Per-trial working directory.
    # TODO: check if we actually care about referencing this in the future or
    # if we can just create a temp dir
    trial_dir = os.path.join(
        CHECKPOINT_PATH, study_name, f"trial_{trial.number}")

    # Run training; only the evaluation metrics are needed afterwards.
    _, _, eval_metrics_dict = train_and_evaluate_with_data(
        config=cfg, workdir=trial_dir, datasets=datasets, trial=trial)

    val_metrics = eval_metrics_dict['val'].compute()
    return val_metrics['loss']




In [6]:
def get_data_config():
    """Build the configuration used to generate the dataset.

    Returns:
        An ml_collections.ConfigDict with the sampling-window settings, the
        train/val/test split fractions, and the K, F, c, b, h, seed,
        normalize and fully_connected_edges options.
    """
    # Window lengths, in timesteps; they also determine sample_buffer below.
    n_input_steps = 1
    n_output_delay = 8  # predict 24 hrs into the future
    n_output_steps = 4

    cfg = ml_collections.ConfigDict()

    cfg.n_samples = 10_000
    cfg.input_steps = n_input_steps
    cfg.output_delay = n_output_delay
    cfg.output_steps = n_output_steps
    # Equivalent to 3 hours: with time_resolution = 120 raw points per
    # 5 days, sampling every 3rd point gives a 3 hour timestep (a native
    # 3 hour resolution would be 5*24/3 = 40).
    cfg.timestep_duration = 3
    # sample_buffer is the number of timesteps strictly between the end of
    # one full sample and the start of the next. It is negative here so that
    # the sample inputs are continuous, i.e. consecutive samples overlap.
    window_span = n_input_steps + n_output_delay + n_output_steps
    cfg.sample_buffer = -(window_span - 1)
    # Number of raw data points generated per time unit, equivalent to the
    # number of data points generated per 5 days in the simulation.
    cfg.time_resolution = 120
    cfg.init_buffer_samples = 100

    # Dataset split fractions.
    cfg.train_pct = 0.7
    cfg.val_pct = 0.2
    cfg.test_pct = 0.1

    # NOTE(review): K, F, c, b, h are presumably the Lorenz system
    # parameters -- confirm against create_dataset.
    cfg.K = 36
    cfg.F = 8
    cfg.c = 10
    cfg.b = 10
    cfg.h = 1
    cfg.seed = 42
    cfg.normalize = True
    cfg.fully_connected_edges = False

    return cfg

In [20]:
def prepare_study(study_name):
    """Generate the dataset and create (or reload) the Optuna study.

    Args:
        study_name: name of the study; also the name of the sub-directory of
            CHECKPOINT_PATH that holds the study's SQLite database.

    Returns:
        A tuple (study, objective_partial): the Optuna study, and the
        objective function with study_name and the generated datasets
        already bound, ready to be passed to study.optimize.
    """
    # generate the dataset once and print the features of one training graph
    dataset_config = get_data_config()
    datasets = create_dataset(dataset_config)
    print_graph_fts(datasets['train']['inputs'][0][0])

    # get the objective function that reuses the pre-generated datasets
    objective_partial = partial(objective, study_name=study_name,
                                datasets=datasets)

    # make sure the study directory exists before the database is opened;
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists test
    study_dir = os.path.join(CHECKPOINT_PATH, study_name)
    os.makedirs(study_dir, exist_ok=True)
    db_path = os.path.join(study_dir, "optuna_hparam_search.db")

    # create the study, or load it if one with this name already exists in
    # the database (the optimization itself is started by the caller)
    study = optuna.create_study(
        study_name=study_name,
        storage=f'sqlite:///{db_path}', # generates a new db if it doesn't exist
        direction='minimize',
        pruner=optuna.pruners.MedianPruner(
            n_startup_trials=5,
            n_warmup_steps=50,
            ),
        load_if_exists=True,
    )

    return study, objective_partial

### hyperparameter tuning 

In [21]:
# Create or reload study 13. prepare_study also generates the dataset and
# prints the features of one training graph.
study13, objective_partial = prepare_study(study_name="hparam_study_13")

[I 2023-12-08 22:02:10,364] Using an existing study with name 'hparam_study_13' instead of creating a new one.


Number of nodes: 36
Number of edges: 180
Node features shape: (36, 2)
Edge features shape: (180, 1)
Global features shape: (1, 1)


In [14]:
# Top the study up to 5 trials in total: request only the difference between
# the target and the number of trials already recorded in the study.
study13.optimize(objective_partial, 
                n_trials=5-len(study13.trials), 
                n_jobs=1)

[I 2023-12-08 21:46:22,433] Trial 0 finished with value: 0.7255893349647522 and parameters: {'learning_rate': 0.00034446252948483, 'n_blocks': 3, 'dropout_rate': 0.14502811264372006, 'activation': 'relu', 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 6}. Best is trial 0 with value: 0.7255893349647522.
[W 2023-12-08 21:59:26,337] Trial 1 failed with parameters: {'learning_rate': 0.0004068974819787106, 'n_blocks': 3, 'dropout_rate': 0.19502952548694075, 'activation': 'relu', 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 1, 'node_mlp_1_power': 3} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/h.lu/Documents/_code/_research lorenz code/lorenzGNN/lorenzvenv/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/py/716jgpq538x2wf_s8t__x9b80000gn/T/ipykernel_5672/3769017719.py", line 64, in objective
    state, train_metrics, eval_metrics_dic

KeyboardInterrupt: 

Restrict the activation to just leaky_relu, since we know that it worked best.

In [22]:
# Continue topping the study up to 5 trials in total, now with the activation
# fixed in objective(); only the number of trials still missing is requested.
study13.optimize(objective_partial, 
                n_trials=5-len(study13.trials), 
                n_jobs=1)

[I 2023-12-08 22:52:56,142] Trial 3 finished with value: 0.522697925567627 and parameters: {'learning_rate': 0.0009083317205736135, 'n_blocks': 5, 'dropout_rate': 0.03140123960411469, 'edge_mlp_1_power': 2, 'edge_mlp_2_power': 1, 'node_mlp_1_power': 3}. Best is trial 3 with value: 0.522697925567627.
[I 2023-12-08 23:09:15,099] Trial 4 finished with value: 0.6420753598213196 and parameters: {'learning_rate': 0.0009020997227325296, 'n_blocks': 1, 'dropout_rate': 0.13378871299460265, 'edge_mlp_1_power': 2, 'edge_mlp_2_power': 1, 'node_mlp_1_power': 5}. Best is trial 3 with value: 0.522697925567627.


OK, it's odd that we still occasionally get large errors, as in trial 0.

In [23]:
# Inspect every trial recorded so far (state, value, params, intermediate values).
study13.trials

[FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.7255893349647522], datetime_start=datetime.datetime(2023, 12, 8, 21, 7, 4, 345436), datetime_complete=datetime.datetime(2023, 12, 8, 21, 46, 22, 278917), params={'learning_rate': 0.00034446252948483, 'n_blocks': 3, 'dropout_rate': 0.14502811264372006, 'activation': 'relu', 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 6}, user_attrs={}, system_attrs={}, intermediate_values={0: 7.669472694396973, 5: 8.407955169677734, 10: 0.9251963496208191, 15: 0.7105982899665833, 20: 0.7259289622306824, 25: 0.7351808547973633, 30: 0.7290900945663452, 35: 0.7277955412864685, 40: 0.7036943435668945, 45: 0.7328064441680908, 50: 0.7129110097885132, 55: 0.7416166663169861, 60: 0.7237672805786133, 65: 0.756308913230896, 70: 0.720796525478363, 75: 0.73613041639328, 80: 0.7338652610778809, 85: 0.7686334252357483, 90: 0.7413269281387329, 95: 0.7322515249252319, 100: 0.725441575050354, 105: 0.715444028377533, 110: 0.744888961315155,

let's plot the best trial predictions so far

In [24]:
# Generate the datasets at notebook scope for plotting; the copy created in
# prepare_study is local to that function.
datasets = create_dataset(get_data_config())

In [26]:
# Plot the best trial's predictions on the validation set for node 0, first rollout step.
# NOTE: the recorded run of this cell failed with KeyError: 'optimizer' (see the output below).
# NOTE(review): presumably because 'optimizer' is fixed in objective() rather than
# suggested through the trial, so it is absent from the trial params -- confirm in
# utils.hyperparam_tuning.get_best_trial_config.
plot_predictions(
    config=get_best_trial_config(study=study13),
    workdir=get_best_trial_workdir(study=study13), # for loading checkpoints 
    plot_ith_rollout_step=0, # 0 indexed # for this study, we have a 4-step rollout 
    node=0, # 0-indexed 
    plot_mode="val", # i.e. "train"/"val"/"test"
    datasets=datasets,
    plot_days=60,
)

KeyError: 'optimizer'

In [None]:
# Plot the best trial's predictions on the training set for node 0, first rollout step.
plot_predictions(
    config=get_best_trial_config(study=study13),
    workdir=get_best_trial_workdir(study=study13), # for loading checkpoints 
    plot_ith_rollout_step=0, # 0 indexed # for this study, we have a 4-step rollout 
    node=0, # 0-indexed 
    plot_mode="train", # i.e. "train"/"val"/"test"
    datasets=datasets,
    plot_days=60,
)

visualize tuning and loss landscape

In [None]:
# Plot the intermediate values recorded for each trial of the study.
fig = optuna.visualization.plot_intermediate_values(study13)
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and dropout_rate:
fig = optuna.visualization.plot_contour(study13, params=['learning_rate', 'dropout_rate'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and node_mlp_1_power:
fig = optuna.visualization.plot_contour(study13, params=['learning_rate', 'node_mlp_1_power'])
fig.show()

In [28]:
# Top the study up to 50 trials in total: request only the difference between
# the target and the number of trials already recorded in the study.
study13.optimize(objective_partial, 
                n_trials=50-len(study13.trials), 
                n_jobs=1)

[I 2023-12-09 00:07:25,437] Trial 6 finished with value: 0.6111515760421753 and parameters: {'learning_rate': 0.0025220515143415056, 'n_blocks': 1, 'dropout_rate': 0.11863238514538246, 'edge_mlp_1_power': 3, 'edge_mlp_2_power': 1, 'node_mlp_1_power': 5}. Best is trial 3 with value: 0.522697925567627.
[I 2023-12-09 08:56:47,306] Trial 7 finished with value: 0.5996211767196655 and parameters: {'learning_rate': 0.0006209458222331137, 'n_blocks': 3, 'dropout_rate': 0.07086806451335935, 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 3, 'node_mlp_1_power': 4}. Best is trial 3 with value: 0.522697925567627.
[I 2023-12-09 10:04:51,498] Trial 8 finished with value: 0.49293312430381775 and parameters: {'learning_rate': 0.000353045843679119, 'n_blocks': 5, 'dropout_rate': 0.012070411228506696, 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 2}. Best is trial 8 with value: 0.49293312430381775.
[I 2023-12-09 10:49:39,217] Trial 9 finished with value: 0.5516920685768127 and parameters:

plot best trial predictions

In [None]:
# Plot the best trial's predictions on the validation set for node 0, first rollout step.
# NOTE(review): this call passes plot_all where earlier cells pass plot_days --
# confirm which keyword the current plot_predictions accepts.
plot_predictions(
    config=get_best_trial_config(study=study13),
    workdir=get_best_trial_workdir(study=study13), # for loading checkpoints 
    plot_ith_rollout_step=0, # 0 indexed # for this study, we have a 4-step rollout 
    node=0, # 0-indexed 
    plot_mode="val", # i.e. "train"/"val"/"test"
    datasets=datasets,
    plot_all=False, # if false, only plot the first 100 
)

In [None]:
# Plot the best trial's predictions on the validation set for node 0, at the
# last step (index 3) of the 4-step rollout.
plot_predictions(
    config=get_best_trial_config(study=study13),
    workdir=get_best_trial_workdir(study=study13), # for loading checkpoints 
    plot_ith_rollout_step=3, # 0 indexed # for this study, we have a 4-step rollout 
    node=0, # 0-indexed 
    plot_mode="val", # i.e. "train"/"val"/"test"
    datasets=datasets,
    plot_all=False, # if false, only plot the first 100 
)

visualize trials excluding the really bad ones

In [None]:
# Build the study object used for the plots below.
# NOTE(review): presumably a copy of study13 with the worst trials filtered
# out -- confirm in utils.hyperparam_tuning.remove_bad_trials.
study_vis = remove_bad_trials(study13)

In [None]:
# Plot the intermediate values recorded for each trial kept in study_vis.
fig = optuna.visualization.plot_intermediate_values(study_vis)
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and dropout_rate:
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'dropout_rate'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and edge_mlp_1_power:
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'edge_mlp_1_power'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and node_mlp_1_power:
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'node_mlp_1_power'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and n_blocks.
# 'optimizer' is fixed to "adam" in objective() and never suggested through
# the trial, so it is not a parameter of this study and plot_contour would
# reject it; the tuned parameter n_blocks is plotted instead.
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'n_blocks'])

fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# n_blocks and dropout_rate.
# 'optimizer' is never suggested through the trial and 'activation' is no
# longer suggested in objective(), so neither is a usable axis for this
# study; two tuned parameters are plotted instead.
fig = optuna.visualization.plot_contour(study_vis, params=['n_blocks', 'dropout_rate'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# n_blocks and node_mlp_1_power.
# 'activation' is fixed to "leaky_relu" in objective() and no longer
# suggested through the trial, so the tuned parameter n_blocks is plotted
# instead.
fig = optuna.visualization.plot_contour(study_vis, params=['n_blocks', 'node_mlp_1_power'])
fig.show()

In [None]:
# Top the study up to 100 trials in total: request only the difference between
# the target and the number of trials already recorded in the study.
study13.optimize(objective_partial, 
                n_trials=100-len(study13.trials), 
                n_jobs=1)

In [None]:
# Top the study up to 200 trials in total: request only the difference between
# the target and the number of trials already recorded in the study.
study13.optimize(objective_partial, 
                n_trials=200-len(study13.trials), 
                n_jobs=1)

plot best trial predictions

In [None]:
# Plot the best trial's predictions on the validation set for node 0, first rollout step.
plot_predictions(
    config=get_best_trial_config(study=study13),
    workdir=get_best_trial_workdir(study=study13), # for loading checkpoints 
    plot_ith_rollout_step=0, # 0 indexed # for this study, we have a 4-step rollout 
    node=0, # 0-indexed 
    plot_mode="val", # i.e. "train"/"val"/"test"
    datasets=datasets,
    plot_all=False, # if false, only plot the first 100 
)

visualize trials excluding the really bad ones

In [None]:
# Rebuild the study object used for the plots below from the updated study13.
# NOTE(review): presumably a copy with the worst trials filtered out --
# confirm in utils.hyperparam_tuning.remove_bad_trials.
study_vis = remove_bad_trials(study13)

In [None]:
# Inspect the trials kept in study_vis.
study_vis.trials

In [None]:
# Plot the intermediate values recorded for each trial kept in study_vis.
fig = optuna.visualization.plot_intermediate_values(study_vis)
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and dropout_rate:
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'dropout_rate'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and edge_mlp_1_power:
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'edge_mlp_1_power'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and node_mlp_1_power:
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'node_mlp_1_power'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# learning_rate and n_blocks.
# 'optimizer' is fixed to "adam" in objective() and never suggested through
# the trial, so it is not a parameter of this study and plot_contour would
# reject it; the tuned parameter n_blocks is plotted instead.
fig = optuna.visualization.plot_contour(study_vis, params=['learning_rate', 'n_blocks'])

fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# n_blocks and dropout_rate.
# 'optimizer' is never suggested through the trial and 'activation' is no
# longer suggested in objective(), so neither is a usable axis for this
# study; two tuned parameters are plotted instead.
fig = optuna.visualization.plot_contour(study_vis, params=['n_blocks', 'dropout_rate'])
fig.show()

In [None]:
# Contour plot of the objective value (validation loss, minimized) over
# n_blocks and node_mlp_1_power.
# 'activation' is fixed to "leaky_relu" in objective() and no longer
# suggested through the trial, so the tuned parameter n_blocks is plotted
# instead.
fig = optuna.visualization.plot_contour(study_vis, params=['n_blocks', 'node_mlp_1_power'])
fig.show()

In [None]:
# Resume topping the study up to 200 trials in total: request only the
# difference between the target and the number of trials already recorded.
study13.optimize(objective_partial, 
                n_trials=200-len(study13.trials), 
                n_jobs=1)

In [None]:
# Plot the best trial's predictions on the validation set for node 0, first rollout step.
plot_predictions(
    config=get_best_trial_config(study=study13),
    workdir=get_best_trial_workdir(study=study13), # for loading checkpoints 
    plot_ith_rollout_step=0, # 0 indexed # for this study, we have a 4-step rollout 
    node=0, # 0-indexed 
    plot_mode="val", # i.e. "train"/"val"/"test"
    datasets=datasets,
    plot_all=False, # if false, only plot the first 100 
)

In [None]:
# Rebuild the study object used for the plots below from the updated study13.
# NOTE(review): presumably a copy with the worst trials filtered out --
# confirm in utils.hyperparam_tuning.remove_bad_trials.
study_vis = remove_bad_trials(study13)

In [None]:
# Inspect the trials kept in study_vis.
study_vis.trials

In [None]:
# Plot the intermediate values recorded for each trial kept in study_vis.
fig = optuna.visualization.plot_intermediate_values(study_vis)
fig.show()

In [None]:
# Contour plots of the objective value (validation loss, minimized); no
# `params` argument is given, so the parameters to plot are left to plot_contour:
fig = optuna.visualization.plot_contour(study_vis)
fig.show()