In [1]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

from torch import nn

import numpy as np

In [15]:
!pip install plotly

In [2]:
# Four CartPole-v1 copies behind one vectorized interface; used below for evaluation.
vec_env = make_vec_env(env_id="CartPole-v1", n_envs=4)

In [3]:
# Training budget (total timesteps) reused by the agents trained in this notebook.
budget = 20_000

# Baseline agent: A2C with only the seed, device and verbosity set explicitly.
model = A2C(
    policy="MlpPolicy",
    env="CartPole-v1",
    seed=8,
    device='cpu',
    verbose=1,
)
model.learn(total_timesteps=budget, progress_bar=True)

In [4]:
# Score the baseline agent over 50 deterministic episodes on the vectorized env.
mean, std = evaluate_policy(
    model,
    vec_env,
    deterministic=True,
    n_eval_episodes=50,
)
print(f'Mean reward = {mean:.3f}, Std of reward = +/-{std:.3f}')

In [5]:
# Network settings for the policy: separate two-layer (64, 64) heads for the
# policy (pi) and the value function (vf), with tanh activations.
# net_arch is passed as a plain dict: the list-wrapped form
# ([dict(pi=..., vf=...)]) is deprecated in Stable-Baselines3 since v1.8.
policy_kwargs = dict(
    net_arch=dict(pi=[64, 64], vf=[64, 64]),
    activation_fn=nn.Tanh,
)

# Hand-picked A2C settings for the manually tuned run.
hyperparameters = dict(
    n_steps=1024,
    learning_rate=1e-3,
    gamma=0.99,
    max_grad_norm=1,
    ent_coef=0.0001,
)

In [6]:
# Train the manually tuned agent with the same budget and the same seed as the
# baseline so the two runs are comparable and repeatable. `policy_kwargs` is
# now actually forwarded to the model instead of being defined and left unused.
model_tuned = A2C(
    "MlpPolicy",
    "CartPole-v1",
    policy_kwargs=policy_kwargs,
    seed=8,
    verbose=1,
    device='cpu',
    **hyperparameters,
).learn(total_timesteps=budget, progress_bar=True)

In [7]:
# Build a fresh 4-env evaluation set, then score the tuned agent over
# 50 deterministic episodes.
vec_env = make_vec_env(env_id="CartPole-v1", n_envs=4)
mean, std = evaluate_policy(
    model_tuned,
    vec_env,
    deterministic=True,
    n_eval_episodes=50,
)
print(f'Mean reward = {mean:.3f}, Std of reward = +/-{std:.3f}')

In [8]:
# obs = vec_env.reset()
# while True:
#     action, _states = model_tuned.predict(obs)
#     obs, rewards, dones, info = vec_env.step(action)
#     vec_env.render("human")

In [9]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

from typing import Any, Dict
import torch
import torch.nn as nn


from stable_baselines3.common.callbacks import EvalCallback

In [10]:
# --- Study-level budget ---
N_TRIALS = 100  # upper bound on the number of trials
N_JOBS = 1  # trials run in parallel
N_STARTUP_TRIALS = 5  # given as n_startup_trials to both the sampler and the pruner
TIMEOUT = 15 * 60  # 15 minutes, expressed in seconds

# --- Per-trial training and evaluation schedule ---
N_TIMESTEPS = 20_000  # training budget of a single trial
N_EVALUATIONS = 2  # evaluations performed during one training run
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # spacing between two evaluations
N_EVAL_ENVS = 5
N_EVAL_EPISODES = 10

ENV_ID = "CartPole-v1"

# Keyword arguments common to every model built during the search.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

In [11]:
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Draw one A2C configuration from an Optuna trial.

    The trial is asked, in this order, for ``gamma`` (stored as ``1 - gamma``),
    ``max_grad_norm``, ``exponent_n_steps``, ``lr``, ``net_arch`` and
    ``activation_fn``. The derived discount factor and rollout length are also
    recorded as user attributes so the real values are visible in the report.

    :param trial: Optuna trial object used for sampling
    :return: Keyword arguments ready to be passed to the ``A2C`` constructor
    """
    # Sampling (1 - gamma) on a log scale puts the discount in [0.9, 0.9999].
    one_minus_gamma = trial.suggest_float('gamma', 0.0001, 0.1, log=True)
    clip_norm = trial.suggest_float('max_grad_norm', 0.3, 5.0, log=True)
    # Exponent 3..10 gives rollout lengths 8, 16, ..., 1024.
    steps_exponent = trial.suggest_int('exponent_n_steps', 3, 10)
    # Learning rate is log-uniform between 1e-5 and 1.
    step_size = trial.suggest_float('lr', 1e-5, 1, log=True)
    arch_name = trial.suggest_categorical('net_arch', ['tiny', 'small'])
    activation_name = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    discount = 1.0 - one_minus_gamma
    rollout_length = 2 ** steps_exponent

    # Store the derived values next to the raw sampled ones.
    trial.set_user_attr('gamma', discount)
    trial.set_user_attr('n_steps', rollout_length)

    # Translate the categorical labels into the objects the policy expects.
    if arch_name == 'tiny':
        layer_sizes = {'pi': [64], 'vf': [64]}
    else:
        layer_sizes = {'pi': [64, 64], 'vf': [64, 64]}
    activation_by_name = {'tanh': nn.Tanh, 'relu': nn.ReLU}

    policy_settings = {
        'net_arch': layer_sizes,
        'activation_fn': activation_by_name[activation_name],
    }
    return {
        'n_steps': rollout_length,
        'gamma': discount,
        'learning_rate': step_size,
        'max_grad_norm': clip_norm,
        'policy_kwargs': policy_settings,
    }
    
    

In [12]:
class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports each evaluation to an Optuna trial.

    After every evaluation the mean reward is sent to the trial, and training
    is stopped as soon as the trial says it should be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object that receives the reports
    :param n_eval_episodes: Number of episodes per evaluation
    :param eval_freq: Evaluate the agent every ``eval_freq`` calls of the callback
    :param deterministic: Whether the evaluation should use a deterministic
        or a stochastic policy
    :param verbose: Verbosity level forwarded to the parent callback
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        parent_options = dict(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        super().__init__(**parent_options)
        self.trial = trial
        # Number of evaluations reported so far; doubles as the Optuna step index.
        self.eval_idx = 0
        # Set once the trial asks for pruning, so the caller can tell why training stopped.
        self.is_pruned = False

    def _on_step(self) -> bool:
        """
        Run an evaluation when one is due and report it to the trial.

        :return: ``False`` when the trial should be pruned, ``True`` otherwise
        """
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # The parent class performs the actual policy evaluation.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True
        self.is_pruned = True
        return False

In [13]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function used by Optuna to evaluate one configuration
    (i.e., one set of hyperparameters).

    Given a trial object, it samples hyperparameters, trains an A2C model
    with them and returns the mean reward of the last evaluation.

    :param trial: Optuna trial object
    :return: Mean episodic reward of the last evaluation, or NaN when
        training failed
    :raises optuna.exceptions.TrialPruned: when the evaluation callback
        stopped training because the trial should be pruned
    """
    # Start from the shared defaults and overlay the sampled hyperparameters.
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sample_a2c_params(trial))

    # Create the RL model
    model = A2C(**kwargs, device='cpu')

    # Separate environments used only for the periodic evaluations.
    eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)
    # Evaluate on `N_EVAL_EPISODES` episodes every `EVAL_FREQ` callback calls
    # and report each result to the trial.
    eval_callback = TrialEvalCallback(
        eval_envs,
        trial,
        N_EVAL_EPISODES,
        EVAL_FREQ,
        deterministic=True,
        verbose=0,
    )

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback, progress_bar=True)
    except (AssertionError, ValueError) as e:
        # Random hyperparameters can make training diverge to NaN. Depending on
        # where it is detected this surfaces as an AssertionError or as a
        # ValueError (e.g. PyTorch's distribution argument validation). Both
        # are handled here so a single bad trial does not abort the study.
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_envs.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [14]:
import torch as th

# Restrict PyTorch to a single thread; intended to make training faster here.
th.set_num_threads(1)

# Sampler choice: could also be random, CMA-ES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Median pruner whose warm-up length is one third of the evaluation count
# (integer division).
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)
# Build the study; higher mean reward is better.
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
)

# Run the search; a manual interrupt ends it early but keeps finished trials.
try:
    study.optimize(
        objective,
        n_trials=N_TRIALS,
        n_jobs=N_JOBS,
        timeout=TIMEOUT,
    )
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

# Summarise the best trial found.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

for heading, entries in (
    ("  Params: ", trial.params),
    ("  User attrs:", trial.user_attrs),
):
    print(heading)
    for key, value in entries.items():
        print(f"    {key}: {value}")

# Persist the full trial table for later inspection.
study.trials_dataframe().to_csv("study_results_a2c_cartpole.csv")

# Build both figures first, then display them in order.
fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

for figure in (fig1, fig2):
    figure.show()

In [22]:
import pandas as pd
import plotly
import matplotlib.pyplot as plt

In [25]:
# Reload the exported trial table from disk. From here on `study` names this
# DataFrame rather than the Optuna study object created earlier.
study = pd.read_csv(
    'study_results_a2c_cartpole.csv',
    index_col=0,  # use the first CSV column as the row index
)

# fig1 = plt.hist(study)
# fig2 = plt.hist(study, bins='auto')

# fig1.show()
# fig2.show()

In [26]:
# Preview the first ten rows of the reloaded trial table.
study.head(10)

In [28]:
# Display the `policy` attribute of `model`, the baseline A2C agent created near
# the top of the notebook (the `model` inside `objective` is local to that function).
model.policy