In [None]:
# Install the swig and cmake system packages (-y answers the prompts automatically).
# NOTE(review): presumably required to build the Box2D bindings used by LunarLander — confirm.
!apt install swig cmake -y

In [None]:
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit1/requirements-unit1.txt

In [None]:
!apt-get update -y
!apt-get install -y python3-opengl
!apt install ffmpeg -y
!apt install xvfb -y
!pip3 install pyvirtualdisplay

In [None]:
# Send signal 9 to this notebook's own process, which terminates the kernel.
# NOTE(review): presumably a forced runtime restart so the packages installed above
# are picked up — confirm. Everything below has to be run after the kernel is back.
import os
os.kill(os.getpid(), 9)

In [None]:
# Virtual display
# Build a pyvirtualdisplay Display object configured as invisible (visible=0)
# with a 1400x900 screen.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
# NOTE(review): the display is only constructed here; the start() call is commented
# out, so no virtual display is actually running after this cell.
# virtual_display.start()

In [None]:
# virtual_display.start()

In [1]:
import gymnasium as gym

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [None]:
# Quick-start: build a LunarLander-v2 environment and a default DQN agent on it.
import gymnasium as gym

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

# Intended number of training timesteps for the learn() call below.
budget = 10_000

# Create environment
env = gym.make("LunarLander-v2", render_mode="rgb_array")

# Instantiate the agent
model = DQN("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
# NOTE(review): the learn() call is commented out, so `model` is never trained in
# this cell and `budget` is currently unused.
# model.learn(total_timesteps=budget, progress_bar=True)


In [None]:
# Round-trip the agent through disk, then evaluate the reloaded copy.
# NOTE(review): the training call in the previous cell is commented out, so the
# model saved and evaluated here has not been trained unless learn() was run elsewhere.

# Save the agent
model.save("dqn_lunar")
del model  # delete trained model to demonstrate loading

# Load the trained agent
# NOTE: if you have loading issue, you can pass `print_system_info=True`
# to compare the system on which the model was trained vs the current one
# model = DQN.load("dqn_lunar", env=env, print_system_info=True)
model = DQN.load("dqn_lunar", env=env)

# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
# Runs 10 evaluation episodes on the environment attached to the loaded model.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

In [None]:
# Display the (mean, std) pair returned by evaluate_policy in the previous cell.
mean_reward, std_reward

In [None]:
# Enjoy trained agent
# Roll the policy out for 10000 steps on the model's own vectorized environment,
# rendering every step.
vec_env = model.get_env()
obs = vec_env.reset()
try:
    for i in range(10000):
        # Greedy (deterministic) action from the current observation.
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = vec_env.step(action)
        vec_env.render("human")
finally:
    # Always release the environment, even if the long rollout is interrupted
    # (e.g. kernel interrupt) or a step/render call raises.
    vec_env.close()

In [None]:
# Use the same environment id as every other cell in this notebook
# (the quick-start cell, ENV_ID, make_vec_env and the final gym.make all use "-v2").
env = gym.make("LunarLander-v2", render_mode="rgb_array")

In [2]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

from typing import Any, Dict
import torch
import torch.nn as nn

import plotly
import tensorboard

from stable_baselines3.common.callbacks import EvalCallback

In [3]:
%load_ext tensorboard

In [None]:
# --- Optuna study budget ---
N_TRIALS = 100  # upper bound on the number of trials
N_JOBS = 1  # how many trials run in parallel
N_STARTUP_TRIALS = 5  # random sampling stops after this many trials
TIMEOUT = 15 * 60  # study time limit in seconds (15 minutes)

# --- Per-trial training / evaluation budget ---
N_TIMESTEPS = 50_000  # training timesteps per trial
N_EVALUATIONS = 2  # evaluations performed during one training run
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # timesteps between two evaluations
N_EVAL_ENVS = 10
N_EVAL_EPISODES = 10

ENV_ID = "LunarLander-v2"

# Arguments shared by every trial; sampled hyperparameters are merged on top.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

In [None]:
def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sample one DQN hyperparameter configuration from an Optuna trial.

    Several sizes are drawn as integer exponents (``10 ** k`` or ``2 ** k``), so the
    values stored in ``trial.params`` for those names are the exponents. The
    resulting true values are recorded as user attributes of the trial.

    :param trial: Optuna trial object used to draw the suggestions
    :return: Keyword arguments to merge into the ``DQN`` constructor call
    """
    # Optimizer step size, sampled on a log scale
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1, log=True)

    # Replay buffer size: 10^3 .. 10^7 transitions
    buffer_size = 10 ** trial.suggest_int('buffer_size', 3, 7)

    # Number of transitions collected before learning starts: 2^3 .. 2^10
    learning_starts = 2 ** trial.suggest_int('learning_starts', 3, 10)

    # Minibatch size for each gradient update: 2^3 .. 2^10
    batch_size = 2 ** trial.suggest_int('batch_size', 3, 10)

    # Soft update ("Polyak update") coefficient; 1.0 is a hard update
    tau = trial.suggest_float('tau', 0.75, 1.0)

    # Discount factor
    gamma = trial.suggest_float('gamma', 0.9, 0.9999)

    # Update the model every train_freq steps
    train_freq = trial.suggest_int('train_freq', 4, 200)

    # Gradient steps performed after each rollout
    gradient_steps = trial.suggest_int('gradient_steps', 1, 4)

    # Update the target network every target_update_interval environment steps.
    # NOTE(review): the upper values (10^5 .. 10^7) exceed the 5e4-step N_TIMESTEPS
    # budget, so the target network would presumably never be updated in those
    # trials — consider narrowing the range.
    target_update_interval = 10 ** trial.suggest_int('target_update_interval', 3, 7)

    # Fraction of the training period over which the exploration rate is reduced
    exploration_fraction = trial.suggest_float('exploration_fraction', 0.05, 0.5)

    # Maximum value for the gradient clipping
    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 5)

    net_arch = trial.suggest_categorical('net_arch', ['tiny', 'small'])
    activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu'])

    # display true values ('gamma' is kept for compatibility with earlier reports;
    # the other entries expose the values behind the sampled exponents)
    trial.set_user_attr('gamma', gamma)
    trial.set_user_attr('buffer_size', buffer_size)
    trial.set_user_attr('learning_starts', learning_starts)
    trial.set_user_attr('batch_size', batch_size)
    trial.set_user_attr('target_update_interval', target_update_interval)

    # Translate the categorical labels into the objects the policy expects
    net_arch = [64] if net_arch == 'tiny' else [128, 128]
    activation_fn = {'tanh': nn.Tanh, 'relu': nn.ReLU}[activation_fn]

    return {
        'exploration_fraction': exploration_fraction,
        'target_update_interval': target_update_interval,
        'gradient_steps': gradient_steps,
        'train_freq': train_freq,
        'tau': tau,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        # Previously sampled but never returned, so the sampled value had no
        # effect and the library default was used instead.
        'learning_starts': learning_starts,
        'gamma': gamma,
        'learning_rate': learning_rate,
        'max_grad_norm': max_grad_norm,
        'policy_kwargs': {
            'net_arch': net_arch,
            'activation_fn': activation_fn
        }}
    
    

In [None]:
class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports each evaluation result to an Optuna trial
    and stops training when Optuna asks for the trial to be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level forwarded to the parent callback
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        parent_options = dict(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        super().__init__(**parent_options)
        self.trial = trial
        # Number of evaluations reported so far; used as the Optuna step index.
        self.eval_idx = 0
        # Becomes True once Optuna has requested pruning.
        self.is_pruned = False

    def _on_step(self) -> bool:
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # The parent class runs the evaluation and updates last_mean_reward.
        super()._on_step()
        self.eval_idx += 1
        # Hand the intermediate result to Optuna.
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True
        # Returning False tells the training loop to stop.
        self.is_pruned = True
        return False

In [None]:
def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective: train one DQN agent with freshly sampled
    hyperparameters and score it.

    The defaults in ``DEFAULT_HYPERPARAMS`` are combined with the values
    drawn by ``sample_dqn_params``; the agent is trained for ``N_TIMESTEPS``
    while ``TrialEvalCallback`` reports intermediate results to the trial.

    :param trial: Optuna trial object
    :return: Mean episodic reward of the last evaluation, or NaN if training
        raised an ``AssertionError``
    :raises optuna.exceptions.TrialPruned: if the callback pruned the trial
    """
    # Defaults first, sampled values on top.
    hyperparams = {**DEFAULT_HYPERPARAMS, **sample_dqn_params(trial)}
    print(hyperparams)

    agent = DQN(**hyperparams)
    eval_envs = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)
    eval_callback = TrialEvalCallback(
        eval_envs,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
        verbose=0,
    )

    training_failed = False
    try:
        agent.learn(N_TIMESTEPS, callback=eval_callback, progress_bar=True)
    except AssertionError as err:
        # Sometimes, random hyperparams can generate NaN
        print(err)
        training_failed = True
    finally:
        # Release both the training and the evaluation environments.
        agent.env.close()
        eval_envs.close()

    if training_failed:
        # A NaN value tells the optimizer that the trial failed.
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [None]:
# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# No pruning during the first N_STARTUP_TRIALS trials. Within a trial, pruning is
# held back for N_EVALUATIONS // 3 reported steps; with N_EVALUATIONS < 3 that is 0,
# i.e. there is no per-trial warm-up.
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    # A manual interrupt ends the search early; the results so far are still reported.
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
# `trial` is read again by later cells.
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report. The file name now matches what this study tunes (DQN on
# LunarLander); it previously said "a2c_cartpole".
study.trials_dataframe().to_csv("study_results_dqn_lunarlander.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

In [None]:
# Display the number of the best trial selected in the study cell above.
trial.number

In [13]:
# Hand-picked DQN hyperparameters, passed as **kwargs to the DQN constructors below.
# The three commented-out entries are not passed, so the library defaults apply to them.
# NOTE(review): tau=0.005 is a small soft-update coefficient while
# target_update_interval is left at its default — confirm that the resulting
# target-network update schedule is intended.
kwargs = {
    'exploration_fraction': 0.7, 
    # 'target_update_interval': 10000, 
    # 'gradient_steps': 3, 
    # 'train_freq': 98, 
    'tau': 0.005, 
    'batch_size': 128, 
    'buffer_size': 10000, 
    'gamma': 0.99, 
    'learning_rate': 1e-4, 
    'max_grad_norm': 1.0, 
    'policy_kwargs': {'net_arch': [128, 128], 'activation_fn': nn.ReLU},
}

In [None]:
# `kwargs` holds neither `policy` nor `env`, which DQN requires, so take them from
# DEFAULT_HYPERPARAMS (the same defaults the Optuna objective uses).
model = DQN(**DEFAULT_HYPERPARAMS, **kwargs)

In [None]:
# Work on a copy: the following cells add and delete keys, which should not
# alter the parameter dict stored on the trial object itself.
best_kwargs = dict(trial.params)

In [None]:
# Swap the sampled activation label for the torch module class
# (nn.Tanh is what sample_dqn_params maps 'tanh' to).
best_kwargs['activation_fn'] = nn.Tanh

In [None]:
# Swap the sampled architecture label for the explicit layer list
# ([64] is what sample_dqn_params maps 'tiny' to).
best_kwargs['net_arch'] = [64]

In [None]:
# Override the learning rate with a hard-coded value.
# NOTE(review): presumably copied from an earlier best trial — confirm the source.
best_kwargs['learning_rate'] = 1.0999765429841484e-05

In [None]:
# Display the current contents of best_kwargs.
best_kwargs

In [None]:
# Drop a stale 'lr' entry if one exists. sample_dqn_params names this parameter
# 'learning_rate', so a plain `del` raises KeyError on results from this notebook.
best_kwargs.pop('lr', None)

In [None]:
# Policy network settings: one hidden layer of 64 units with Tanh activations.
policy_kwargs = dict(net_arch=[64], activation_fn=torch.nn.Tanh)

In [None]:
# Nest the network settings under 'policy_kwargs', the key layout that
# sample_dqn_params returns.
best_kwargs['policy_kwargs'] = policy_kwargs

In [None]:
# Remove the flat 'net_arch' entry from best_kwargs.
del best_kwargs['net_arch']

In [None]:
# Remove the flat 'activation_fn' entry from best_kwargs.
del best_kwargs['activation_fn']

In [14]:
# Vectorized LunarLander-v2 environment with 6 copies; test_model below is
# trained and evaluated on it.
vec_env = make_vec_env('LunarLander-v2', n_envs=6)

In [15]:
# Discard any test_model left over from a previous run of this cell; a NameError
# just means there was none.
try:
    del test_model
except NameError:
    pass
# Fresh DQN agent on vec_env using the hand-picked `kwargs`; TensorBoard logs go to
# ./lunarlander_dqn_tensorboard_logs/.
test_model = DQN('MlpPolicy', env=vec_env, **kwargs, verbose=0, tensorboard_log='./lunarlander_dqn_tensorboard_logs/')

In [19]:
# Train test_model for 2,000,000 timesteps with a progress bar; the TensorBoard
# run is logged under the name 'final_run21'.
test_model.learn(total_timesteps=2_000_000, progress_bar=True, tb_log_name='final_run21')

In [20]:
# Evaluate test_model over 10 deterministic episodes and print mean/std reward.
# NOTE(review): this reuses vec_env, the same environment object the model was trained on.
mean, std = evaluate_policy(model=test_model, env=vec_env, n_eval_episodes=10, deterministic=True)
print(f'Mean: {mean:.2f}, Std: {std:.2f}')

In [None]:
# LunarLander-v2 environment in rgb_array render mode; base_model below is built on it.
env = gym.make('LunarLander-v2', render_mode='rgb_array')

In [None]:
# Discard any base_model left over from a previous run of this cell; a NameError
# just means there was none.
try:
    del base_model
except NameError:
    pass
    
# Fresh DQN agent on `env` using the hand-picked `kwargs`; TensorBoard logs go to
# ./lunarlander_dqn_tensorboard_logs/.
base_model = DQN('MlpPolicy', env=env, verbose=0, tensorboard_log='./lunarlander_dqn_tensorboard_logs/', **kwargs)

In [None]:
# Train base_model for 200,000 timesteps with a progress bar; the TensorBoard run
# is logged under the name 'first_run'.
base_model.learn(total_timesteps=200_000, progress_bar=True, tb_log_name='first_run')

In [None]:
# Evaluate base_model over 10 deterministic episodes and print mean/std reward.
# NOTE(review): this reuses `env`, the same environment object the model was trained
# on, and overwrites the `mean`/`std` values from the test_model evaluation.
mean, std = evaluate_policy(model=base_model, env=env, n_eval_episodes=10, deterministic=True)
print(f'Mean: {mean:.2f}, Std: {std:.2f}')

In [21]:
# Open TensorBoard on the log directory written by the training cells.
# NOTE(review): --host=0.0.0.0 presumably makes the server reachable from other
# machines on the network — confirm this exposure is intended.
%tensorboard --logdir ./lunarlander_dqn_tensorboard_logs/ --host=0.0.0.0

In [11]:
# Roll test_model out for 10000 steps on its own vectorized environment,
# rendering every step.
vec_env = test_model.get_env()
obs = vec_env.reset()
try:
    for i in range(10000):
        # Greedy (deterministic) action from the current observation.
        action, _states = test_model.predict(obs, deterministic=True)
        obs, rewards, dones, info = vec_env.step(action)
        vec_env.render("human")
finally:
    # Always release the environment, even if the long rollout is interrupted
    # (e.g. kernel interrupt) or a step/render call raises.
    vec_env.close()

In [None]:
# Display the tau attribute of base_model.
base_model.tau