<center><h2>Sorbonne université</h2></center>
<center><h4>IAR - Intelligence artificielle pour la robotique</h4></center>
<center><h4>M2 Artificial Intelligence</h4></center>
<center><h1>Apprentissage par renforcement profond</h1></center>
<center><h3>LunarLander-v2</h3></center>
<br />
<center><h4>Thomas CORCORAL - <a href="https://www.linkedin.com/in/thomas-corcoral/?locale=en_US">linkedIn</a></h4></center>

# 1. Installation

## 1.1 git clone

In [None]:
#!git clone https://github.com/ThomasCorcoral/rl-baselines3-zoo

In [None]:
#!mv rl-baselines3-zoo/* ./

In [None]:
#!rm -r rl-baselines3-zoo/

In [None]:
# -p creates data/ and data/policies/ in one call and does not fail when the cell is re-run
!mkdir -p data/policies

## 1.2 pip install

In [None]:
# Install the packages listed in requirements.txt (looked up in the current working directory)
!pip install -r requirements.txt

Some corrections to support different environments

In [None]:
# Box2D bindings, installed explicitly before the gym extras
!pip3 install Box2D
!pip3 install box2d-py
# Extras are quoted so the shell never interprets the brackets as a glob pattern
!pip3 install "gym[all]"
# gym declares this extra as "box2d"; "Box_2D" is not a known extra, so pip would only warn and install nothing more
!pip3 install "gym[box2d]" # To support all envs (some problems with Box_2D on Kaggle)
!pip install sb3-contrib
!pip install pyglet
!pip install huggingface_hub
!pip install huggingface_sb3

## 1.3 import

In [None]:
import gym
import numpy as np

from stable_baselines3 import PPO, SAC
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

from typing import Any
from typing import Dict

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

import torch
import torch.nn as nn

# 2. Premiers essais

In [None]:
# Algorithm ids are written in lowercase, like the logs/<algo>/ folders read by the cp commands below
!python train.py --algo dqn --env LunarLander-v2

In [None]:
!cp logs/dqn/LunarLander-v2_1/best_model.zip ./data/policies/LunarLander-v2#dqn#dqn1.zip
!python sb3_evaluator.py
!rm ./data/policies/LunarLander-v2#dqn#dqn1.zip

In [None]:
!python train.py --algo ppo --env LunarLander-v2

In [None]:
!cp logs/ppo/LunarLander-v2_1/best_model.zip ./data/policies/LunarLander-v2#ppo#ppo1.zip
!python sb3_evaluator.py
!rm ./data/policies/LunarLander-v2#ppo#ppo1.zip

In [None]:
!python train.py --algo ppo --env LunarLanderContinuous-v2

In [None]:
!cp logs/ppo/LunarLanderContinuous-v2_1/best_model.zip ./data/policies/LunarLanderContinuous-v2#ppo#ppo1.zip
!python sb3_evaluator.py
!rm ./data/policies/LunarLanderContinuous-v2#ppo#ppo1.zip

In [None]:
!python train.py --algo sac --env LunarLanderContinuous-v2

In [None]:
# The SAC model is taken from logs/sac/ (same logs/<algo>/ layout as the DQN and PPO cells), not from the PPO run
!cp logs/sac/LunarLanderContinuous-v2_1/best_model.zip ./data/policies/LunarLanderContinuous-v2#sac#sac1.zip
!python sb3_evaluator.py
!rm ./data/policies/LunarLanderContinuous-v2#sac#sac1.zip

# 3. Optimisation des paramètres

## 3.1 Optimisation avec les scripts sb3

In [None]:
# Hyperparameter search for PPO on LunarLander-v2 through train.py, using the TPE sampler and the median pruner
!python train.py --algo ppo --env LunarLander-v2 -n 100000 -optimize --n-trials 10000 --n-jobs 2 --sampler tpe --pruner median

In [None]:
# Same search settings, for SAC on the continuous-action variant of the environment
!python train.py --algo sac --env LunarLanderContinuous-v2 -n 100000 -optimize --n-trials 10000 --n-jobs 2 --sampler tpe --pruner median

## 3.2 Optimisation LunarLander-v2

In [None]:
# Settings of the Optuna search on the discrete-action environment.
ENV_ID = "LunarLander-v2"

N_TRIALS = 100          # trials requested from the study
N_STARTUP_TRIALS = 5    # passed to both the sampler and the pruner
N_EVALUATIONS = 2       # evaluations per trial
N_TIMESTEPS = 100000    # training budget of one trial
N_EVAL_EPISODES = 3     # episodes per evaluation

# Number of callback steps between two evaluations.
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS

# Constructor arguments shared by every trial.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

In [None]:
def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sample one set of PPO hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial used to draw every value.

    Returns:
        Keyword arguments for the ``PPO`` constructor (everything except
        ``policy`` and ``env``), including the ``policy_kwargs`` sub-dict.
    """
    # Drawn as exponents so that both sizes are powers of two.
    n_steps = 2 ** trial.suggest_int("n_steps", 3, 11)
    batch_size = 2 ** trial.suggest_int("batch_size", 5, 9)
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.83, 0.85, 0.87, 0.9, 0.93, 0.95, 0.98])
    gamma = trial.suggest_float("gamma", 0.9, 0.9999, log=True)
    n_epochs = trial.suggest_int("n_epochs", 4, 20)
    ent_coef = trial.suggest_float("ent_coef", 0, 0.06)
    learning_rate = trial.suggest_float("lr", 1e-7, 0.001, log=True)
    clip_range = trial.suggest_float("clip_range", 0, 1)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5, log=True)
    vf_coef = trial.suggest_float("vf_coef", 0, 1)
    # log_std_init is intentionally not sampled: it was never forwarded to the
    # model, so it only added a dead dimension to the search space.
    which_net_arch = trial.suggest_categorical("net_arch", ["big", "small"])

    # A minibatch cannot hold more samples than one rollout; clamp it so the
    # stored value is the one effectively used.
    # NOTE(review): assumes a single training environment (rollout size ==
    # n_steps), as when PPO receives an environment id - confirm if n_envs changes.
    batch_size = min(batch_size, n_steps)

    # Same layer sizes for the policy (pi) and value (vf) networks.
    width = 256 if which_net_arch == "big" else 64
    net_arch = [dict(pi=[width, width], vf=[width, width])]
    activation_fn = nn.ReLU
    ortho_init = False

    # Display true values (the trial params only hold exponents for the sizes)
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("gae_lambda_", gae_lambda)
    trial.set_user_attr("n_steps", n_steps)
    trial.set_user_attr("batch_size", batch_size)

    return {
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gae_lambda": gae_lambda,
        "gamma": gamma,
        "n_epochs": n_epochs,
        "ent_coef": ent_coef,
        "learning_rate": learning_rate,
        "clip_range": clip_range,
        "max_grad_norm": max_grad_norm,
        "vf_coef": vf_coef,
        "policy_kwargs": {
            "net_arch": net_arch,
            "activation_fn": activation_fn,
            "ortho_init": ortho_init,
        },
    }

In [None]:
class TrialEvalCallback(EvalCallback):
    """EvalCallback that reports each evaluation to an Optuna trial.

    After every evaluation the last mean reward is reported to the trial;
    when the trial asks to be pruned, ``is_pruned`` is set and training is
    stopped by returning ``False`` from ``_on_step``.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        parent_kwargs = dict(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        super().__init__(**parent_kwargs)
        self.trial = trial
        # Number of evaluations reported so far (used as the Optuna step).
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        """Evaluate when due, report to the trial, and stop if pruned."""
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True
        self.is_pruned = True
        return False

In [None]:
def objective(trial: optuna.Trial) -> float:
    """Train one PPO model with sampled hyperparameters and score it.

    Returns:
        The last mean evaluation reward, or NaN when training raised an
        AssertionError.

    Raises:
        optuna.exceptions.TrialPruned: when the callback flagged the trial
            as pruned.
    """
    hyperparams = {**DEFAULT_HYPERPARAMS, **sample_ppo_params(trial)}
    model = PPO(**hyperparams)
    eval_env = gym.make(ENV_ID)
    eval_callback = TrialEvalCallback(
        eval_env,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
    )

    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Treated as a NaN failure: the trial gets a NaN score
        print(e)
        return float("nan")
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()
    return eval_callback.last_mean_reward

In [None]:
# Restrict PyTorch to one thread for the search.
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)

try:
    study.optimize(objective, n_trials=N_TRIALS)
except KeyboardInterrupt:
    # A manual stop still falls through to the report below.
    pass

# Report the best trial found.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
for heading, entries in (("  Params: ", trial.params), ("  User attrs:", trial.user_attrs)):
    print(heading)
    for key, value in entries.items():
        print("    {}: {}".format(key, value))

## 3.3 Optimisation LunarLanderContinuous-v2

In [None]:
# Settings of the Optuna search on the continuous-action environment.
ENV_ID = "LunarLanderContinuous-v2"

N_TRIALS = 100          # trials requested from the study
N_STARTUP_TRIALS = 5    # passed to both the sampler and the pruner
N_EVALUATIONS = 2       # evaluations per trial
N_TIMESTEPS = 50000     # training budget of one trial
N_EVAL_EPISODES = 3     # episodes per evaluation

# Number of callback steps between two evaluations.
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS

# Constructor arguments shared by every trial.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

In [None]:
def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sample one set of SAC hyperparameters from an Optuna trial.

    ``batch_size`` and ``use_sde`` are fixed; every other entry is drawn
    from the trial. ``ent_coef``, ``target_update_interval`` and
    ``target_entropy`` are not part of the returned dict.

    Returns:
        Keyword arguments for the ``SAC`` constructor (everything except
        ``policy`` and ``env``).
    """
    params: Dict[str, Any] = {}

    # Learning rate of the optimizer
    params["learning_rate"] = trial.suggest_float("lr", 1e-5, 1, log=True)
    # Replay buffer size, a power of two between 2**17 and 2**21
    params["buffer_size"] = 2 ** trial.suggest_int("buffer_size", 17, 21)
    # Steps collected before learning starts, in multiples of 100
    params["learning_starts"] = 100 * trial.suggest_int("learning_starts", 1, 50)
    # Minibatch size of each gradient update (fixed)
    params["batch_size"] = 256
    # Soft update coefficient
    params["tau"] = trial.suggest_float("tau", 0.0001, 0.2, log=True)
    # Discount factor
    params["gamma"] = trial.suggest_float("gamma", 0.9, 0.9999, log=True)
    # Update the model every train_freq steps
    params["train_freq"] = trial.suggest_int("train_freq", 1, 10)
    # Gradient steps performed after each rollout
    params["gradient_steps"] = trial.suggest_int("gradient_steps", 1, 10)
    # gSDE exploration is always enabled during the search
    params["use_sde"] = True
    # Sample a new noise matrix every n steps when using gSDE
    params["sde_sample_freq"] = trial.suggest_int("sde_sample_freq", 1, 10)
    # Whether gSDE is also used during the warm-up phase
    params["use_sde_at_warmup"] = trial.suggest_categorical("use_sde_at_warmup", [True, False])

    # Display true values
    trial.set_user_attr("gamma_", params["gamma"])

    return params

In [None]:
class TrialEvalCallback(EvalCallback):
    """Evaluation callback wired to an Optuna trial.

    Each evaluation's mean reward is reported to the trial under an
    increasing evaluation index. If the trial should be pruned, the
    callback records it in ``is_pruned`` and stops training.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            verbose=verbose,
            deterministic=deterministic,
            eval_freq=eval_freq,
            n_eval_episodes=n_eval_episodes,
        )
        self.is_pruned = False
        self.eval_idx = 0
        self.trial = trial

    def _on_step(self) -> bool:
        """Run the periodic evaluation; return False to stop a pruned trial."""
        if self.eval_freq <= 0 or self.n_calls % self.eval_freq != 0:
            # Not an evaluation step: keep training.
            return True

        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        # Prune trial if needed
        if self.trial.should_prune():
            self.is_pruned = True
            return False
        return True

In [None]:
def objective(trial: optuna.Trial) -> float:
    """Train one SAC model with sampled hyperparameters and score it.

    Returns:
        The last mean evaluation reward, or NaN when training raised an
        AssertionError.

    Raises:
        optuna.exceptions.TrialPruned: when the callback flagged the trial
            as pruned.
    """
    hyperparams = {**DEFAULT_HYPERPARAMS, **sample_sac_params(trial)}
    model = SAC(**hyperparams)
    eval_env = gym.make(ENV_ID)
    eval_callback = TrialEvalCallback(
        eval_env,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
    )

    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Treated as a NaN failure: the trial gets a NaN score
        print(e)
        return float("nan")
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()
    return eval_callback.last_mean_reward

In [None]:
# Restrict PyTorch to one thread for the search.
torch.set_num_threads(1)

sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)
# Higher mean reward is better.
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)

In [None]:
try:
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)
except KeyboardInterrupt:
    # A manual stop still falls through to the report below.
    pass

# Report the best trial found.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
for heading, entries in (("  Params: ", trial.params), ("  User attrs:", trial.user_attrs)):
    print(heading)
    for key, value in entries.items():
        print("    {}: {}".format(key, value))

# 4. Modification dynamique des hyperparamètres

## 4.1 Learning rate pour PPO

In [None]:
def linear_schedule(initial_value):
    """Build a schedule that scales ``initial_value`` by the given progress.

    Args:
        initial_value: Starting value; a string is converted to float.

    Returns:
        A function mapping ``progress`` to ``progress * initial_value``.
    """
    start = float(initial_value) if isinstance(initial_value, str) else initial_value

    def func(progress):
        return progress * start

    return func

def lrsched():
    """Build a step-wise learning-rate schedule.

    The returned function maps ``progress`` to a learning rate that drops
    each time ``progress`` falls below 0.8, 0.6, 0.4 and 0.2.
    """
    # (exclusive upper bound on progress, learning rate), smallest bound first
    tiers = (
        (0.2, 0.00005),
        (0.4, 0.0001),
        (0.6, 0.0002),
        (0.8, 0.0003),
    )

    def reallr(progress):
        for bound, rate in tiers:
            if progress < bound:
                return rate
        # progress at or above the highest bound
        return 0.0004

    return reallr

In [None]:
# Create the environment
# Training and evaluation environments
env_id = "LunarLander-v2"
n_envs = 32
env = make_vec_env(env_id, n_envs=n_envs)
eval_envs = make_vec_env(env_id, n_envs=4)

# Evaluation period, divided by the number of training environments
# NOTE(review): presumably because the callback counts vectorized steps - confirm against the EvalCallback docs
eval_freq = max(int(1e5) // n_envs, 1)

# The best model is written to the current directory
eval_callback = EvalCallback(
    eval_envs,
    n_eval_episodes=10,
    eval_freq=eval_freq,
    best_model_save_path="./",
)

model_ppo = PPO(
    "MlpPolicy",
    env,
    learning_rate=lrsched(),
    n_steps=2048,
    batch_size=256,
    n_epochs=8,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    # Four 64-unit layers for the policy, one 64-unit layer for the value function
    policy_kwargs={"net_arch": [{"pi": [64] * 4, "vf": [64]}]},
    verbose=1,
)

In [None]:
try:
    # Train for up to 5,000,000 timesteps with eval_callback attached
    model_ppo.learn(total_timesteps=5000000, callback=eval_callback)
except KeyboardInterrupt:
    # Allow the run to be stopped by hand without an error
    pass

In [None]:
# Move the saved best model into data/policies (file name appears to follow env#algo#name - confirm against sb3_evaluator.py)
!mv best_model.zip ./data/policies/LunarLander-v2#ppo#Corcoral_Kostadinovic.zip

In [None]:
# Run the evaluator script
!python sb3_evaluator.py

In [None]:
# Archive the evaluated policy in rl-trained-agents
!mv ./data/policies/LunarLander-v2#ppo#Corcoral_Kostadinovic.zip ./rl-trained-agents

## 4.2 Learning rate pour SAC

In [None]:
def linear_schedule(initial_value):
    """Return a schedule equal to ``progress * initial_value``.

    Args:
        initial_value: Starting value; a string is converted to float.

    Returns:
        A one-argument function of ``progress``.
    """
    base = float(initial_value) if isinstance(initial_value, str) else initial_value

    def func(progress):
        return progress * base

    return func

def lrsched():
    """Build a step-wise learning-rate schedule.

    The returned function maps ``progress`` to a learning rate that drops
    each time ``progress`` falls below 0.8, 0.6, 0.4 and 0.2.
    """
    def reallr(progress):
        # Check the lowest band first so each test can return directly.
        if progress < 0.2:
            return 0.0001
        if progress < 0.4:
            return 0.0005
        if progress < 0.6:
            return 0.001
        if progress < 0.8:
            return 0.003
        return 0.005

    return reallr

In [None]:
# Create the environment
# Training and evaluation environments
env_id = "LunarLanderContinuous-v2"
n_envs = 32
env = make_vec_env(env_id, n_envs=n_envs)
eval_envs = make_vec_env(env_id, n_envs=4)

# Evaluation period, divided by the number of training environments
# NOTE(review): presumably because the callback counts vectorized steps - confirm against the EvalCallback docs
eval_freq = max(int(1e5) // n_envs, 1)

# The best model is written to the current directory
eval_callback = EvalCallback(
    eval_envs,
    n_eval_episodes=10,
    eval_freq=eval_freq,
    best_model_save_path="./",
)

model_sac = SAC(
    "MlpPolicy",
    env,
    learning_rate=lrsched(),
    buffer_size=262144,
    learning_starts=2800,
    batch_size=256,
    tau=0.0039251709137456195,
    gamma=0.9877986493994404,
    train_freq=9,
    gradient_steps=8,
    ent_coef='auto',
    use_sde=False,
    sde_sample_freq=5,
    use_sde_at_warmup=False,
    policy_kwargs={"net_arch": [400, 300]},
    verbose=1,
)

In [None]:
try:
    # Train for up to 5,000,000 timesteps with eval_callback attached
    model_sac.learn(total_timesteps=5000000, callback=eval_callback)
except KeyboardInterrupt:
    # Allow the run to be stopped by hand without an error
    pass

In [None]:
# The model was trained with SAC, so it is tagged #sac# (same naming as the earlier sac1 policy file)
!mv best_model.zip ./data/policies/LunarLanderContinuous-v2#sac#Corcoral_Kostadinovic.zip

In [None]:
!python sb3_evaluator.py

In [None]:
# Move the SAC policy evaluated above; the LunarLander-v2 PPO file was already moved in section 4.1
!mv ./data/policies/LunarLanderContinuous-v2#sac#Corcoral_Kostadinovic.zip ./rl-trained-agents