# **PPO Notebook (StableBaselines3)** #

## (B) Setups ##

### (B1) Imports ###

In [None]:
import gymnasium as gym

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import (
    notebook_login,
)

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

## (C) Implementation ##

In [None]:
env_id = "LunarLander-v2"

# Vectorized training environment made of 16 copies of `env_id`.
#
# NOTE: passing env_kwargs={"render_mode": "human"} here makes the render window
# flicker constantly, presumably because every one of the 16 sub-environments
# renders. The official examples at
# https://stable-baselines3.readthedocs.io/en/master/guide/examples.html
# only render ("enjoy") the agent AFTER training.
env = make_vec_env(env_id, n_envs=16)

In [None]:
# Name of the trained model; passed as `model_name` to package_to_hub() when pushing.
model_name = "ppo-LunarLander-v2"

In [None]:
# My CleanRL hyperparameters -- kept for reference only, intentionally disabled.
# They perform worse than the course's hyperparameters: lower mean reward and
# slower training (episodes per unit of time).
#
# model = PPO(
#     policy="MlpPolicy",
#     env=env,
#     learning_rate=0.0003,
#     n_steps=512,
#     batch_size=256,
#     n_epochs=8,
#     gamma=0.995,
#     gae_lambda=0.95,
#     clip_range=0.2,
#     clip_range_vf=0.2,
#     ent_coef=0.01,
#     vf_coef=0.5,
#     max_grad_norm=0.5,
#     verbose=1,
#     tensorboard_log="./runs/",
# )


In [None]:
# Course hyperparameters (the configuration actually used for training).
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
    tensorboard_log="./runs/",
)

# The author of the course reports that he "got a mean reward of 200.20 +/- 20.80
# after training for 1 million steps" -- which would mean he failed to pass?!
# (Presumably because mean - std = 179.40 falls below the pass mark; TODO confirm
# the course's exact threshold.)
# See the evaluation section for the result obtained in this notebook.

In [None]:
# Train the agent for one million timesteps.
model.learn(total_timesteps=1_000_000)

## (D) Evaluate ##

In [None]:
# Single monitored environment used only for evaluation.
# NOTE: render_mode="human" slows evaluation down -- it only finishes once every
# episode has been rendered.
eval_env = Monitor(gym.make(env_id, render_mode="human"))

try:
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=10, deterministic=True
    )
finally:
    # Always close the env, even if evaluation fails or is interrupted; otherwise
    # the render window does not close correctly and force-closing it raises an error.
    eval_env.close()

print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

# Observations
# ------------
# TensorBoard shows ep_rew_mean ~ 150 at the end of training, whereas
# evaluate_policy() gives mean_reward=252.16 +/- 24.04.
# First guess: TensorBoard does not include the final reward (+100 for a
# successful landing). That is NOT the reason -- ep_len_mean hovered just below
# 1000 during training, i.e. episodes ran until (almost) truncation.
#
# What does deterministic=True do? In the source, evaluate_policy() passes it on
# to model.predict(deterministic=True).
# With deterministic=False the rendering shows the agent firing its engines at
# random after landing and continuing to do so until truncation; with
# deterministic=True this behaviour is not seen.
# Conclusion: deterministic=True makes the policy pick the argmax action instead
# of sampling from the softmax distribution.
# NOTE(review): this would presumably also explain the gap above, if the training
# rollouts logged to TensorBoard use sampled actions -- TODO confirm.
# https://stable-baselines3.readthedocs.io/en/master/common/evaluation.html#module-stable_baselines3.common.evaluation



## (E) Save / Load ##

In [None]:
# Save the trained model; the path is "ppo" immediately followed by the env id
# (no hyphen, unlike model_name).
model.save(f"ppo{env_id}")


In [None]:
# Reload the model from the path "ppo" immediately followed by the env id
# (no hyphen, unlike model_name).
model = PPO.load(f"ppo{env_id}")

## (F) Push ##

In [None]:
# Settings for publishing the trained agent to the Hugging Face Hub.
# repo_id has the form {organization}/{repo_name},
# for instance ThomasSimonini/ppo-LunarLander-v2.
repo_id = "Rudolph314/sb3ppo-LunarLander-v2"
model_architecture = "PPO"
commit_message = "Upload PPO LunarLander-v2 trained agent"


def _make_eval_env():
    """Create one monitored copy of the environment with render_mode="rgb_array"."""
    return Monitor(gym.make(env_id, render_mode="rgb_array"))


# Evaluation environment handed to package_to_hub, wrapped as a one-env vec env.
eval_env = DummyVecEnv([_make_eval_env])

# NOTE(review): pushing presumably requires being logged in to the Hub
# (e.g. via notebook_login()) -- confirm before running this cell.
package_to_hub(
    repo_id=repo_id,  # target model repository on the Hub
    commit_message=commit_message,
    model=model,  # our trained model
    model_name=model_name,  # the name of our trained model
    model_architecture=model_architecture,  # the architecture we used: PPO
    env_id=env_id,  # name of the environment
    eval_env=eval_env,  # evaluation environment
)