# **PPO Notebook (CleanRL)** #

This Notebook is a combination of CleanRL's [blog post](https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/) and its accompanying video tutorial, part 1. 

I have taken out some parts for separate treatment/introduction. 

*renamed variables*
+ the plural / singular
+ `exp_name` -> `algo`
+ `num_steps` -> `train_interval`
+ `done` -> `rstt` 
    + the difference being that if t=8 is done, then t=9 is restarting (rstt) and s_9 is an initial state
+ `b_inds` -> `b_idxs`
+ `v_loss_unclipped` -> `vunclipped_loss`, `v_loss_clipped` -> `vclipped_loss`

---
## (A/B) Set Up ##

### (B) Imports ###

In [1]:
# misc
import random
import numpy as np
import torch
import time
import datetime
import os
# (B2) argparse
import os
# (B3,4) tensorboard & wandb
from torch.utils.tensorboard import SummaryWriter
# (C) Implementation
# (1) vector environment
import gymnasium as gym
# (2) agent layer initialization
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

### (B2) args ###

In [2]:
class Args(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Recipe taken from
    https://stackoverflow.com/questions/4984647/accessing-dict-keys-like-an-attribute?page=1&tab=scoredesc#tab-top
    """

    def __init__(self, *positional, **named):
        dict.__init__(self, *positional, **named)
        # Make the instance namespace *be* the mapping, so `a.key` and
        # `a["key"]` always refer to the same storage (and vars(a) is a).
        self.__dict__ = self
# Hyperparameters. Stable-Baselines3's PPO page is a useful reference for names
# and defaults, although not every option listed there is implemented here:
# https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html
args = Args({
    "algo": "ppo",
    "env_id": "LunarLander-v2",
    # CleanRL's 2.5e-4 was too small when I tried.
    # NOTE(review): the original remark said "StableBaselines3 uses 3e-3", but
    # SB3's documented default appears to be 3e-4 -- confirm 3e-3 is intended.
    "learning_rate": 3e-3,
    "learning_rate_final": 1e-4,
    "seed": 2,
    "total_timesteps": 500_000,  # Unit 1 trains for 1e6 total timesteps, might have overfitted
    "capture_video": True,
    # counted on the 0-th env only, so in global episodes this is roughly x num_envs
    "video_trigger_episode_n": 20,
    # algorithm-specific
    "num_envs": 16,  # 4
    "train_interval": 512,  # should be comparable to episodic length (?)
    "anneal_lr": True,
    "gamma": 0.995,
    "lambda_gae": 0.95,  # GAE lambda, controls propagation like TD(lambda)
    "clip_coef": 0.2,  # clips both the policy loss and the value loss (detail#8, detail#9)
    "clip_vloss": False,
    "ent_coef": 0.01,  # entropy weight in the loss, relative to policy loss = 1 (detail#10)
    "ent_coef_final": 0.0,
    "vf_coef": 0.5,  # value-loss weight, same convention
    "max_grad_norm": 0.5,
    "update_epochs": 8,  # passes over the rollout memory at each iteration
    "norm_adv": True,  # normalize GAE advantages

    "torch_deterministic": True,
    "cuda": True,

    # the 0 entries are placeholders, filled in right below
    "batch_size": 0,
    "num_minibatches": 32,
    "minibatch_size": 0,
    "num_iterations": 0,
})

# Derived sizes: one batch is a full rollout over all envs; it is split into
# num_minibatches chunks, and training runs for as many whole batches as fit
# into total_timesteps.
args.batch_size = args.num_envs * args.train_interval
args.minibatch_size = args.batch_size // args.num_minibatches
args.num_iterations = args.total_timesteps // args.batch_size


In [None]:
# Timestamp (month-day_hour-minute) that makes the run name unique per launch.
start_datetime = format(datetime.datetime.now(), "%m%d_%H%M")

# Identifies this run in the log/video/model paths built further down.
run_name = f"{args.env_id}__{args.algo}__{args.seed}__{start_datetime}"

print(f"start_datetime = {start_datetime}")

### (B3) Hardware ###

In [None]:
# Use the GPU only when it is both requested (args.cuda) and actually available.
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

# torch.cuda.get_device_name() only accepts CUDA devices, so on a CPU-only
# run report the device itself instead of crashing.
if device.type == "cuda":
    print(f"device_name = {torch.cuda.get_device_name(device)}")
else:
    print(f"device_name = {device}")

### (B4) Tensorboard ###

In [None]:
# TensorBoard writer for this run; logs go to runs/<run_name>.
writer = SummaryWriter(f"runs/{run_name}")

# Record every hyperparameter as a two-column Markdown table (one row per key).
hyperparameter_rows = "\n".join(f"|{key}|{val}" for key, val in args.items())
writer.add_text("hyperparameters", "|param|value|\n|-|-|\n%s" % hyperparameter_rows)

### (B5) seeding ###

In [3]:
# Seed Python's, NumPy's and PyTorch's RNGs (in that order) with the same seed.
# Background: https://pytorch.org/docs/stable/notes/randomness.html
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(args["seed"])
torch.backends.cudnn.deterministic = args["torch_deterministic"]
# Observation: across runs, episodic rewards on TensorBoard vary slightly while
# training is in progress but look identical once training has finished.
# Smoothing was my first guess, but on closer inspection most points are
# identical and a few simply are not -- still unexplained.

## (C) Implementation ##

### (1) vector environment ###

In [5]:
# detail#1
def make_env(env_id, idx, capture_video, run_name, episode_trigger_n=args.video_trigger_episode_n):
    """Return a zero-argument factory ("thunk") that builds one sub-environment.

    :param env_id: id passed to ``gym.make``
    :param idx: index of this sub-environment inside the vector environment
    :param capture_video: when true, the sub-environment with ``idx == 0`` is
        wrapped in ``RecordVideo`` and writes to ``videos/<run_name>``
    :param run_name: name of the video sub-folder
    :param episode_trigger_n: currently unused -- the matching ``episode_trigger``
        is disabled below, so RecordVideo's default schedule applies
    :return: callable that creates the environment, wrapped in
        ``RecordEpisodeStatistics``
    """
    def thunk():
        if capture_video and idx == 0:
            # RecordVideo needs render() to hand back frames, which the "human"
            # mode does not do; "rgb_array" matches the evaluation env below.
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(
                env,
                f"videos/{run_name}",
                #episode_trigger=lambda n: n%episode_trigger_n == 0,
            )
        else:
            env = gym.make(env_id)
        # Adds the "episode" entry (return "r", length "l") that the training
        # loop reads from the final info of each finished episode.
        env = gym.wrappers.RecordEpisodeStatistics(env)
        return env
    return thunk

In [6]:
# One factory per sub-environment; SyncVectorEnv takes the list of factories.
env_fns = [
    make_env(args.env_id, env_idx, args.capture_video, run_name)
    for env_idx in range(args.num_envs)
]
envs = gym.vector.SyncVectorEnv(env_fns)

#### (1a) test_vec_env ####

In [7]:
def test_vec_env(num_envs=4, capture_video=True):
    """Smoke-test the vector-env plumbing with random actions on CartPole-v1.

    Runs 500 vectorized steps and prints the episodic return each time a
    sub-environment reports a finished episode.

    :param num_envs: number of sub-environments to run side by side
    :param capture_video: forwarded to ``make_env``
    :return: the (already closed) vector environment
    """
    env_fns = [
        make_env("CartPole-v1", env_idx, capture_video, "test_vec_env")
        for env_idx in range(num_envs)
    ]
    envs = gym.vector.SyncVectorEnv(env_fns)

    global_step = 0
    global_episode = 0

    envs.reset()
    for _ in range(500):
        global_step += num_envs

        actions = envs.action_space.sample()
        _, _, _, _, infos = envs.step(actions)

        if "final_info" not in infos:
            continue
        # 'final_info' is an array with one slot per sub-environment
        for idx, info in enumerate(infos["final_info"]):
            # skip empty slots, and slots without the "episode" entry that
            # wrappers.RecordEpisodeStatistics provides
            if not info or "episode" not in info:
                continue
            print(f"global_episode={global_episode}, env_idx={idx}")
            print(f"global_step={global_step}, \nepisodic_return={info['episode']['r']}")
            global_episode += 1
    envs.close()

    return envs

In [8]:
#test_vec_env(args.num_envs)

### (2) agent -- initialization, layers ###

In [9]:
# detail#2 -- the initializers used here are documented at
# https://pytorch.org/docs/stable/nn.init.html
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize ``layer`` in place and return it.

    :param layer: module exposing ``weight`` and ``bias`` (e.g. ``nn.Linear``)
    :param std: gain passed to the orthogonal weight initializer
    :param bias_const: constant every bias entry is set to
    :return: the same ``layer`` object, so calls can be nested in ``nn.Sequential``
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer

class Agent(nn.Module):
    """Actor-critic with two separate tanh MLPs (two hidden layers of 64 units).

    ``critic`` maps an observation to one scalar value; ``actor`` maps it to
    one unnormalized logit per discrete action.
    """

    def __init__(self, envs):
        """Size both networks from the vector env's single-env spaces."""
        super().__init__()

        hidden_size = 64
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        num_actions = envs.single_action_space.n  # gym.spaces.Discrete

        # The critic is built first and the actor second, layer by layer, so
        # the random initializers are consumed in the same order as before.
        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden_size)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_size, 1), std=1.0),  # std=1.0 draws on domain knowledge
        )
        # Outputs raw logits (neither normalized nor exponentiated); the
        # Categorical distribution in get_action_and_value normalizes them.
        self.actor = nn.Sequential(
            layer_init(nn.Linear(obs_dim, hidden_size)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_size, num_actions), std=0.01),
        )

    def get_value(self, obs):
        """Return the critic's output for ``obs``."""
        return self.critic(obs)

    # good practice to bundle actor inference with critic inference
    def get_action_and_value(self, obs, action=None):
        """Run actor and critic on ``obs``.

        :param obs: batch of observations
        :param action: actions to score; sampled from the policy when ``None``
        :return: ``(action, log-prob of action, policy entropy, critic output)``
        """
        dist = Categorical(logits=self.actor(obs))
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        return action, log_prob, dist.entropy(), self.critic(obs)


In [10]:
# Build the actor-critic from the vector env's spaces and move it to `device`.
# (.to(device) relocates the module's parameters and buffers; it places no
# requirement on what the module's methods return.)
agent = Agent(envs).to(device)

### (3) Adam -- epsilon ###

In [11]:
# detail#3: Adam with eps set explicitly to 1e-5 rather than left at the
# optimizer's default. The learning rate set here is only the starting value;
# the training loop overwrites it every iteration when args.anneal_lr is on.
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

### (4) Adam -- anneal learning rate ### 

In [12]:
# Show the optimizer's only parameter group. Its "lr" entry is what the
# training loop overwrites to anneal the learning rate.
print(optimizer.param_groups[0])

# How the three optimizer views relate (per the torch.optim docs):
#   optim.param_groups -- list of dicts holding the parameters plus their hyperparameters (lr, eps, ...)
#   optim.state        -- per-parameter running buffers kept by the optimizer
#   optim.state_dict() -- serializable snapshot containing both of the above

### ALGO ###

#### ALGO - init ####

In [13]:
# ALGO: PPO's storage
# PPO resembles DQN in that it rolls out for some timesteps and then performs
# batch updates on the experience collected, so it needs somewhere to keep it.

# Every buffer adds a leading train_interval dimension on top of the
# per-step (num_envs, ...) arrays that step() returns.
class Memory:
    """Rollout buffers of shape (train_interval, num_envs, ...) living on `device`."""

    def __init__(self):
        steps_by_envs = (args.train_interval, args.num_envs)
        action_shape = envs.single_action_space.shape

        def zeros(trailing_shape=()):
            return torch.zeros(steps_by_envs + trailing_shape, device=device)

        # all of these are overwritten in place on every rollout
        self.obs = zeros(envs.single_observation_space.shape)
        self.actions = zeros(action_shape)
        # CleanRL leaves the action shape off here; keeping it makes logprobs
        # share the shape of self.actions.
        self.logprobs = zeros(action_shape)
        self.rewards = zeros()
        self.rstts = zeros()
        self.values = zeros()

# ALGO: iteration init
memory = Memory()
global_step = 0
start_time = time.time()

# Seeded reset gives the first observations; `next_rstt` marks which envs are
# at the start of a fresh episode and is initialised to all zeros.
initial_obs, _ = envs.reset(seed=args.seed)
next_obs = torch.Tensor(initial_obs).to(device)
next_rstt = torch.zeros(args.num_envs, device=device)

import datetime


#### ALGO - learn the types and dimensions ####

In [14]:
'''
observation_space and action_space inherits from gymnasium.spaces:
https://gymnasium.farama.org/api/spaces/#fundamental-spaces
isinstance(envs.observation_space, gym.spaces.Space) # True
isinstance(envs.observation_space, gym.spaces.Box) # True, 'box' in the sense that it has a upper- and lower- bound in every dimension of the R^n it resides in
isinstance(envs.observation_space, np.ndarray) # False
isinstance(envs.action_space, gym.spaces.Space) # True
isinstance(envs.action_space, gym.spaces.MultiDiscrete) # True, and is not gym.spaces.Discrete
isinstance(envs.single_action_space, gym.spaces.Discrete)
'''

'''
print("-- Observations --")
# a box has .low, .high, .shape, .sample(), ...
envs.single_observation_space.shape                 # (4, )
envs.observation_space.shape                        # (4, 4)
next_obs                                            # a size([4,4]) tensor

print("\n-- Actions --")
# a discrete has n, start, sample(), ...
envs.single_action_space                            # Discrete(2), this means the possible actions is the set {0, 1} = {start, start+1, ..., start+n-1}
envs.single_action_space.n                          # 2
envs.single_action_space.start                      # 0, default
envs.action_space                                   # MultiDiscrete([2 2 2 2])
envs.action_space.nvec                              # array([2, 2, 2, 2], dtype=int64), IS a np.ndarray
## 
_action, _log_probs, _entropys, _values = agent.get_action_and_value(next_obs)
_action                                             # a size([4]) tensor
_action.device                                      # device(type='cpu') ???
_log_probs                                          # a size([4]) tensor
_entropys                                           # a size([4]) tensor
_values                                             # a size([4,1]) tensor

print("\n-- Rewards --")
next_obs, rewards, terminations, truncations, infos, = envs.step(action.cpu().numpy())
rewards                                             # a size([4]) array

print("\n-- Values --")
agent.get_value(next_obs)                           # a size([4,1]) tensor

# some observations
## 1. tensors keep their num_envs(i.e. -2nd) dimension during rollouts, it is flattened only for the iterations/updates

'''


#### Tensorboard and wandb setup ####

#### ALGO - loop ####

In [16]:
# Indexing convention (s_t, a_t, r_t): row `t` of the memory holds the state
# observed at step t, the action taken in it, that action's log-prob, the
# reward received for it and the critic's value of the state. memory.rstts[t]
# is 1 when the state at step t is the first state of a fresh episode.

# loop till ~total_timesteps
for updates_n in range(1, args.num_iterations+1):
    # Entropy weight used in this iteration's loss; constant unless annealed below.
    ent_coef = args.ent_coef
    if args.anneal_lr:
        # (4) linear anneal; detail#4. frac starts at 1 and shrinks by
        # 1/num_iterations per iteration, interpolating from the initial
        # value towards the *_final value.
        frac = 1.0 - (updates_n - 1.0) / args.num_iterations
        lrnow = frac * args.learning_rate + (1-frac) * args.learning_rate_final
        optimizer.param_groups[0]["lr"] = lrnow
        # (my) anneal the entropy coefficient on the same schedule; this value
        # (not the constant args.ent_coef) is what the loss below uses
        ent_coef = frac * args.ent_coef + (1-frac) * args.ent_coef_final

    # (A) rollout session of length train_interval
    for step in range(0, args.train_interval):
        # (A1) next becomes now
        global_step += args.num_envs
        memory.obs[step] = next_obs
        memory.rstts[step] = next_rstt

        # (A2) actor-critic inference; no gradients are needed while collecting
        # experience, the gradient steps happen in (C) after the rollout
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            memory.values[step] = value.flatten()
        memory.actions[step] = action
        memory.logprobs[step] = logprob

        # (A3) step
        next_obs, rewards, terminations, truncations, infos = envs.step(action.cpu().numpy())
        memory.rewards[step] = torch.tensor(rewards).to(device).view(-1)
        next_obs = torch.Tensor(next_obs).to(device)
        # terminated and truncated episodes are treated alike: both restart
        next_rstt = torch.Tensor(np.logical_or(terminations, truncations)).to(device)

        # print training progress
        if "final_info" in infos:
            for info in infos["final_info"]:
                # empty slot (env did not finish) or no episode statistics
                if info is None or "episode" not in info:
                    continue
                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                break # even if two envs are done at the same step, showing one is enough

    # after each train_interval
    # (B) advantages and returns for the whole rollout
    with torch.no_grad():
        # Backward TD(lambda)-style propagation of the advantage (GAE).
        # Open question from the author: how can advantages be compared across
        # states -- a -100 next-state looks good because a -10 action was
        # picked, a +100 state is shunned because a +10 action was picked;
        # is it because the policy converges to greedy?
        next_value = agent.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(memory.rewards).to(device)
        # GAE advantage of step t+1, carried backwards through the loop
        lastgaelam = 0
        for t in reversed(range(args.train_interval)):
            # (1) cut the bootstrap and the trace if step t+1 starts a new episode
            if t == args.train_interval - 1:
                # the step after the rollout: use the freshest obs / restart flags
                nextvalue = next_value
                nextnonreset = 1.0 - next_rstt
            else:
                nextvalue = memory.values[t+1]
                nextnonreset = 1.0 - memory.rstts[t+1]
            # (2) one-step TD error: delta_t = r_t + gamma * v_{t+1} - v_t
            delta = memory.rewards[t] + nextnonreset * args.gamma * nextvalue - memory.values[t]
            # A_t = delta_t + gamma * lambda * A_{t+1}
            advantages[t] = lastgaelam = delta + \
                    nextnonreset * args.gamma * args.lambda_gae * lastgaelam
    # value targets for the critic
    returns = advantages + memory.values

    # (C) actor-critic updates
    # (C1) merge the (train_interval, num_envs) dimensions into one batch dimension ('b' for batch)
    b_obs = memory.obs.reshape((-1,) + envs.single_observation_space.shape)
    b_actions = memory.actions.reshape((-1,) + envs.single_action_space.shape)
    b_logprobs = memory.logprobs.reshape((-1,) + envs.single_action_space.shape) # same shape as b_actions
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = memory.values.reshape(-1)

    # (C2) minibatch updates; detail#6
    b_idxs = np.arange(args.batch_size)
    clipfracs = []
    for epoch in range(args.update_epochs):
        np.random.shuffle(b_idxs)
        for start_idx in range(0, args.batch_size, args.minibatch_size):
            # ( ) prob ratio
            mb_idxs = b_idxs[start_idx:start_idx+args.minibatch_size] # indices of memory used in this minibatch

            # actions were stored in a float buffer; .long() turns them back
            # into integer action indices before they are scored
            _, newlogprob, entropy, newvalue = agent.get_action_and_value(
                b_obs[mb_idxs], b_actions.long()[mb_idxs]
            )
            # For the very first minibatch newlogprob equals the stored logprob;
            # every optimizer.step() then moves the policy away from memory.logprobs.
            logratio = newlogprob - b_logprobs[mb_idxs]
            ratio = logratio.exp() # prob ratio new / old

            # debug variables; detail#12
            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                # fraction of the minibatch whose ratio falls outside the clip
                # range; collected per minibatch and averaged for the plot
                clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]


            # ( ) advantage normalization, detail#7, minibatch-level
            mb_advantages = b_advantages[mb_idxs]
            if args.norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # ( ) clip policy loss, detail#8
            # pg_loss1 is the surrogate advantage used by TRPO; TRPO uses a
            # KL-divergence constraint, PPO uses clipping instead (detail#9)
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1-args.clip_coef, 1+args.clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # ( ) clip value loss, detail#9
            newvalue = newvalue.view(-1)
            if args.clip_vloss:
                # newvalue comes from agent.critic, b_returns from the rollout's GAE
                vunclipped_loss = (newvalue - b_returns[mb_idxs]) ** 2
                # keep newvalue within clip_coef of the value stored during the rollout
                newvalue_clipped = b_values[mb_idxs] + torch.clamp(
                    newvalue - b_values[mb_idxs],
                    -args.clip_coef,
                    args.clip_coef,
                )
                vclipped_loss = (newvalue_clipped - b_returns[mb_idxs]) ** 2
                v_loss_max = torch.max(vunclipped_loss, vclipped_loss)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_idxs]) ** 2).mean()

            # ( ) entropy loss, detail#10
            # minimize policy loss and value loss (improve actor and critic)
            # but maximize entropy (i.e. keep pi(a) spread over a to encourage exploration)
            entropy_loss = entropy.mean()
            loss = pg_loss - ent_coef * entropy_loss + v_loss * args.vf_coef

            # ( ) backpropagation
            optimizer.zero_grad()
            loss.backward()
            # global gradient clipping, detail#11: all gradients are rescaled
            # by one common factor so that their overall norm is at most
            # max_grad_norm (the 2nd argument)
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()

    # debug variable; detail#12
    y_pred = b_values.cpu().numpy() # critic's predictions recorded during the rollout
    y_true = b_returns.cpu().numpy() # GAE-based returns computed from the same rollout
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    # progress plots; the loss/KL values are those of the last minibatch of this iteration
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step) # half the mean squared error
    writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) # ~ how much the new policy fares better than the previous one
    writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
    writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) # share of samples whose policy step was clipped
    writer.add_scalar("losses/explained_variance", explained_var, global_step) # ~ how well critic is doing

# Report how far and how long training ran, store the duration in
# TensorBoard, then release the environments and flush the writer.
experiment_duration = time.time() - start_time
duration_text = f"{experiment_duration:.2f}"

print(f"global_step = {global_step}")
print(f"experiment_duration = {experiment_duration:.2f} seconds")

writer.add_text("experiment_duration", duration_text, global_step)
envs.close()
writer.close()



## (D) Evaluation ##

In [17]:
from huggingface_hub import HfApi, upload_folder
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import tempfile
import json
import shutil
import imageio

from wasabi import Printer

# Console printer (wasabi) used for the status messages in package_to_hub().
msg = Printer()

In [None]:
"""
we evaluate the agent first because 
(a) The course's record video function often crashes
(b) RecordVideo does not allow fully custom video naming
(c) Technically you should evaluate, push or not
"""

def evaluate_agent(env, n_eval_episodes, policy, hyperparameters=args):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the mean and
    standard deviation of the episodic reward, plus a metadata dict.

    Actions are sampled from the policy (``get_action_and_value`` without an
    explicit action); each episode runs until the environment reports
    ``terminated`` or ``truncated``. The module-level ``device`` is used to
    place the observations.

    :param env: The evaluation environment (single, non-vectorized)
    :param n_eval_episodes: Number of episode to evaluate the agent
    :param policy: The agent
    :param hyperparameters: source of the ``env_id`` written to the metadata
    :return: ``(mean_reward, std_reward, evaluate_data)`` where
        ``evaluate_data`` holds env id, both statistics, the episode count and
        an ISO-formatted evaluation timestamp
    """
    # (1) evaluate
    episode_rewards = []
    for episode in range(n_eval_episodes):
        state, _ = env.reset()
        total_rewards_ep = 0

        while True:
            obs_tensor = torch.Tensor(state).to(device)
            # pure inference: do not build an autograd graph for every step
            with torch.no_grad():
                action, _, _, _ = policy.get_action_and_value(obs_tensor)
            state, reward, terminated, truncated, _ = env.step(action.cpu().numpy())
            total_rewards_ep += reward
            if terminated or truncated:
                break
        episode_rewards.append(total_rewards_ep)
        print(f"episode {episode:2}: reward={total_rewards_ep:5.1f}")
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    # (2) metadata
    eval_datetime = datetime.datetime.now()
    eval_form_datetime = eval_datetime.isoformat()
    evaluate_data = {
        "env_id": hyperparameters.env_id,
        "mean_reward": mean_reward,
        "std_reward": std_reward,
        "n_evaluation_episodes": n_eval_episodes,
        "eval_datetime": eval_form_datetime,
    }

    return mean_reward, std_reward, evaluate_data

# Evaluation env that records a video into "upload/", starting at step 0.
# video_length=1000 frames came out as about 20 s of video (so not 30 FPS?).
eval_env = gym.make(args.env_id, render_mode="rgb_array")
eval_env = gym.wrappers.RecordVideo(eval_env, video_folder="upload", step_trigger=lambda x: x==0, video_length=1000)

eval_mean_reward, eval_std_reward, evaluate_data = evaluate_agent(eval_env, 10, agent)
# Close the env so that a recording still in progress is finalized on disk
# before package_to_hub() looks for the video file.
eval_env.close()
print(f"mean_reward={eval_mean_reward:.2f}")
print(f"std_reward={eval_std_reward:.2f}")


## (E) Push ##

In [None]:
def package_to_hub(
    repo_id,
    model,
    hyperparameters,
    commit_message="Push agent to the Hub",
    token=None,
    logs=None,
):
    """Save the agent with its results, replay video and model card, then push to the Hub.

    Evaluation is not run here: the statistics are read from the module-level
    ``eval_mean_reward``, ``eval_std_reward`` and ``evaluate_data`` produced by
    the evaluation cell, and the video is expected in ``upload/`` (written
    there by the evaluation env's RecordVideo wrapper).

    :param repo_id: id of the Hub repository (created if it does not exist)
    :param model: agent whose ``state_dict()`` is saved as ``model.pt``
    :param hyperparameters: training arguments; must expose ``env_id``
    :param commit_message: commit message of the upload
    :param token: Hub token forwarded to ``create_repo`` / ``upload_folder``
    :param logs: optional log directory copied into the upload under ``logs/``
    :return: the value returned by ``upload_folder``
    """
    msg.info(
        "This function will save, evaluate, generate a video of your agent, create a model card and push everything to the hub. "
    )

    # convert to same API as original code
    mean_reward = eval_mean_reward
    std_reward = eval_std_reward

    # Step 1: Clone or create the repo
    repo_url = HfApi().create_repo(
        repo_id=repo_id,
        token=token,
        private=False,
        exist_ok=True,
    )

    # Step 2: Save the model
    dirname = "upload"
    path = Path(dirname)
    # the folder normally exists already (the evaluation video is recorded
    # into it); create it so saving also works when no video was recorded
    path.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), path / "model.pt")

    # Step 3: Get evaluation results (eval_mean_reward, eval_std_reward, evaluate_data)
    print(f"mean_reward={mean_reward:.2f}")
    print(f"std_reward={std_reward:.2f}")
    # Step 3a: write to JSON
    with open(path / "results.json", "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 4: Rename the video (evaluate_agent generates video as "upload/rl-video-step-0.mp4")
    recorded_video = path / "rl-video-step-0.mp4"
    replay_video = path / "replay.mp4"
    if recorded_video.is_file():
        # replace() overwrites a replay.mp4 left over from an earlier push,
        # whereas os.rename raises on Windows when the target already exists
        recorded_video.replace(replay_video)
    elif replay_video.is_file():
        print("Warning: Using existing replay.mp4")
    else:
        print("Warning: No replay video found, pushing without replay.mp4")

    # Step 5: Generate the model card
    generated_model_card, metadata = _generate_model_card(
        "PPO", hyperparameters.env_id, mean_reward, std_reward, hyperparameters
    )
    _save_model_card(path, generated_model_card, metadata)

    # Step 6: Add logs if needed
    if logs:
        _add_logdir(path, Path(logs))

    msg.info(f"Pushing repo {repo_id} to the Hugging Face Hub")
    # Step 7: Push
    repo_url = upload_folder(
        repo_id=repo_id,
        folder_path=path,
        path_in_repo="",
        commit_message=commit_message,
        token=token,
    )

    msg.info(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")

    return repo_url


def _generate_model_card(model_name, env_id, mean_reward, std_reward, hyperparameters):
    """
    Generate the model card for the Hub
    :param model_name: name of the model (used for the metadata)
    :param env_id: name of the environment
    :param mean_reward: mean reward of the agent
    :param std_reward: standard deviation of the mean reward of the agent
    :param hyperparameters: training arguments; anything ``vars()`` accepts
    :return: ``(model_card, metadata)`` -- the README text and the metadata
        dict built by ``generate_metadata``
    """
    # Step 1: Select the tags
    metadata = generate_metadata(model_name, env_id, mean_reward, std_reward)

    # Transform the hyperparams namespace to a string with one entry per line
    converted_str = "\n".join(str(vars(hyperparameters)).split(", "))

    # Step 2: Generate the model card; the hyperparameters go into a fenced
    # code block under their heading (the section used to be left empty)
    model_card = f"""
  # PPO Agent Playing {env_id}

  This is a trained model of a PPO agent playing {env_id}.

  # Hyperparameters
  ```python
  {converted_str}
  ```
  """
    return model_card, metadata

def generate_metadata(model_name, env_id, mean_reward, std_reward):
    """
    Build the model-card metadata: the tag list plus the evaluation result.
    :param model_name: name of the model
    :param env_id: name of the environment (also used as dataset name/id)
    :param mean_reward: mean reward of the agent
    :param std_reward: standard deviation of the mean reward of the agent
    :return: dict with a "tags" entry merged with the evaluation-result entries
    """
    tags = [
        env_id,
        "ppo",
        "deep-reinforcement-learning",
        "reinforcement-learning",
        "custom-implementation",
        "deep-rl-course",
    ]

    # Metric entry reported as "mean +/- std"
    eval_metadata = metadata_eval_result(
        model_pretty_name=model_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_id,
        dataset_id=env_id,
    )

    # Tags first, then the evaluation entries (which win on a key clash)
    return {"tags": tags, **eval_metadata}

def _save_model_card(local_path, generated_model_card, metadata):
    """Write the repository README and attach the metadata to it.

    An existing README.md is kept as is; the generated card is only used
    when there is none yet.
    :param local_path: repository directory
    :param generated_model_card: model card generated by _generate_model_card()
    :param metadata: metadata
    """
    readme_path = local_path / "README.md"

    if readme_path.exists():
        readme = readme_path.read_text(encoding="utf8")
    else:
        readme = generated_model_card
    readme_path.write_text(readme, encoding="utf-8")

    # Save our metrics to Readme metadata
    metadata_save(readme_path, metadata)


def _add_logdir(local_path: Path, logdir: Path):
    """Copy ``logdir`` into the repository as ``<local_path>/logs``.

    Does nothing when ``logdir`` is missing or is not a directory. A ``logs``
    folder already present in the repository is removed first.
    :param local_path: repository directory
    :param logdir: logdir directory
    """
    if not (logdir.exists() and logdir.is_dir()):
        return

    repo_logdir = local_path / "logs"

    # Start from a clean slate so stale files do not survive the copy
    if repo_logdir.exists():
        shutil.rmtree(repo_logdir)

    shutil.copytree(logdir, repo_logdir)


In [None]:
hub_repo_id = "Rudolph314/ppo-LunarLander-v2"

package_to_hub(
    repo_id=hub_repo_id,
    model=agent,
    hyperparameters=args,
    commit_message="modified from CleanRL",
    # the SummaryWriter logs to runs/<run_name>; pointing at a folder nothing
    # writes to makes _add_logdir skip the logs silently
    logs=f"runs/{run_name}",
)

## (F) Save ##

In [None]:
# Keep a local copy of the trained weights, named after the run.
# The target folder is created first so saving works on a fresh checkout.
os.makedirs("models", exist_ok=True)
torch.save(agent.state_dict(), f"models/{run_name}.pt")

## (G) Remarks ##

Note that landing outside of the pad is legitimate, although the rough terrain there makes it difficult. ~~Perhaps because of this, the agent sees a huge advantage and often ventures there~~. Sometimes the lander does not crash but instead tries to stabilize itself with its thrusters while sliding or rotating when tilted (this also happens when only one foot is on the pad). See also
https://github.com/DLR-RM/stable-baselines3/issues/1669

The default ent_coef is 1e-2 and the typical value of entropy_loss is 1. I observed that the policy_loss of LunarLander is ~ 1e-2 to 1e-3, so wouldn't the entropy term be too large??

The video's learning_rate, num_envs, train_interval, epochs and num_minibatches are, taken together, fairly experience-inefficient. Many problems / local optima can be solved by raising them. 