# Proximal Policy Optimization (PPO)

Source: adapted from the CleanRL reference implementation of PPO, https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo.py

In [2]:
import time
import wandb
import random
import numpy as np
from tqdm import tqdm
import multiprocessing
import gymnasium as gym

from src.env import CustomLunarLander
from src.models import PPO_Agent
from src import util

import torch
import torch.nn as nn

# authenticate with Weights & Biases once, before any run is created
wandb.login()

# raise exceptions on numpy floating-point errors instead of only warning
np.seterr(all="raise")
print(f"Number of cores available: {multiprocessing.cpu_count()}")

# device for pytorch: use CUDA when it is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# register the custom environment under the id used by the cells below
gym.register(id="CustomLunarLander-v0", entry_point=CustomLunarLander)

[34m[1mwandb[0m: Currently logged in as: [33mthomasvroom[0m ([33mthomasvroom-maastricht-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Number of cores available: 12
Using device: cuda


In [3]:
def train(config, run_name):
    """Train a PPO agent on CustomLunarLander-v0 and save its weights.

    Runs the rollout / advantage-estimation / clipped-surrogate update loop,
    logs metrics to a Weights & Biases run, and finally writes the agent's
    ``state_dict`` to ``models/{run_name}``.

    Args:
        config: dict of hyperparameters. Keys read here: "random_seed",
            "deterministic", "gravity", "enable_wind", "wind_power",
            "turbulence_power", "max_env_steps", "n_envs", "learning_rate",
            "steps_per_batch", "num_minibatches", "train_steps", "anneal_lr",
            "gamma", "gae_lambda", "policy_epochs", "clip_coef", "norm_adv",
            "clip_vloss", "ent_coef", "vf_coef", "max_grad_norm", "target_kl".
            The whole dict is also stored as the wandb run config.
        run_name: name of the wandb run and file name of the saved weights.

    Returns:
        None.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    # Resolve the checkpoint path up front and make sure its directory exists,
    # so a missing "models/" folder fails (or is fixed) before training starts
    # instead of discarding the weights after the run has finished.
    model_path = f"models/{run_name}"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    run = wandb.init(
        project="RL",
        entity="thomasvroom-maastricht-university",
        config=config,
        name=run_name
    )

    # seeding
    random.seed(config["random_seed"])
    np.random.seed(config["random_seed"])
    torch.manual_seed(config["random_seed"])
    torch.backends.cudnn.deterministic = config["deterministic"]

    # create environments
    envs = gym.vector.SyncVectorEnv([
        util.make_env(
            env_id="CustomLunarLander-v0",
            gravity=config["gravity"],
            enable_wind=config["enable_wind"],
            wind_power=config["wind_power"],
            turbulence_power=config["turbulence_power"],
            max_episode_steps=config["max_env_steps"]
        ) for _ in range(config["n_envs"])],
    )

    # the agent is built from the flattened observation size and the number of discrete actions
    agent = PPO_Agent(np.array(envs.single_observation_space.shape).prod(), envs.single_action_space.n).to(device)
    optimizer = torch.optim.Adam(agent.parameters(), lr=config["learning_rate"], eps=1e-5)

    # more hyperparameters determined at runtime
    batch_size = int(config["n_envs"] * config["steps_per_batch"])  # transitions collected per iteration
    minibatch_size = int(batch_size // config["num_minibatches"])
    num_iterations = int(config["train_steps"] // batch_size)  # number of rollout + update cycles

    # storage setup: one slot per (rollout step, environment)
    obs = torch.zeros((config["steps_per_batch"], config["n_envs"]) + envs.single_observation_space.shape).to(device)
    actions = torch.zeros((config["steps_per_batch"], config["n_envs"]) + envs.single_action_space.shape).to(device)
    logprobs = torch.zeros((config["steps_per_batch"], config["n_envs"])).to(device)
    rewards = torch.zeros((config["steps_per_batch"], config["n_envs"])).to(device)
    dones = torch.zeros((config["steps_per_batch"], config["n_envs"])).to(device)
    values = torch.zeros((config["steps_per_batch"], config["n_envs"])).to(device)

    # start the environment
    global_step = 0
    start_time = time.time()
    next_obs, _ = envs.reset(seed=config["random_seed"])
    next_obs = torch.Tensor(next_obs).to(device)
    next_done = torch.zeros(config["n_envs"]).to(device)

    for iteration in tqdm(range(1, num_iterations + 1)):
        # annealing the learning rate: scaled linearly from the configured value down towards zero
        if config["anneal_lr"]:
            frac = 1.0 - (iteration - 1.0) / num_iterations
            lrnow = frac * config["learning_rate"]
            optimizer.param_groups[0]["lr"] = lrnow

        # rollout: collect steps_per_batch transitions from every environment
        for step in range(0, config["steps_per_batch"]):
            global_step += config["n_envs"]
            obs[step] = next_obs
            dones[step] = next_done

            # action logic (no gradients are needed while collecting data)
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # execute the action and log data
            next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
            # termination and truncation are both treated as "episode done"
            next_done = np.logical_or(terminations, truncations)
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)

            # log the mean return/length over the environments flagged in infos["_episode"]
            # (presumably filled in by an episode-statistics wrapper inside util.make_env — confirm there)
            if infos and "episode" in infos:
                indices = np.where(infos["_episode"])
                run.log({
                    "charts/episodic_return": infos["episode"]["r"][indices].mean(),
                    "charts/episodic_length": infos["episode"]["l"][indices].mean()
                }, global_step)

        # bootstrap value if not done, then compute advantages backwards through the rollout
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(config["steps_per_batch"])):
                if t == config["steps_per_batch"] - 1:
                    # last stored step: use the value of the observation that follows the rollout
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                # nextnonterminal zeroes the bootstrap term across an episode boundary
                delta = rewards[t] + config["gamma"] * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + config["gamma"] * config["gae_lambda"] * nextnonterminal * lastgaelam
            returns = advantages + values

        # flatten the batch: merge the (step, env) axes into one sample axis
        b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # optimizing the policy and value network
        b_inds = np.arange(batch_size)
        clipfracs = []
        for epoch in range(config["policy_epochs"]):
            np.random.shuffle(b_inds)
            for start in range(0, batch_size, minibatch_size):
                end = start + minibatch_size
                mb_inds = b_inds[start:end]

                # actions were stored in a float buffer, so they are cast back to integers here
                _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    # fraction of samples whose ratio left the [1 - clip_coef, 1 + clip_coef] band
                    clipfracs += [((ratio - 1.0).abs() > config["clip_coef"]).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if config["norm_adv"]:
                    # normalization is done per minibatch; 1e-8 guards against a zero std
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # policy loss: element-wise max of the unclipped and clipped surrogate losses
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - config["clip_coef"], 1 + config["clip_coef"])
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # value loss
                newvalue = newvalue.view(-1)
                if config["clip_vloss"]:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    # limit how far the new value may move from the rollout-time value
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -config["clip_coef"],
                        config["clip_coef"],
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                # entropy is subtracted from the loss, so higher entropy is rewarded
                entropy_loss = entropy.mean()
                loss = pg_loss - config["ent_coef"] * entropy_loss + v_loss * config["vf_coef"]

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), config["max_grad_norm"])
                optimizer.step()

            # early stop: skip the remaining epochs once the KL estimate of the
            # last minibatch of this epoch exceeds the target
            if config["target_kl"] is not None and approx_kl > config["target_kl"]:
                break

        # explained variance of the returns by the rollout-time value predictions
        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # record data (loss values are those of the last minibatch processed)
        run.log({
            "charts/learning_rate": optimizer.param_groups[0]["lr"],
            "losses/value_loss": v_loss.item(),
            "losses/policy_loss": pg_loss.item(),
            "losses/entropy": entropy_loss.item(),
            "losses/old_approx_kl": old_approx_kl.item(),
            "losses/approx_kl": approx_kl.item(),
            "losses/clipfrac": np.mean(clipfracs),
            "charts/SPS": int(global_step / (time.time() - start_time))
        }, global_step)
        if not np.isnan(explained_var):
            run.log({"losses/explained_variance": explained_var}, global_step)

    envs.close()
    run.finish(0)
    torch.save(agent.state_dict(), model_path)

# Hyperparameters shared by every training run in this notebook; train() reads
# each of these keys and also stores the whole dict as the wandb run config.
config = { # see: https://gymnasium.farama.org/environments/box2d/lunar_lander/
    # environment settings, forwarded to util.make_env
    "gravity": -10.0,
    "enable_wind": False, # default so the dict is usable on its own; each training cell sets it explicitly
    "wind_power": 15.0,
    "turbulence_power": 1.5,

    # run / rollout settings
    "random_seed": 123, # seeds random, numpy, torch and the environment reset
    "deterministic": True, # toggles torch.backends.cudnn.deterministic
    "n_envs": 8, # number of environments stepped together in the vector env
    "train_steps": 4_000_000, # total environment steps across all envs
    "steps_per_batch": 2048, # number of steps to run in each env per policy rollout
    "num_minibatches": 16, # each rollout batch is split into this many minibatches
    "policy_epochs": 8, # number of epochs to update the policy
    "max_env_steps": 1000, # number of steps before truncation

    # PPO / optimizer settings
    "gamma": 0.99, # discount factor
    "learning_rate": 3e-4, # initial Adam learning rate
    "anneal_lr": True, # toggles lr decay
    "gae_lambda": 0.95, # lambda for the general advantage estimation
    "clip_coef": 0.2, # surrogate clipping coefficient
    "norm_adv": True, # toggles advantage normalization
    "clip_vloss": False, # toggles use of clipped loss for the value function
    "ent_coef": 0.01, # entropy coefficient
    "vf_coef": 0.5, # value function coefficient
    "max_grad_norm": 0.5, # maximum norm for gradient clipping
    "target_kl": 0.01 # the target KL divergence threshold
}

### Training without wind

In [None]:
# switch wind off in the shared config, then train under a timestamped run name
config.update(enable_wind=False)
run_name = f"PPO-NoWind-{time.time()}"
train(config=config, run_name=run_name)

### Training with wind

In [4]:
# switch wind on in the shared config, then train under a timestamped run name
config.update(enable_wind=True)
run_name = f"PPO-Wind-{time.time()}"
train(config=config, run_name=run_name)

100%|██████████| 244/244 [26:08<00:00,  6.43s/it]


0,1
charts/SPS,▁▄▆▇▇▇▇▇▇█▇█████████████████████████████
charts/episodic_length,▁▁▁▁▁▁▂▁▁▁▂▁▂▂█▃▄▃▄▄▂▂█▄▃▂▂▆▁▃▆▃▂▂▂▂▂███
charts/episodic_return,▂▁▄▄▃▄▄▄▃▄▆▅▄▄▄▄▄▇▇█▇▄▇▇▇▇▃▆█▇▇█▇▇▇█▇▇██
charts/learning_rate,██▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁▁▁
losses/approx_kl,▆█▄▅▇▅▅▃▃▄▄▄▄▄▄▃▃▃▄▄▄▅▄▄▃▅▃▄▃▃▂▄▃▄▃▁▃▄▂▁
losses/clipfrac,▁▅▄▅▃█▅▂▄▃▂▄▃▂▃▁▂▂▃▂▃▃▂▄▂▂▁▂▂▂▂▄▂▁▂▃▁▁▁▁
losses/entropy,██▇▇▆▅▅▅▄▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▂▁▁▁▁
losses/explained_variance,▁▁▁▁▁▁▁▁▁▁▁▂▃▄▄▄▄▅▄▆▆▆▇▇▇▆▇▇▇██▇▇▇▇▇██▇▇
losses/old_approx_kl,▁█▆▅▄▅▃▃▅▂▂▆▄▄▆▅▂▄▄▂▅▂▂▂▄▃▄▂▃▅▂▄▃▁▂▁▃▂▁▂
losses/policy_loss,▄▃▅▁▅█▃▄▆▄▅▆▄▆▆▃▅▅▆▄▅▆▅▅▅▆▅▆▅▆▆▆▅▆▄▄▄▆▆▆

0,1
charts/SPS,2548.0
charts/episodic_length,236.0
charts/episodic_return,260.47493
charts/learning_rate,0.0
losses/approx_kl,0.0
losses/clipfrac,0.0
losses/entropy,0.49484
losses/explained_variance,0.71619
losses/old_approx_kl,2e-05
losses/policy_loss,-6e-05


### Visualize Episode

In [None]:
# run_name = ...  # uncomment and set to a file name under models/ to load a checkpoint from an earlier session

# load agent
# the sizes (8 observation values, 4 actions) must match the network that was saved
agent = PPO_Agent(8, 4).to(device)
# map_location moves the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded when only the CPU is available
agent.load_state_dict(torch.load(f"models/{run_name}", map_location=device))
class AgentWrapper:
    """Adapter that exposes a ``get_action(obs)`` method on top of the PPO agent.

    Delegates to the notebook-level ``agent`` and returns only the first element
    of what ``get_action_and_value`` produces (the action).
    """

    def get_action(self, obs):
        """Return the action the agent picks for ``obs``.

        Args:
            obs: observation passed straight through to
                ``agent.get_action_and_value``.

        Returns:
            The first element of the tuple returned by
            ``agent.get_action_and_value(obs)``.
        """
        # inference only: disable autograd, as the rollout loop in train() does
        with torch.no_grad():
            return agent.get_action_and_value(obs)[0]
# the visualizer receives the wrapper; presumably it calls w_agent.get_action(obs) — confirm in util
w_agent = AgentWrapper()

# replay an episode with the loaded agent, reusing the physics values from config
util.visualize_episode(
    env_id="CustomLunarLander-v0",
    gravity=config["gravity"],
    enable_wind=False, # NOTE(review): hard-coded off, independent of config["enable_wind"] — confirm this is intended
    wind_power=config["wind_power"],
    turbulence_power=config["turbulence_power"],
    agent=w_agent,
    device=device,
    max_time=30, # presumably a cap on the episode duration; units are not visible here — TODO confirm
    video_name=None # presumably no video file is written when None — TODO confirm
)