# PPO
---

### 1. Import the Necessary Packages

In [None]:
import gym
import torch
%matplotlib inline
import wandb
import numpy as np
import os

from model.ppo_parallel import PPO
from model.network import ActorCritic
from model.environments import LunarContinuous
from gym.wrappers import RecordVideo

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 50  ,                           # How often we save in number of iterations
    'num_workers': 8  ,
    'seed': None 
}

Initialise wandb session in the code cell below.

In [None]:
TOTAL_TIMESTEPS_TO_TRAIN = 1_500_000
VAL_ITER = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'average_episode_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'min': 0.0001,
            'max': 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        'lr_gamma': {
            'min': 0.999,
            'max': 1.
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'max_timesteps_per_episode': {
            'values': list(range(600, 2001, 200))
        },
        'timesteps_per_batch': {
            'values': list(range(600, 5001, 200))
        }
    }
}

In [9]:
def train_model(config = None):
    wandb.login()
    run = wandb.init(
        # Track hyperparameters and run metadata
        config= config
    )
    ppo = ppo = PPO(**misc_hyperparameters) if config is None else PPO( lr = config.lr, gamma = config.gamma, n_updates_per_iteration = config.n_updates_per_iteration, max_timesteps_per_episode = config.max_timesteps_per_episode, **misc_hyperparameters)
    ppo.learn(TOTAL_TIMESTEPS_TO_TRAIN)

    val_rews = []
    val_dur = []
    env = LunarContinuous(render_mode='rgb_array').make_environment()
    env = RecordVideo(env, video_folder="videos", episode_trigger=lambda x: True)
    for _ in range(0, VAL_ITER):
            obs, _ = env.reset()
            done = False

            # number of timesteps so far
            t = 0

            ep_ret = 0            # episodic return

            while not done:
                t += 1

                # Query deterministic action from policy and run it
                action = ppo.actor(obs)
                obs, rew, terminated, truncated, _ = env.step(action.detach().numpy())
                done = terminated | truncated

                # Sum all episodic rewards as we go along
                ep_ret += rew
                
            # Track episodic length
            val_rews.append(ep_ret)
            val_dur.append(t)

    wandb.log({
        "val_rewards": np.mean(val_rews),
        "validation_duration": np.mean(val_dur),
        "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [10]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model)

Create sweep with ID: h5g3bir3
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/h5g3bir3


[34m[1mwandb[0m: Agent Starting Run: 6i5gfyid with config:
[34m[1mwandb[0m: 	gamma: 0.9800051053356386
[34m[1mwandb[0m: 	lr: 0.019919795713659736
[34m[1mwandb[0m: 	lr_gamma: 0.999424832404954
[34m[1mwandb[0m: 	max_timesteps_per_episode: 600
[34m[1mwandb[0m: 	n_updates_per_iteration: 8
[34m[1mwandb[0m: 	timesteps_per_batch: 3800



-------------------- Iteration #1 --------------------
Average Episodic Length: 121.72
Average Episodic Return: -350.38
Average Loss: 0.01266
Timesteps So Far: 4869
Iteration took: 7.72 secs
Current learning rate: 0.0047549502495
------------------------------------------------------



  logger.warn(


MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video-episode-0.mp4





MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video-episode-0.mp4


0,1
average_episode_lengths,▁
average_episode_rewards,▁
average_loss,▁
learning_rate,▁
simulated_iterations,▁
simulated_timesteps,▁
val_rewards,▁
validation_duration,▁

0,1
average_episode_lengths,121.725
average_episode_rewards,-350.38248
average_loss,0.01266
learning_rate,0.00475
simulated_iterations,1.0
simulated_timesteps,4869.0
val_rewards,-290.33231
validation_duration,99.0


[34m[1mwandb[0m: Agent Starting Run: 42j7iuz1 with config:
[34m[1mwandb[0m: 	gamma: 0.9600376453036754
[34m[1mwandb[0m: 	lr: 0.022120079771052095
[34m[1mwandb[0m: 	lr_gamma: 0.999386020821564
[34m[1mwandb[0m: 	max_timesteps_per_episode: 800
[34m[1mwandb[0m: 	n_updates_per_iteration: 13
[34m[1mwandb[0m: 	timesteps_per_batch: 1200



-------------------- Iteration #1 --------------------
Average Episodic Length: 104.02
Average Episodic Return: -186.91
Average Loss: 0.01161
Timesteps So Far: 4889
Iteration took: 6.77 secs
Current learning rate: 0.0047549502495
------------------------------------------------------



  logger.warn(


MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video-episode-0.mp4



                                                                       

MoviePy - Done !




MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video-episode-0.mp4


0,1
average_episode_lengths,▁
average_episode_rewards,▁
average_loss,▁
learning_rate,▁
simulated_iterations,▁
simulated_timesteps,▁
val_rewards,▁
validation_duration,▁

0,1
average_episode_lengths,104.02128
average_episode_rewards,-186.90621
average_loss,0.01161
learning_rate,0.00475
simulated_iterations,1.0
simulated_timesteps,4889.0
val_rewards,-89.42588
validation_duration,69.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
