# PPO
---

### 1. Import the Necessary Packages

In [1]:
%matplotlib inline
import wandb
import numpy as np

from model.ppo import PPO
from env.wrappers import LunarContinuous
from logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 0  ,                           # How often we save in number of iterations
    'num_workers': 8  ,
    'seed': None 
}

Initialise wandb session in the code cell below.

In [3]:
TOTAL_TIMESTEPS_TO_TRAIN = 500_000
VAL_ITER = 20
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'val_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'values': [0.001, 0.002, 0.003, 0.004, 0.005]
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        # 'lr_gamma': {
        #     'min': 0.999,
        #     'max': 1.
        # },
        'lam': {
            'min': 0.9,
            'max': 1.
        },
        'max_grad_norm': {
            'min': 0.1,
            'max': 5.
        },
        'val_loss_coef': {
            'min': 0.1,
            'max': 5.
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'max_timesteps_per_episode': {
            'distribution': 'q_uniform',
            'min': 600,
            'max': 2000,
            'q': 100
        },
        'timesteps_per_batch': {
            'distribution': 'q_uniform',
            'min': 1_000,
            'max': 50_000,
            'q': 1000
        },
        'max_timesteps_per_episode': {
            'distribution': 'q_uniform',
            'min': 800,
            'max': 2_000,
            'q': 100
        },
        'n_sgd_batches': {
            'distribution': 'q_uniform',
            'min': 1,
            'max': 128,
            'q':4
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train(TOTAL_TIMESTEPS_TO_TRAIN)

    val_rews, val_dur = ppo.validate(VAL_ITER, False)

    wandb.log({
        "val_rewards": np.mean(val_rews),
        "validation_duration": np.mean(val_dur)
        # "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: jybzfxra
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/jybzfxra


[34m[1mwandb[0m: Agent Starting Run: 1yet6s2b with config:
[34m[1mwandb[0m: 	gamma: 0.9826868362048872
[34m[1mwandb[0m: 	lam: 0.978874488082296
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	max_grad_norm: 3.2470889090683324
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1600
[34m[1mwandb[0m: 	n_sgd_batches: 116
[34m[1mwandb[0m: 	n_updates_per_iteration: 18
[34m[1mwandb[0m: 	timesteps_per_batch: 38000
[34m[1mwandb[0m: 	val_loss_coef: 2.513728444800542
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin


  logger.warn(



-------------------- Iteration #1 --------------------
Average Episodic Length: 110.57
Average Episodic Return: -240.71
Average Loss: -0.02525
KL Divergence: 0.009735152125358582
Timesteps So Far: 10062
Iteration took: 10.4 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 117.66
Average Episodic Return: -208.61
Average Loss: -0.04584
KL Divergence: 0.009371411055326462
Timesteps So Far: 20181
Iteration took: 12.01 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 120.92
Average Episodic Return: -120.07
Average Loss: -0.01066
KL Divergence: 0.012163585051894188
Timesteps So Far: 30459
Iteration took: 11.22 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #4 --------------------


0,1
average_episode_lengths,▁▁▁▁▁▂▃▄▄▄▆▆▇▆▆▇▇▆▆▆▅▆▆▇▇▆▇▇▇█▆▇█▆█▇▆▇▆▆
average_episode_rewards,▁▂▃▄▄▄▅▅▄▄▄▅▅▅▅▆▆▆▆▆▆▆▅▅▆▇▇▆▇▇▇▇█▇▇██▇█▇
average_loss,▂▁▃▅▅███▆▆▆▆▅▅▅▅▅▅▅▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
learning_rate,█████████████████▆██▄██████▁▃▆▁████▆█▆▆▆
simulated_iterations,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_rewards,▁
validation_duration,▁

0,1
average_episode_lengths,918.45455
average_episode_rewards,141.45614
average_loss,0.01592
episode_compute,16.75
learning_rate,0.00667
simulated_iterations,48.0
simulated_timesteps,503667.0
val_rewards,37.80217
validation_duration,7458.65


[34m[1mwandb[0m: Agent Starting Run: oyyvthl7 with config:
[34m[1mwandb[0m: 	gamma: 0.9665704623117743
[34m[1mwandb[0m: 	lam: 0.9229636529454516
[34m[1mwandb[0m: 	lr: 0.002
[34m[1mwandb[0m: 	max_grad_norm: 2.3702924542860746
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1500
[34m[1mwandb[0m: 	n_sgd_batches: 60
[34m[1mwandb[0m: 	n_updates_per_iteration: 9
[34m[1mwandb[0m: 	timesteps_per_batch: 25000
[34m[1mwandb[0m: 	val_loss_coef: 3.0439120899240057


  logger.warn(



-------------------- Iteration #1 --------------------
Average Episodic Length: 110.58
Average Episodic Return: -236.17
Average Loss: 0.2243
KL Divergence: 0.013698579743504524
Timesteps So Far: 10063
Iteration took: 10.23 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 118.66
Average Episodic Return: -175.13
Average Loss: 0.13284
KL Divergence: 0.01960108056664467
Timesteps So Far: 20149
Iteration took: 10.82 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 133.96
Average Episodic Return: -106.8
Average Loss: 0.06772
KL Divergence: 0.016528235748410225
Timesteps So Far: 30196
Iteration took: 12.82 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #4 --------------------
Avera

0,1
average_episode_lengths,▁▁▁▁▁▂▂▄▄▅▆▅▇▇▇▇▇▇▇▆██▆▆█▇▇▆▆▇██▇▇▅▅▃▃▃▂
average_episode_rewards,▁▂▃▃▄▄▄▄▄▄▄▄▅▅▅▄▅▅▅▅▆▆▆▃▆▆▆▆▆▆▇▇▆▇▇█████
average_loss,█▅▃▂▂▂▁▂▂▂▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
learning_rate,█████████████▆█▆███▆███▁▄▆▄▆█▁██▄▆▄▁▃▄▆▆
simulated_iterations,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_rewards,▁
validation_duration,▁

0,1
average_episode_lengths,311.78788
average_episode_rewards,235.62134
average_loss,0.00452
episode_compute,16.51
learning_rate,0.00667
simulated_iterations,48.0
simulated_timesteps,501008.0
val_rewards,249.60936
validation_duration,290.2


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vmvjxp6m with config:
[34m[1mwandb[0m: 	gamma: 0.9513506985023636
[34m[1mwandb[0m: 	lam: 0.910893053061852
[34m[1mwandb[0m: 	lr: 0.004
[34m[1mwandb[0m: 	max_grad_norm: 0.4963253536602791
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1600
[34m[1mwandb[0m: 	n_sgd_batches: 28
[34m[1mwandb[0m: 	n_updates_per_iteration: 2
[34m[1mwandb[0m: 	timesteps_per_batch: 18000
[34m[1mwandb[0m: 	val_loss_coef: 2.057665251880479


  logger.warn(



-------------------- Iteration #1 --------------------
Average Episodic Length: 108.42
Average Episodic Return: -252.98
Average Loss: 0.20459
KL Divergence: 0.012001258321106434
Timesteps So Far: 10083
Iteration took: 11.23 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 109.08
Average Episodic Return: -169.02
Average Loss: 0.12257
KL Divergence: 0.009495409205555916
Timesteps So Far: 20118
Iteration took: 10.4 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 143.23
Average Episodic Return: -105.79
Average Loss: 0.08969
KL Divergence: 0.013393177650868893
Timesteps So Far: 30144
Iteration took: 12.35 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #4 --------------------
Ave

0,1
average_episode_lengths,▁▁▁▁▁▂▃▄▅▆▆▇▇▇▇██▇▇▇▇█▅▅▆█▇▇▆▅▅▅▆▇▅▅▇▇▇▇
average_episode_rewards,▁▃▄▄▅▄▄▅▅▆▅▆▆▇▆▇▇▇▇▇▇█▇▇▇█▇▇▇▇▇▇██▇▇██▇█
average_loss,█▅▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
learning_rate,██████████▁▄▁▆▂█████████▂██▆▆███▁▂▆▄██▁▆
simulated_iterations,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_rewards,▁
validation_duration,▁

0,1
average_episode_lengths,1025.3
average_episode_rewards,127.73149
average_loss,0.02863
episode_compute,18.41
learning_rate,0.00667
simulated_iterations,48.0
simulated_timesteps,504037.0
val_rewards,66.14648
validation_duration,19048.75


[34m[1mwandb[0m: Agent Starting Run: daa2znus with config:
[34m[1mwandb[0m: 	gamma: 0.9766663147873486
[34m[1mwandb[0m: 	lam: 0.9839644017598722
[34m[1mwandb[0m: 	lr: 0.002
[34m[1mwandb[0m: 	max_grad_norm: 1.8436116794568176
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1000
[34m[1mwandb[0m: 	n_sgd_batches: 24
[34m[1mwandb[0m: 	n_updates_per_iteration: 6
[34m[1mwandb[0m: 	timesteps_per_batch: 18000
[34m[1mwandb[0m: 	val_loss_coef: 1.103472254457756


  logger.warn(



-------------------- Iteration #1 --------------------
Average Episodic Length: 114.08
Average Episodic Return: -245.99
Average Loss: 0.06583
KL Divergence: 0.011289307847619057
Timesteps So Far: 10039
Iteration took: 10.09 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 128.5
Average Episodic Return: -198.94
Average Loss: -0.06245
KL Divergence: 0.012918258085846901
Timesteps So Far: 20062
Iteration took: 13.16 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 131.41
Average Episodic Return: -115.66
Average Loss: -0.06655
KL Divergence: 0.012122180312871933
Timesteps So Far: 30838
Iteration took: 14.3 secs
Current learning rate: 0.01
------------------------------------------------------


-------------------- Iteration #4 --------------------
Av

In [None]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Best Hyperparameters: {'lr': 0.033773968633186116, 'lam': 0.965122224947915, 'gamma': 0.9391731546579618, 'lr_gamma': 0.99964198121568, 'max_grad_norm': 1.0549291822676827, 'n_sgd_batches': 8, 'timesteps_per_batch': 6600, 'n_updates_per_iteration': 17, 'max_timesteps_per_episode': 800}
Best Metrics: {'val_rewards': -117.685825451997, '_runtime': 18.8714706, '_step': 2, '_timestamp': 1733497014.9633105, '_wandb': {'runtime': 18}, 'average_episode_lengths': 97.5, 'average_episode_rewards': -214.25900286086735, 'average_loss': 0.001098420703783631, 'learning_rate': 0.004990008995201681, 'max_reward_video': {'_type': 'video-file', 'path': 'media/videos/max_reward_video_2_de1368eb4a9cffe45bc9.mp4', 'sha256': 'de1368eb4a9cffe45bc98fb1781e53c0619e8cb010e012a453b94f289e4f54ad', 'size': 10645}, 'simulated_iterations': 2, 'simulated_timesteps': 9719, 'validation_duration': 76.4}
