# PPO
---

### 1. Import the Necessary Packages

In [7]:
%matplotlib inline
import wandb
import numpy as np

from model.ppo3 import PPO
from env.wrappers import LunarContinuous
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [8]:
misc_hyperparameters = {
    'save_freq': 0 ,  
    'val_freq': 10,
    'val_iter': 10,
    'env': LunarContinuous
}

Initialise wandb session in the code cell below.

In [9]:
MAX_TOTAL_TIMESTEPS_TO_TRAIN = 500
VAL_ITER = 20
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'validation_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        # 'lr_gamma': {
        #     'min': 0.999,
        #     'max': 1.
        # },
        'lam': {
            'min': 0.9,
            'max': 1.
        },
        'max_grad_norm': {
            "distribution": "q_log_uniform",
            "min": 0.1,
            "max": 10, 
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'num_envs': {
            'values': list(range(1, 100))
        },
        'anneal_lr': {
            'values': [True, False]
        },
        'num_steps': {
            'distribution': 'q_uniform',
            'min': 300,
            'max': 4000,
            'q': 100
        },'batches': {
            'distribution': 'q_uniform',
            "min": 1,     # 2^0
            "max": 1024,  # 2^10
            "q": 2 
        }
    },
     "constraints": [
        {"params": ["num_envs", "num_steps"], "max_product": MAX_TOTAL_TIMESTEPS_TO_TRAIN}
    ]
}

In [10]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train()

    val_rews, val_dur = ppo.validate(VAL_ITER, False)

    wandb.log({
        "validation_rewards": val_rews,
        "validation_duration": val_dur
        # "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)



Create sweep with ID: q4p6pbl8
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/q4p6pbl8


[34m[1mwandb[0m: Agent Starting Run: 1dgokfx0 with config:
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	batches: 808
[34m[1mwandb[0m: 	gamma: 0.9736512887162576
[34m[1mwandb[0m: 	lam: 0.992544563139167
[34m[1mwandb[0m: 	lr: 0.03559224270177326
[34m[1mwandb[0m: 	max_grad_norm: 4847
[34m[1mwandb[0m: 	n_updates_per_iteration: 2
[34m[1mwandb[0m: 	num_envs: 26
[34m[1mwandb[0m: 	num_steps: 2200



-------------------- Iteration #1 --------------------
Average Episodic Return: -348.75
Average Loss: -0.00027
Average KL Divergence: 0.007431268561720991
Iteration took: 12.78 secs, of which rollout took 10.99 secs and gradient updates took 1.79 secs
Current learning rate: 0.005
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -128.63
Average Loss: -0.00165
Average KL Divergence: 0.006857339117428077
Iteration took: 15.43 secs, of which rollout took 13.28 secs and gradient updates took 2.13 secs
Current learning rate: 0.0049005
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -113.75
Average Loss: -0.00192
Average KL Divergence: 0.007583392630168908
Iteration took: 13.28 secs, of which rollout took 11.16 secs and gradient updates took 2.1 secs
Current learning rate: 0.0047064402
--------------------------------

In [None]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Best Hyperparameters: {'lr': 0.033773968633186116, 'lam': 0.965122224947915, 'gamma': 0.9391731546579618, 'lr_gamma': 0.99964198121568, 'max_grad_norm': 1.0549291822676827, 'n_sgd_batches': 8, 'timesteps_per_batch': 6600, 'n_updates_per_iteration': 17, 'max_timesteps_per_episode': 800}
Best Metrics: {'val_rewards': -117.685825451997, '_runtime': 18.8714706, '_step': 2, '_timestamp': 1733497014.9633105, '_wandb': {'runtime': 18}, 'average_episode_lengths': 97.5, 'average_episode_rewards': -214.25900286086735, 'average_loss': 0.001098420703783631, 'learning_rate': 0.004990008995201681, 'max_reward_video': {'_type': 'video-file', 'path': 'media/videos/max_reward_video_2_de1368eb4a9cffe45bc9.mp4', 'sha256': 'de1368eb4a9cffe45bc98fb1781e53c0619e8cb010e012a453b94f289e4f54ad', 'size': 10645}, 'simulated_iterations': 2, 'simulated_timesteps': 9719, 'validation_duration': 76.4}
