# PPO
---

### 1. Import the Necessary Packages

In [1]:
%matplotlib inline
import wandb
import numpy as np

from model.ppo3 import PPO
from env.wrappers import LunarContinuous
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 0 ,  
    'val_freq': 10,
    'val_iter': 10,
    'env': LunarContinuous
}

Initialise wandb session in the code cell below.

In [3]:
MAX_TOTAL_TIMESTEPS_TO_TRAIN = 500
VAL_ITER = 20
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'validation_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        # 'lr_gamma': {
        #     'min': 0.999,
        #     'max': 1.
        # },
        'lam': {
            'min': 0.9,
            'max': 1.
        },
        'max_grad_norm': {
            "distribution": "q_log_uniform",
            "min": 0.1,
            "max": 10, 
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'num_envs': {
            'values': list(range(1, 100))
        },
        'anneal_lr': {
            'values': [True, False]
        },
        'num_steps': {
            'distribution': 'q_uniform',
            'min': 300,
            'max': 4000,
            'q': 100
        },'batches': {
            'distribution': 'q_uniform',
            "min": 1,     # 2^0
            "max": 1024,  # 2^10
            "q": 2 
        }
    },
     "constraints": [
        {"params": ["num_envs", "num_steps"], "max_product": MAX_TOTAL_TIMESTEPS_TO_TRAIN}
    ]
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train()

    val_rews, val_dur = ppo.validate(VAL_ITER, False)

    wandb.log({
        "validation_rewards": val_rews,
        "validation_duration": val_dur
        # "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 7ig4yvcl
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/7ig4yvcl


[34m[1mwandb[0m: Agent Starting Run: q6vwh2dc with config:
[34m[1mwandb[0m: 	anneal_lr: False
[34m[1mwandb[0m: 	batches: 146
[34m[1mwandb[0m: 	gamma: 0.9729489254725824
[34m[1mwandb[0m: 	lam: 0.9141777110536656
[34m[1mwandb[0m: 	lr: 0.05286455878929416
[34m[1mwandb[0m: 	max_grad_norm: 5591
[34m[1mwandb[0m: 	n_updates_per_iteration: 4
[34m[1mwandb[0m: 	num_envs: 12
[34m[1mwandb[0m: 	num_steps: 700
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Return: -199.22
Average Loss: 0.0006
Average KL Divergence: 0.005822817034249266
Iteration took: 12.53 secs, of which rollout took 10.25 secs and gradient updates took 2.28 secs
Current learning rate: 0.005
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -255.5
Average Loss: -0.00253
Average KL Divergence: 0.006206803589707131
Iteration took: 12.36 secs, of which rollout took 10.35 secs and gradient updates took 1.99 secs
Current learning rate: 0.00495
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -136.41
Average Loss: -0.00338
Average KL Divergence: 0.007447491814573717
Iteration took: 12.39 secs, of which rollout took 10.21 secs and gradient updates took 2.15 secs
Current learning rate: 0.004851
----------------------------------------

In [None]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Best Hyperparameters: {'lr': 0.033773968633186116, 'lam': 0.965122224947915, 'gamma': 0.9391731546579618, 'lr_gamma': 0.99964198121568, 'max_grad_norm': 1.0549291822676827, 'n_sgd_batches': 8, 'timesteps_per_batch': 6600, 'n_updates_per_iteration': 17, 'max_timesteps_per_episode': 800}
Best Metrics: {'val_rewards': -117.685825451997, '_runtime': 18.8714706, '_step': 2, '_timestamp': 1733497014.9633105, '_wandb': {'runtime': 18}, 'average_episode_lengths': 97.5, 'average_episode_rewards': -214.25900286086735, 'average_loss': 0.001098420703783631, 'learning_rate': 0.004990008995201681, 'max_reward_video': {'_type': 'video-file', 'path': 'media/videos/max_reward_video_2_de1368eb4a9cffe45bc9.mp4', 'sha256': 'de1368eb4a9cffe45bc98fb1781e53c0619e8cb010e012a453b94f289e4f54ad', 'size': 10645}, 'simulated_iterations': 2, 'simulated_timesteps': 9719, 'validation_duration': 76.4}
