# PPO
---

### 1. Import the Necessary Packages

In [1]:
%matplotlib inline
import wandb

from model.ppo_2 import PPO
from env.wrappers import LunarContinuous, LunarLanderWithUnknownWind,LunarLanderWithKnownWind
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 0 ,  
    'val_freq': 10,
    'val_iter': 10,
    'env': LunarLanderWithKnownWind
}

Initialise wandb session in the code cell below.

In [3]:
MAX_TOTAL_TIMESTEPS_TO_TRAIN = 500
VAL_ITER = 20
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'validation_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'actor_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'critic_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'adp_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        'lam': {
            'min': 0.9,
            'max': 1.
        },
        'max_grad_norm': {
            "min": 0.1,
            "max": 10, 
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'num_envs': {
            'values': list(range(1, 100))
        },
        'anneal_lr': {
            'values': [True, False]
        },
        'num_steps': {
            'distribution': 'q_uniform',
            'min': 300,
            'max': 4000,
            'q': 100
        },
        'adp_num_steps': {
            'distribution': 'q_uniform',
            'min': 200,
            'max': 1000,
            'q': 10
        },
        'anneal_discount': {
            'distribution': 'q_uniform',
            'min': 1,
            'max': 1000,
            'q': 10
        },
        'batches': {
            'distribution': 'q_uniform',
            "min": 1,     # 2^0
            "max": 1024,  # 2^10
            "q": 2 
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train()

    val_rews, val_dur = ppo.validate(VAL_ITER, False)

    wandb.log({
        "validation_rewards": val_rews,
        "validation_duration": val_dur
        # "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: yk0hcuv3
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/yk0hcuv3


[34m[1mwandb[0m: Agent Starting Run: dt6solfi with config:
[34m[1mwandb[0m: 	actor_lr: 0.07511353134200377
[34m[1mwandb[0m: 	adp_lr: 0.006720398486064176
[34m[1mwandb[0m: 	adp_num_steps: 680
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	batches: 940
[34m[1mwandb[0m: 	critic_lr: 0.0758837188329859
[34m[1mwandb[0m: 	gamma: 0.9426677298554624
[34m[1mwandb[0m: 	lam: 0.9383715033610108
[34m[1mwandb[0m: 	max_grad_norm: 64
[34m[1mwandb[0m: 	n_updates_per_iteration: 4
[34m[1mwandb[0m: 	num_envs: 90
[34m[1mwandb[0m: 	num_steps: 3900
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Return: -203.93
Average Actor Loss: -0.21083
Average Critic Loss: 1110.1379979647122
Average KL Divergence: 0.010224226825476553
Iteration took: 10.99 secs, of which rollout took 8.99 secs and gradient updates took 2.0 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -286.45
Average Actor Loss: -0.2147
Average Critic Loss: 809.7173865098218
Average KL Divergence: 0.00943904954939661
Iteration took: 11.47 secs, of which rollout took 9.49 secs and gradient updates took 1.96 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #1 --------------------
Average adp Loss: 2.46015
Iteration took: 8.13 secs, of which rollout took 7.49 secs and gradien

0,1
actor_learning_rate,▁▁
adp_learning_rate,▁
average_actor_loss,█▁
average_adapt_loss,▁
average_critic_loss,█▁
average_episode_rewards,█▁
critic_learning_rate,▁▁
simulated_iterations,▁█▁
validation_duration,▁
validation_rewards,▁

0,1
actor_learning_rate,0.005
adp_learning_rate,0.005
average_actor_loss,-0.2147
average_adapt_loss,2.46015
average_critic_loss,809.71739
average_episode_rewards,-286.45041
critic_learning_rate,0.0075
iteration_compute,8.13
simulated_iterations,1.0
validation_duration,94.65


[34m[1mwandb[0m: Agent Starting Run: ymtjzc3i with config:
[34m[1mwandb[0m: 	actor_lr: 0.06045633618669653
[34m[1mwandb[0m: 	adp_lr: 0.011641022868937928
[34m[1mwandb[0m: 	adp_num_steps: 990
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	batches: 610
[34m[1mwandb[0m: 	critic_lr: 0.08427680579865444
[34m[1mwandb[0m: 	gamma: 0.9655166989159883
[34m[1mwandb[0m: 	lam: 0.9051284899170832
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	n_updates_per_iteration: 17
[34m[1mwandb[0m: 	num_envs: 89
[34m[1mwandb[0m: 	num_steps: 1300



-------------------- Iteration #1 --------------------
Average Episodic Return: -229.58
Average Actor Loss: -0.21186
Average Critic Loss: 1286.7947630662184
Average KL Divergence: 0.012249548944572988
Iteration took: 12.34 secs, of which rollout took 10.5 secs and gradient updates took 1.84 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -153.72
Average Actor Loss: -0.21372
Average Critic Loss: 946.1518800295316
Average KL Divergence: 0.011448690700322353
Iteration took: 13.09 secs, of which rollout took 11.14 secs and gradient updates took 1.93 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #1 --------------------
Average adp Loss: 2.61578
Iteration took: 8.93 secs, of which rollout took 8.24 secs and gra

0,1
actor_learning_rate,▁▁
adp_learning_rate,▁
average_actor_loss,█▁
average_adapt_loss,▁
average_critic_loss,█▁
average_episode_rewards,▁█
critic_learning_rate,▁▁
simulated_iterations,▁█▁
validation_duration,▁
validation_rewards,▁

0,1
actor_learning_rate,0.005
adp_learning_rate,0.005
average_actor_loss,-0.21372
average_adapt_loss,2.61578
average_critic_loss,946.15188
average_episode_rewards,-153.71815
critic_learning_rate,0.0075
iteration_compute,8.93
simulated_iterations,1.0
validation_duration,119.45


[34m[1mwandb[0m: Agent Starting Run: 07sj19af with config:
[34m[1mwandb[0m: 	actor_lr: 0.07727284594022034
[34m[1mwandb[0m: 	adp_lr: 0.06563532485614942
[34m[1mwandb[0m: 	adp_num_steps: 340
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	batches: 240
[34m[1mwandb[0m: 	critic_lr: 0.004400780044034726
[34m[1mwandb[0m: 	gamma: 0.9768005088270716
[34m[1mwandb[0m: 	lam: 0.982104341104846
[34m[1mwandb[0m: 	max_grad_norm: 8
[34m[1mwandb[0m: 	n_updates_per_iteration: 11
[34m[1mwandb[0m: 	num_envs: 69
[34m[1mwandb[0m: 	num_steps: 3300



-------------------- Iteration #1 --------------------
Average Episodic Return: -173.09
Average Actor Loss: -0.2101
Average Critic Loss: 1285.3066014216497
Average KL Divergence: 0.009372806259824966
Iteration took: 11.38 secs, of which rollout took 9.41 secs and gradient updates took 1.97 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -159.82
Average Actor Loss: -0.21188
Average Critic Loss: 898.9989279820369
Average KL Divergence: 0.009518731126319416
Iteration took: 11.79 secs, of which rollout took 9.51 secs and gradient updates took 2.26 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #1 --------------------
Average adp Loss: 2.67086
Iteration took: 8.12 secs, of which rollout took 7.49 secs and gradi

In [None]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Best Hyperparameters: {'lr': 0.033773968633186116, 'lam': 0.965122224947915, 'gamma': 0.9391731546579618, 'lr_gamma': 0.99964198121568, 'max_grad_norm': 1.0549291822676827, 'n_sgd_batches': 8, 'timesteps_per_batch': 6600, 'n_updates_per_iteration': 17, 'max_timesteps_per_episode': 800}
Best Metrics: {'val_rewards': -117.685825451997, '_runtime': 18.8714706, '_step': 2, '_timestamp': 1733497014.9633105, '_wandb': {'runtime': 18}, 'average_episode_lengths': 97.5, 'average_episode_rewards': -214.25900286086735, 'average_loss': 0.001098420703783631, 'learning_rate': 0.004990008995201681, 'max_reward_video': {'_type': 'video-file', 'path': 'media/videos/max_reward_video_2_de1368eb4a9cffe45bc9.mp4', 'sha256': 'de1368eb4a9cffe45bc98fb1781e53c0619e8cb010e012a453b94f289e4f54ad', 'size': 10645}, 'simulated_iterations': 2, 'simulated_timesteps': 9719, 'validation_duration': 76.4}
