# PPO
---

### 1. Import the Necessary Packages

In [1]:
%matplotlib inline
import wandb

from model.ppo_2 import PPO
from env.wrappers import LunarContinuous, LunarLanderWithUnknownWind,LunarLanderWithKnownWind
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 0 ,  
    'val_freq': 10,
    'val_iter': 10,
    'env': LunarLanderWithKnownWind
}

Initialise wandb session in the code cell below.

In [3]:
MAX_TOTAL_TIMESTEPS_TO_TRAIN = 500
VAL_ITER = 20
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'validation_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'actor_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'critic_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'adp_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        'lam': {
            'min': 0.9,
            'max': 1.
        },
        'max_grad_norm': {
            "min": 0.1,
            "max": 10., 
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'num_envs': {
            'values': list(range(1, 200))
        },
        'anneal_lr': {
            'values': [True, False]
        },
        'num_steps': {
            'distribution': 'q_uniform',
            'min': 300,
            'max': 1500,
            'q': 100
        },
        'adp_num_steps': {
            'distribution': 'q_uniform',
            'min': 200,
            'max': 1000,
            'q': 10
        },
        'anneal_discount': {
            'distribution': 'q_uniform',
            'min': 1,
            'max': 1000,
            'q': 10
        },
        'batches': {
            'distribution': 'q_uniform',
            "min": 1,     # 2^0
            "max": 1024,  # 2^10
            "q": 2 
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train()

    val_rews, val_dur = ppo.validate(VAL_ITER, True, True)
    ppo.env.close()
    wandb.log({
        "validation_rewards": val_rews,
        "validation_duration": val_dur
        # "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 13u9gqns
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/13u9gqns


[34m[1mwandb[0m: Agent Starting Run: qe2m2ooh with config:
[34m[1mwandb[0m: 	actor_lr: 0.08383349737953438
[34m[1mwandb[0m: 	adp_lr: 0.08241424225906509
[34m[1mwandb[0m: 	adp_num_steps: 930
[34m[1mwandb[0m: 	anneal_discount: 740
[34m[1mwandb[0m: 	anneal_lr: False
[34m[1mwandb[0m: 	batches: 72
[34m[1mwandb[0m: 	critic_lr: 0.015833762392584806
[34m[1mwandb[0m: 	gamma: 0.9007168266877504
[34m[1mwandb[0m: 	lam: 0.9684143868246466
[34m[1mwandb[0m: 	max_grad_norm: 8.605457036937478
[34m[1mwandb[0m: 	n_updates_per_iteration: 15
[34m[1mwandb[0m: 	num_envs: 71
[34m[1mwandb[0m: 	num_steps: 1600
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Return: -261.65
Average Actor Loss: -0.20565
Average Critic Loss: 1944.5666525033805
Average KL Divergence: 0.010875932283950254
Iteration took: 10.99 secs, of which rollout took 8.97 secs and gradient updates took 2.01 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -256.82
Average Actor Loss: -0.21023
Average Critic Loss: 1361.0319314809947
Average KL Divergence: 0.009142016275514975
Iteration took: 11.84 secs, of which rollout took 9.99 secs and gradient updates took 1.83 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #1 --------------------
Average adp Loss: 2.66335
Iteration took: 208.82 secs, of which rollout took 7.69 secs and g

  logger.warn(


MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video6-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video6-episode-0.mp4



                                                                       

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video6-episode-0.mp4




MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video3-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video3-episode-0.mp4



                                                                       

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video3-episode-0.mp4




MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video9-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video9-episode-0.mp4



                                                                       

MoviePy - Done !




MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video9-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video12-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video12-episode-0.mp4



                                                                       

MoviePy - Done !




MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video12-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video4-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video4-episode-0.mp4



                                                                       

MoviePy - Done !




MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video4-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video19-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video19-episode-0.mp4



                                                                       

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video19-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video11-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video11-episode-0.mp4



                                                                       

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video11-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video2-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video2-episode-0.mp4



                                                                       

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video2-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video13-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video13-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video13-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video8-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video8-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video8-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video14-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video14-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video14-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video7-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video7-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video7-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video17-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video17-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video17-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video16-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video16-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video16-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video10-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video10-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video10-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video1-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video1-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video1-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video5-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video5-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video5-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video18-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video18-episode-0.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video18-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video15-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video15-episode-0.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video15-episode-0.mp4
MoviePy - Building video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video0-episode-0.mp4.
MoviePy - Writing video c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video0-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready c:\Users\pmsar\git\tum-adlr-ws25-16\videos\rl-video0-episode-0.mp4


0,1
actor_learning_rate,▁▁
adp_learning_rate,▁
average_actor_loss,█▁
average_adapt_loss,▁
average_critic_loss,█▁
average_episode_rewards,▁█
critic_learning_rate,▁▁
simulated_iterations,▁█▁
validation_duration,▁
validation_rewards,▁

0,1
actor_learning_rate,0.005
adp_learning_rate,0.005
average_actor_loss,-0.21023
average_adapt_loss,2.66335
average_critic_loss,1361.03193
average_episode_rewards,-256.8183
critic_learning_rate,0.0075
iteration_compute,208.82
simulated_iterations,1.0
validation_duration,100.8


In [6]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Best Hyperparameters: {'lam': 0.9650829007037264, 'gamma': 0.9876871229575696, 'adp_lr': 0.08048877951140997, 'batches': 124, 'actor_lr': 0.08648813825739048, 'num_envs': 10, 'anneal_lr': True, 'critic_lr': 0.013400647936257531, 'num_steps': 2900, 'adp_num_steps': 800, 'max_grad_norm': 7.196879928655128, 'anneal_discount': 610, 'n_updates_per_iteration': 3}
Best Metrics: {'_runtime': 31.2162513, '_step': 2, '_timestamp': 1734340538.4323287, '_wandb': {'runtime': 31}, 'actor_learning_rate': 0.005, 'adp_learning_rate': 0.005, 'average_actor_loss': -0.2136077562920176, 'average_adapt_loss': 3.3120418178889217, 'average_critic_loss': 735.8095331632173, 'average_episode_rewards': -219.5822296142578, 'critic_learning_rate': 0.0075, 'iteration_compute': '8.55', 'simulated_iterations': 1}
