# PPO
---

### 1. Import the Necessary Packages

In [1]:
import gym
import torch
%matplotlib inline
import wandb
import numpy as np
import os

from model.ppo_parallel import PPO
from model.network import ActorCritic
from model.environments import LunarContinuous
from gym.wrappers import RecordVideo
from logger import WandbSummaryWritter

from model.environments import LunarLanderWithKnownWind, LunarLanderWithUnknownWind

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 100  ,                           # How often we save in number of iterations
    'num_workers': 8  ,
    'seed': None 
}

Initialise wandb session in the code cell below.

In [3]:
TOTAL_TIMESTEPS_TO_TRAIN = 1_000_000
VAL_ITER = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'val_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'min': 0.004,
            'max': 0.006
        },
       'gamma': {
            'distribution': 'uniform',  # Use uniform instead of normal to respect min/max
            'min': 0.9,
            'max': 1.0
        },
        'lr_gamma': {
            'distribution': 'uniform',  # Use uniform instead of normal
            'min': 0.9,
            'max': 1.0
        },
        'lam': {
            'distribution': 'uniform',  # Use uniform instead of normal
            'min': 0.9,
            'max': 1.0
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'max_timesteps_per_episode': {
            'values': list(range(600, 2001, 200))
        },
        'timesteps_per_batch': {
            'values': list(range(600, 5001, 200))
        },
        'n_sgd_batches': {
            'values': list(range(1, 17))
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train(TOTAL_TIMESTEPS_TO_TRAIN)

    val_rews, val_dur = ppo.validate(VAL_ITER, True)

    wandb.log({
        "val_rewards": np.mean(val_rews),
        "validation_duration": np.mean(val_dur),
        "max_reward_video": wandb.Video(f"./videos/rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=20)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: v4fw131s
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/v4fw131s


[34m[1mwandb[0m: Agent Starting Run: it58ivuu with config:
[34m[1mwandb[0m: 	gamma: 0.9992287987109416
[34m[1mwandb[0m: 	lam: 0.9723586650838604
[34m[1mwandb[0m: 	lr: 0.005224464224000474
[34m[1mwandb[0m: 	lr_gamma: 0.9962441162469476
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1400
[34m[1mwandb[0m: 	n_sgd_batches: 1
[34m[1mwandb[0m: 	n_updates_per_iteration: 8
[34m[1mwandb[0m: 	timesteps_per_batch: 4400
[34m[1mwandb[0m: Currently logged in as: [33mmohamedrostom[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Length: 111.0
Average Episodic Return: -230.34
Average Loss: 0.01301
Timesteps So Far: 4884
Iteration took: 5.68 secs
Current learning rate: 0.0049950019996000405
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 114.53
Average Episodic Return: -282.88
Average Loss: 0.0054
Timesteps So Far: 9809
Iteration took: 5.91 secs
Current learning rate: 0.004990008995201681
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 106.78
Average Episodic Return: -211.54
Average Loss: 0.00272
Timesteps So Far: 14614
Iteration took: 5.8 secs
Current learning rate: 0.004985020981810917
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 103.87
Average Episodic Return: -149.61
Aver

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.



-------------------- Iteration #6 --------------------
Average Episodic Length: 116.55
Average Episodic Return: -112.02
Average Loss: -0.00025
Timesteps So Far: 29269
Iteration took: 6.01 secs
Current learning rate: 0.004970086837819016
------------------------------------------------------

