# PPO
---

### 1. Import the Necessary Packages

In [None]:
import gym
import torch
%matplotlib inline
import wandb
import numpy as np
import os

from model.ppo_parallel import PPO
from model.network import ActorCritic
from env.wrappers import LunarContinuous
from gym.wrappers import RecordVideo
from logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 50  ,                           # How often we save in number of iterations
    'num_workers': 8  ,
    'seed': None 
}

Initialise wandb session in the code cell below.

In [3]:
TOTAL_TIMESTEPS_TO_TRAIN = 500_000
VAL_ITER = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'val_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'min': 0.0001,
            'max': 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        'lr_gamma': {
            'min': 0.999,
            'max': 1.
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'max_timesteps_per_episode': {
            'values': list(range(600, 2001, 200))
        },
        'timesteps_per_batch': {
            'values': list(range(600, 5001, 200))
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, lr = config.lr, gamma = config.gamma, n_updates_per_iteration = config.n_updates_per_iteration, max_timesteps_per_episode = config.max_timesteps_per_episode, **misc_hyperparameters)
    ppo.train(TOTAL_TIMESTEPS_TO_TRAIN)

    env = LunarContinuous().make_environment_for_recording()
    val_rews, val_dur = ppo.validate(VAL_ITER,env)

    wandb.log({
        "val_rewards": np.mean(val_rews),
        "validation_duration": np.mean(val_dur),
        "max_reward_video": wandb.Video(f"videos\\rl-video-episode-{np.argmax(val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: znsn2cbw
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/znsn2cbw


[34m[1mwandb[0m: Agent Starting Run: yww6epaf with config:
[34m[1mwandb[0m: 	gamma: 0.9483407015108342
[34m[1mwandb[0m: 	lr: 0.05268468266715958
[34m[1mwandb[0m: 	lr_gamma: 0.9992078042272966
[34m[1mwandb[0m: 	max_timesteps_per_episode: 2000
[34m[1mwandb[0m: 	n_updates_per_iteration: 2
[34m[1mwandb[0m: 	timesteps_per_batch: 4200
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Length: 98.31
Average Episodic Return: -176.43
Average Loss: 0.00131
Timesteps So Far: 4817
Iteration took: 7.11 secs
Current learning rate: 0.0049950019996000405
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 104.61
Average Episodic Return: -178.45
Average Loss: -0.00119
Timesteps So Far: 9629
Iteration took: 6.9 secs
Current learning rate: 0.004990008995201681
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 134.5
Average Episodic Return: -141.81
Average Loss: -0.00169
Timesteps So Far: 14471
Iteration took: 7.98 secs
Current learning rate: 0.004985020981810917
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 108.62
Average Episodic Return: -120.7
Ave