# PPO
---

### 1. Import the Necessary Packages

In [1]:
from model.ppo_3 import PPO
from env.wrappers import LunarContinuous, LunarLanderWithUnknownWind,LunarLanderWithKnownWind
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Set up the hyperparameters in the code cell below.

In [2]:
# Earlier configurations, kept inactive for reference:
#
# hyperparameters = {
#     'timesteps_per_batch': 1024,          # Number of timesteps to run per batch
#     'max_timesteps_per_episode': 1200,    # Max number of timesteps per episode
#     'n_updates_per_iteration': 5,         # Number of times to update actor/critic per iteration
#     'lr': 2.5e-4,                         # Learning rate of actor optimizer
#     'gamma': 0.95,                        # Discount factor to be applied when calculating Rewards-To-Go
# }
#
# hyperparameters = {'gamma': 0.999, 'lr_gamma': 0.995,
#                    'max_timesteps_per_episode': 1200, 'lr': 0.005}

# Empty on purpose: no keyword overrides are forwarded to PPO in this run.
hyperparameters = dict()

# Environment class forwarded to PPO under the `env` keyword.
misc_hyperparameters = dict(env=LunarLanderWithUnknownWind)

Initialise the wandb session in the code cell below.

In [3]:
# Switch for wandb logging; when it is off, `logger` is simply None.
LOG = False

# The writer (and the throwaway env instance used to read its
# hyperparameters for the run config) is only created when LOG is true.
logger = (
    WandbSummaryWritter(
        project='lunar',
        config=misc_hyperparameters['env']().load_hyperparameters(),
    )
    if LOG
    else None
)

Initialise the model for the desired number of timesteps. Alternatively, specify a checkpoint to continue training from.

In [4]:
# Flip LOAD_MODEL to True to continue from the saved state at `checkpoint`.
LOAD_MODEL = False
checkpoint = 'ppo_checkpoints/non_wandb/ppo_policy_50.pth'

# Build the agent from the logger plus both keyword dictionaries.
ppo = PPO(
    logger,
    **hyperparameters,
    **misc_hyperparameters,
)

# Restoring is opt-in and happens only after the agent has been constructed.
if LOAD_MODEL:
    ppo.restore_savestate(checkpoint)

### 3. Train the Model

Train the model for the specified number of timesteps.

In [5]:
# Start training on the `ppo` instance created in the previous cell.
# A per-iteration summary (returns, losses, KL, timings, learning rates) is printed below.
ppo.train()


-------------------- Iteration #1 --------------------
Average Episodic Return: -397.05
Average Actor Loss: -0.20629
Average Critic Loss: 2664.5769790649415
Average KL Divergence: 0.013010116955802705
Iteration took: 2.97 secs, of which rollout took 2.55 secs and gradient updates took 0.42 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -495.71
Average Actor Loss: -0.20985
Average Critic Loss: 1861.0277457017166
Average KL Divergence: 0.010748129774835365
Iteration took: 3.43 secs, of which rollout took 2.62 secs and gradient updates took 0.44 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
Average Validation Return: -326.99
Average Validation Duration: 154.6 secs
------------------------------------------------------



### 4. Evaluate the Model

Run multiple episodes using the pretrained model.

In [6]:
# Inactive alternatives from earlier experiments:
# ppo.validate(val_iter=30)
# ppo.device = 'cuda'

# NOTE(review): numpy is not used by the active code in this cell; imports
# normally live in the first cell — confirm whether a later cell needs `np`.
import numpy as np

# Run validation and unpack the two values it returns.
# NOTE(review): the positional arguments (10, False, True, False) are opaque
# from here; the commented call above suggests the first one is `val_iter`,
# but confirm all four against the PPO.validate signature.
val_rews, val_dur = ppo.validate(10, False, True, False)
# ppo.test()

# wind_vals, base_z, adpt_z = ppo.validate_encoders()

# print(wind_vals)
# print(base_z)
# print(adpt_z)