# PPO
---

### 1. Import the Necessary Packages

In [1]:
import torch

from model.ppo import PPO
from env.wrappers import LunarContinuous, LunarLanderWithUnknownWind,LunarLanderWithKnownWind
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Set up the hyperparameters in the code cell below.

In [2]:
# Alternative hyperparameter presets, kept for reference. Uncomment one of them
# (and drop the empty dictionary below) to pass these values to PPO.
#
# Preset A:
# hyperparameters = {
#     'timesteps_per_batch': 1024,           # Number of timesteps to run per batch
#     'max_timesteps_per_episode': 1200,     # Max number of timesteps per episode
#     'n_updates_per_iteration': 5,          # Number of times to update actor/critic per iteration
#     'lr': 2.5e-4,                          # Learning rate of actor optimizer
#     'gamma': 0.95,                         # Discount factor to be applied when calculating Rewards-To-Go
# }
#
# Preset B:
# hyperparameters = {'gamma': 0.999, 'lr_gamma': 0.995,
#                    'max_timesteps_per_episode': 1200, 'lr': 0.005}

# Empty on purpose: no extra keyword arguments are forwarded to PPO from here.
hyperparameters = dict()

# Keyword arguments that are not tuning knobs; 'env' holds the environment
# class itself (not an instance).
misc_hyperparameters = dict(env=LunarContinuous)

Initialise the wandb session in the code cell below.

In [3]:
# Switch to True to create a WandbSummaryWritter for the 'lunar' project.
LOG = False

# With logging disabled the model is handed None in place of a writer.
# The config is read from a fresh instance of the selected environment class.
logger = (
    WandbSummaryWritter(
        project='lunar',
        config=misc_hyperparameters['env']().load_hyperparameters(),
    )
    if LOG
    else None
)

Initialise the model for the desired number of timesteps. Alternatively, specify a checkpoint to continue training from.

In [4]:
# Resume settings: flip LOAD_MODEL to True to restore the save state stored at
# `checkpoint` right after the model is built.
LOAD_MODEL = False
checkpoint = 'ppo_checkpoints/non_wandb/ppo_policy_50.pth'

# Build the agent from the logger plus both keyword-argument dictionaries.
ppo = PPO(
    logger,
    **hyperparameters,
    **misc_hyperparameters
)

if LOAD_MODEL:
    ppo.restore_savestate(checkpoint)

### 3. Train the Model

Train the model for the specified number of timesteps.

In [5]:
# Start training on the `ppo` instance built above.
# NOTE(review): the output saved with this cell stops after iteration 3 with a
# UFuncTypeError (ufunc 'add', float64 -> int32, casting rule 'same_kind');
# the cause is not visible in this notebook.
ppo.train()


-------------------- Iteration #1 --------------------
Average Episodic Return: -141.93
Average Loss: 0.00406
Average KL Divergence: 0.008285522199003026
Iteration took: 9.72 secs, of which rollout took 8.05 secs and gradient updates took 1.67 secs
Current learning rate: 0.005
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -121.53
Average Loss: -8e-05
Average KL Divergence: 0.0079110931514466
Iteration took: 9.42 secs, of which rollout took 7.91 secs and gradient updates took 1.51 secs
Current learning rate: 0.00495
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -134.38
Average Loss: -0.00257
Average KL Divergence: 0.008167712063086351
Iteration took: 10.25 secs, of which rollout took 8.5 secs and gradient updates took 1.75 secs
Current learning rate: 0.004851
------------------------------------------------

UFuncTypeError: Cannot cast ufunc 'add' output from dtype('float64') to dtype('int32') with casting rule 'same_kind'

### 4. Evaluate the Model

Run multiple episodes with the pretrained model.

In [None]:
# Other evaluation entry points, kept for reference:
# ppo.validate(val_iter=30)
# ppo.test()

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA instead of hard-coding 'cuda'.
ppo.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Presumably the argument is the number of validation episodes (the saved
# output holds 10 returns and 10 episode lengths) - confirm against PPO.validate.
ppo.validate(10)

([array([ 55.08478653,  76.26484522, 100.0668116 ,  84.75946008,
         119.57279413, 122.86338491,  99.00859018,  82.65465626,
          62.38822349,  47.74654677])],
 [array([1201, 1201, 1201, 1201, 1201, 1201, 1201, 1201, 1201, 1201])])