# PPO
---

### 1. Import the Necessary Packages

In [14]:
import torch
%matplotlib inline

from model.ppo_parallel import PPO
from model.network import ActorCritic
from model.environments import LunarLanderWithUnknownWind
from logger import WandbSummaryWritter 

### 2. Instantiate the Model

Set up the hyperparameters in the code cell below.

In [10]:
# Keyword arguments unpacked into the PPO constructor.
# Reference only (previously commented-out configuration):
#   timesteps_per_batch=1024, max_timesteps_per_episode=1200,
#   n_updates_per_iteration=5, lr=2.5e-4, gamma=0.95, clip=0.2
hyperparameters = dict(
    gamma=0.999,                     # Discount factor used for rewards-to-go
    lr_gamma=0.995,                  # presumably the learning-rate decay factor -- TODO confirm in PPO
    max_timesteps_per_episode=1600,  # Max number of timesteps per episode
    clip_range=0.2,                  # presumably the PPO ratio clipping threshold -- TODO confirm in PPO
    lr=0.005,                        # Initial learning rate
)

# Settings kept apart from `hyperparameters`, which is the only dict handed
# to the wandb logger as its config.
misc_hyperparameters = dict(
    num_workers=2,
    seed=None,  # NOTE(review): None presumably means "unseeded" -- verify against PPO
)

Initialise the wandb session in the code cell below.

In [15]:
# Toggle for Weights & Biases logging. A real boolean (rather than None)
# states the on/off intent explicitly and matches the LOAD_MODEL flag.
LOG = False

# `logger` is always bound: a WandbSummaryWritter when LOG is truthy,
# otherwise None. It is passed as the first argument to PPO further down.
logger = WandbSummaryWritter(project='lunar', config=hyperparameters) if LOG else None

Initialise the model for the desired timesteps. Alternatively, a checkpoint can be specified to continue training (note: the loading code in the cell below is currently commented out).

In [16]:
# Path to a saved policy; only relevant when LOAD_MODEL is True.
checkpoint = 'ppo_parallel_checkpoints/charmed-armadillo-108/ppo_policy_960.pth'
LOAD_MODEL = False

if LOAD_MODEL:
    # Restoring from a checkpoint is not wired up. Fail loudly instead of
    # silently training from scratch while the user believes the checkpoint
    # was loaded.
    # Sketch of the intended logic (untested; LunarContinuous is not imported
    # in this notebook):
    #   env = LunarContinuous().make_environment()
    #   model = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0])
    #   model.load_state_dict(torch.load(checkpoint))
    raise NotImplementedError(
        f"LOAD_MODEL is True but loading '{checkpoint}' is not implemented; "
        "set LOAD_MODEL = False to train from scratch."
    )

# Build the PPO trainer from the logger, the environment class and both
# hyperparameter dicts (unpacked as keyword arguments).
ppo = PPO(logger, LunarLanderWithUnknownWind, **hyperparameters, **misc_hyperparameters)



### 3. Train the Model

Train the model for the specified number of timesteps.

In [17]:
# Training budget handed to ppo.train (the run log reports "Timesteps So Far").
total_timesteps_to_train = 1_000_000

# Start training; a summary block is printed after every iteration.
ppo.train(total_timesteps_to_train)


-------------------- Iteration #1 --------------------
Average Episodic Length: 93.25
Average Episodic Return: -320.26
Average Loss: 0.0137
Timesteps So Far: 4849
Iteration took: 8.01 secs
Current learning rate: 0.004876243765609375
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 92.83
Average Episodic Return: -368.0
Average Loss: 0.00674
Timesteps So Far: 9676
Iteration took: 7.92 secs
Current learning rate: 0.004755550652328859
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 103.66
Average Episodic Return: -304.84
Average Loss: 0.00415
Timesteps So Far: 14548
Iteration took: 8.07 secs
Current learning rate: 0.004637844844091639
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 103.45
Average Episodic Return: -352.49
Averag

KeyboardInterrupt: 

### 4. Evaluate the Model

Run multiple episodes from the pretrained model.

In [None]:
# Evaluate the `ppo` instance created earlier in this notebook. What the
# evaluation does (episode count, rendering) is defined in PPO.test, which is
# not shown here.
ppo.test()

KeyboardInterrupt: 