# PPO
---

### 1. Import the Necessary Packages

In [1]:
import gym
import torch
%matplotlib inline
import wandb

from model.ppo_parallel import PPO
from model.network import ActorCritic
from model.environments import LunarContinuous

### 2. Instantiate the Model

Set up the hyperparameters in the code cell below.

In [6]:
# Settings from an earlier experiment, kept for reference only (not used below):
#   timesteps_per_batch=1024, max_timesteps_per_episode=1200,
#   n_updates_per_iteration=5, lr=2.5e-4, gamma=0.95, clip=0.2

# Main training hyperparameters: unpacked into PPO(...) and recorded as the wandb run config.
hyperparameters = dict(
    timesteps_per_batch=4096,  # number of timesteps to run per batch
    batch_size=32,             # presumably the minibatch size per update -- confirm in PPO
    gamma=0.9995,              # discount factor applied when calculating rewards-to-go
    lr_gamma=0.999,            # presumably the multiplicative learning-rate decay factor -- confirm in PPO
    clip_range=0.2,            # presumably the threshold used to clip the PPO ratio -- confirm in PPO
    lr=0.005,                  # learning rate at the start of training
)

# Extra settings that are unpacked into PPO(...) but are not part of the wandb config.
misc_hyperparameters = dict(
    num_workers=8,  # presumably the number of parallel rollout workers -- confirm in PPO
    seed=None,      # no fixed seed is supplied
)

Initialise the wandb session in the code cell below.

In [7]:
# Authenticate with Weights & Biases, then open a run in the "lunar" project
# with the training hyperparameters recorded as the run's config.
wandb.login()
run = wandb.init(project="lunar", config=hyperparameters)



0,1
average_episode_lengths,▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▃█▁▂▂▂▁▁▁█▁▄▂▂▄▄▂▃▂▅▂▃▆▆▄
average_episode_rewards,▁▁▃▃▃▄▄▅▆▆▆▅▆▇▆▆█▇▇█▇▇▄▇▇▇▅█▁▇██▇▇█▇▇▆██
average_loss,▁▁▃▃▄▅▅▄▃▄▅▅▅▅▅▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██████
learning_rate,███▇▇▇▇▇▆▆▆▅▅▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
simulated_iterations,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇████
simulated_timesteps,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇█

0,1
average_episode_lengths,266.52381
average_episode_rewards,22.54725
average_loss,-0.00061
learning_rate,0.00012
simulated_iterations,142.0
simulated_timesteps,629612.0


Initialise the model for the desired number of timesteps. Alternatively, a checkpoint can be specified to continue training.

In [8]:

# Toggle: set to True to build an ActorCritic network and restore its weights from a checkpoint.
LOAD_MODEL = False
model = None

if LOAD_MODEL:
    # Path of the saved state dict to restore.
    checkpoint = 'ppo_parallel_checkpoints/ppo_policy_5.pth'
    # The environment is created here only to read the observation/action sizes for the network.
    env = LunarContinuous().make_environment()
    model = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0])
    model.load_state_dict(torch.load(checkpoint))

# NOTE(review): `model` is not passed to PPO below, so a loaded checkpoint appears to be
# unused -- confirm how PPO accepts a pre-loaded network and wire it in.
ppo = PPO( **hyperparameters, **misc_hyperparameters)

### 3. Train the Model

Train the model for the specified number of timesteps.

In [9]:
# Training budget handed to ppo.learn; presumably counted in environment timesteps
# (the training log reports "Timesteps So Far") -- confirm against PPO.learn.
total_timesteps_to_train =  2_000_000
ppo.learn(total_timesteps_to_train)


-------------------- Iteration #1 --------------------
Average Episodic Length: 125.33
Average Episodic Return: -316.23
Average Loss: 0.0027
Timesteps So Far: 4136
Iteration took: 6.76 secs
Current learning rate: 0.004975049950024995
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 128.33
Average Episodic Return: -297.82
Average Loss: 0.00011
Timesteps So Far: 8371
Iteration took: 6.66 secs
Current learning rate: 0.0049502244010487416
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 109.97
Average Episodic Return: -256.12
Average Loss: -0.0011
Timesteps So Far: 12550
Iteration took: 5.19 secs
Current learning rate: 0.00492552273181001
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 166.64
Average Episodic Return: -218.33
Ave

KeyboardInterrupt: 

### 4. Evaluate the Model

Run multiple episodes from the pretrained model.

In [6]:
# Number of rendered evaluation episodes. Bounded so the cell finishes on its own
# instead of looping until the kernel is interrupted.
NUM_EVAL_EPISODES = 5

env = LunarContinuous(render_mode='human').make_environment()
try:
    for episode in range(1, NUM_EVAL_EPISODES + 1):
        obs, _ = env.reset()
        done = False

        ep_len = 0            # episodic length (number of steps taken)
        ep_ret = 0            # episodic return (sum of rewards)

        while not done:
            # Query the action from the policy; gradients are not needed for evaluation.
            with torch.no_grad():
                action = ppo.actor(obs)
            obs, rew, terminated, truncated, _ = env.step(action.detach().numpy())

            # The episode ends on either a terminal state or a truncation.
            done = terminated or truncated

            # Track episodic length and sum the rewards as we go along.
            ep_len += 1
            ep_ret += rew

        print(f"Episode {episode}: length = {ep_len}, return = {ep_ret:.2f}")
finally:
    # Always release the environment, even if evaluation is interrupted.
    # Assumes make_environment() returns a Gym-style env exposing close() -- confirm.
    env.close()

KeyboardInterrupt: 