# PPO
---

### 1. Import the Necessary Packages

In [1]:
import gym
import torch
%matplotlib inline
import wandb

from model.ppo_parallel import PPO
from model.network import ActorCritic
from model.environments import LunarContinuous

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
hyperparameters = {
    'timesteps_per_batch': 4800 ,                # Number of timesteps to run per batch
    'max_timesteps_per_episode': 1600,           # Max number of timesteps per episode
    'n_updates_per_iteration': 5,                # Number of times to update actor/critic per iteration
    'lr': 0.005 ,                                # Learning rate of actor optimizer
    'gamma': 0.95,                               # Discount factor to be applied when calculating Rewards-To-Go
    'clip': 0.2                                 # Recommended 0.2, helps define the threshold to clip the ratio during SGA
}

misc_hyperparameters = {
    'render': True,                              # If we should render during rollout
    'render_every_i': 10 ,                       # Only render every n iterations
    'save_freq': 10  ,                           # How often we save in number of iterations
    'num_workers': 2  ,
    'seed': None 
}

Initialise wandb session in the code cell below.

In [3]:
wandb.login()
run = wandb.init(
    # Set the project where this run will be logged
    project="lunar",
    # Track hyperparameters and run metadata
    config= hyperparameters
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin


Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:

LOAD_MODEL = False
model = None

if LOAD_MODEL:
    checkpoint = 'ppo_parallel_checkpoints/ppo_policy_5.pth'
    env = LunarContinuous().make_environment()
    model = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0])
    model.load_state_dict(torch.load(checkpoint))

ppo = PPO(model=model, **hyperparameters, **misc_hyperparameters)

### 3. Train the Model

Train model for specified amount of timestamps.

In [5]:
total_timesteps_to_train = 20000
ppo.learn(total_timesteps_to_train)


-------------------- Iteration #1 --------------------
Average Episodic Length: 156.14
Average Episodic Return: -257.36
Average Loss: -0.00463
Timesteps So Far: 5621
Iteration took: 71.47 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 120.29
Average Episodic Return: -226.45
Average Loss: -0.00398
Timesteps So Far: 10553
Iteration took: 63.41 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 115.48
Average Episodic Return: -167.26
Average Loss: -0.00374
Timesteps So Far: 15403
Iteration took: 62.0 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 156.8
Average Episodic Return: -156.99
Average Loss: -0.00373
Timesteps So Far: 20891
Iteration took: 69.55 secs
------------------------------------------------------


### 4. Evaluate the Model

Run multiple episodes from pretrained model.

In [6]:
env = LunarContinuous().make_environment()
while True:
		obs, _ = env.reset()
		done = False

		# number of timesteps so far
		t = 0

		ep_len = 0            # episodic length
		ep_ret = 0            # episodic return

		while not done:
			t += 1

			# Query deterministic action from policy and run it
			action, _ = ppo.policy(obs)
			obs, rew, terminated, truncated, _ = env.step(action.detach().numpy())
			done = terminated | truncated

			# Sum all episodic rewards as we go along
			ep_ret += rew
			
		# Track episodic length
		ep_len = t

KeyboardInterrupt: 