# PPO
---

### 1. Import the Necessary Packages

In [1]:
import gym
import torch
%matplotlib inline
import wandb

from model.ppo_parallel import PPO
from model.network import ActorCritic

### 2. Instantiate the Environment and Agent

Initialize the environment in the code cell below.

In [2]:
env = gym.make('LunarLanderContinuous-v2',render_mode='human')
def make_env():
    return gym.make('LunarLanderContinuous-v2',render_mode='human')
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.shape)

State shape:  (8,)
Number of actions:  (2,)


Setup the hyperparameters in the code cell below.

In [None]:
hyperparameters = {
    'timesteps_per_batch': 4800 ,                # Number of timesteps to run per batch
    'max_timesteps_per_episode': 1600,           # Max number of timesteps per episode
    'n_updates_per_iteration': 5,                # Number of times to update actor/critic per iteration
    'lr': 0.005 ,                                # Learning rate of actor optimizer
    'gamma': 0.95,                               # Discount factor to be applied when calculating Rewards-To-Go
    'clip': 0.2                                 # Recommended 0.2, helps define the threshold to clip the ratio during SGA
}

misc_hyperparameters = {
    'render': True,                              # If we should render during rollout
    'render_every_i': 10 ,                       # Only render every n iterations
    'save_freq': 10  ,                           # How often we save in number of iterations
    'num_workers': 2  ,
    'seed': None 
}

Initialise wandb session.

In [4]:
wandb.login()
run = wandb.init(
    # Set the project where this run will be logged
    project="lunar",
    # Track hyperparameters and run metadata
    config= hyperparameters
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin


Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:

LOAD_MODEL = False
model = None

if LOAD_MODEL:
    checkpoint = 'ppo_parallel_checkpoints/ppo_policy_5.pth'
    model = ActorCritic(env.observation_space.shape[0], env.action_space.shape[0])
    model.load_state_dict(torch.load(checkpoint))

ppo = PPO(model=model, **hyperparameters, **misc_hyperparameters)


-------------------- Iteration #1 --------------------
Average Episodic Length: 110.34
Average Episodic Return: -280.51
Average Loss: -0.00156
Timesteps So Far: 4855
Iteration took: 57.61 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 110.67
Average Episodic Return: -243.2
Average Loss: -0.00279
Timesteps So Far: 9835
Iteration took: 62.7 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 111.2
Average Episodic Return: -182.87
Average Loss: -0.00226
Timesteps So Far: 14839
Iteration took: 63.2 secs
------------------------------------------------------



Train model for specified amount of timestamps.

In [None]:
total_timesteps_to_train = 20000
ppo.learn(total_timesteps_to_train)

In [None]:
model.learn(600000)


-------------------- Iteration #78 --------------------
Average Episodic Length: 1251.8
Average Episodic Return: -87.72
Average Loss: -2e-05
Timesteps So Far: 409072
Iteration took: 613.35 secs
------------------------------------------------------


-------------------- Iteration #79 --------------------
Average Episodic Length: 1600.0
Average Episodic Return: 4.47
Average Loss: -2e-05
Timesteps So Far: 413872
Iteration took: 99.37 secs
------------------------------------------------------


-------------------- Iteration #80 --------------------
Average Episodic Length: 1600.0
Average Episodic Return: 43.11
Average Loss: -2e-05
Timesteps So Far: 418672
Iteration took: 98.96 secs
------------------------------------------------------


-------------------- Iteration #81 --------------------
Average Episodic Length: 1600.0
Average Episodic Return: 43.65
Average Loss: -2e-05
Timesteps So Far: 423472
Iteration took: 98.72 secs
------------------------------------------------------


--

In [None]:
while True:
		obs, _ = env.reset()
		done = False

		# number of timesteps so far
		t = 0

		# Logging data
		ep_len = 0            # episodic length
		ep_ret = 0            # episodic return

		while not done:
			t += 1

			# Query deterministic action from policy and run it
			action, _ = model.policy(obs)
			obs, rew, terminated, truncated, _ = env.step(action.detach().numpy())
			done = terminated | truncated

			# Sum all episodic rewards as we go along
			ep_ret += rew
			
		# Track episodic length
		ep_len = t

KeyboardInterrupt: 