# PPO
---

### 1. Import the Necessary Packages

In [1]:
import gym
import torch
%matplotlib inline
import wandb

from model.ppo_parallel import PPO
from model.network import ActorCritic
from model.environments import LunarContinuous

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'render': True,                              # If we should render during rollout
    'render_every_i': 10 ,                       # Only render every n iterations
    'save_freq': 10  ,                           # How often we save in number of iterations
    'num_workers': 8  ,
    'seed': None 
}

Initialise wandb session in the code cell below.

In [3]:
total_timesteps_to_train =  450_000
sweep_config = {
    'method': 'bayes',  # Could also be 'random' or 'grid'
    'metric': {
        'name': 'average_episode_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'lr': {
            'min': 0.001,
            'max': 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'max_timesteps_per_episode': {
            'values': list(range(0, 2001, 200))
        }
    }
}

In [4]:
def train_model(config = None):
    wandb.login()
    run = wandb.init(
        # Track hyperparameters and run metadata
        config= config
    )
    ppo = ppo = PPO(**misc_hyperparameters) if config is None else PPO( lr = config.lr, gamma = config.gamma, n_updates_per_iteration = config.n_updates_per_iteration, max_timesteps_per_episode = config.max_timesteps_per_episode, **misc_hyperparameters)
    ppo.learn(total_timesteps_to_train)

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 8zeor1qf
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/8zeor1qf


[34m[1mwandb[0m: Agent Starting Run: zfnrrt3w with config:
[34m[1mwandb[0m: 	gamma: 0.935328622614964
[34m[1mwandb[0m: 	lr: 0.04560287565550175
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1400
[34m[1mwandb[0m: 	n_updates_per_iteration: 5
[34m[1mwandb[0m: Currently logged in as: [33mpmsaraiva2712[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Length: 118.26
Average Episodic Return: -256.97
Average Loss: -0.00542
Timesteps So Far: 5440
Iteration took: 11.85 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 120.45
Average Episodic Return: -196.27
Average Loss: -0.00423
Timesteps So Far: 10740
Iteration took: 33.45 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 122.32
Average Episodic Return: -153.11
Average Loss: -0.00398
Timesteps So Far: 16122
Iteration took: 35.65 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 135.85
Average Episodic Return: -207.43
Average Loss: -0.00365
Timesteps So Far: 21692
Iteration took: 37.55 secs
-----------------------------------------------------

0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▂▂▁▂▃▂▄▄▆▃▆▆▇▆▇▇▇▇█▇▇▇▆█▇▇▇▇█▇▇▇
average_episode_rewards,▁▃▄▂▄▄▅▅▅▅▅▆▆▇▇▇█▆▇▇█▇█▆▇█▅▇▅▇▇▇▇█▇▆▇█▇▇
average_loss,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
simulated_iterations,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇███

0,1
average_episode_lengths,1471.875
average_episode_rewards,-2.88998
average_loss,-0.00188
simulated_iterations,46.0
simulated_timesteps,459431.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: u6rohf5x with config:
[34m[1mwandb[0m: 	gamma: 0.9467309306581204
[34m[1mwandb[0m: 	lr: 0.06460251309677582
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1400
[34m[1mwandb[0m: 	n_updates_per_iteration: 8



-------------------- Iteration #1 --------------------
Average Episodic Length: 107.21
Average Episodic Return: -376.76
Average Loss: 0.00243
Timesteps So Far: 5146
Iteration took: 11.02 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 99.21
Average Episodic Return: -376.55
Average Loss: -0.0004
Timesteps So Far: 10404
Iteration took: 33.2 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 108.13
Average Episodic Return: -280.75
Average Loss: -0.00115
Timesteps So Far: 15486
Iteration took: 32.83 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 106.73
Average Episodic Return: -199.86
Average Loss: -0.00149
Timesteps So Far: 20716
Iteration took: 33.03 secs
------------------------------------------------------




0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▁▁▁▂▄▃▄▃▆▅▇███▇█▇▇████▆▇█▇█▇▇▇▆▇
average_episode_rewards,▁▁▃▄▆▅▆▆▇▇▆▇▇▇▇▆▇▆▇▇▇▇██▇▇█▇██▆▆▇▇▇▇▇▆▅▆
average_loss,█▄▃▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂
simulated_iterations,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇██

0,1
average_episode_lengths,1369.77778
average_episode_rewards,-112.78526
average_loss,-0.00159
simulated_iterations,45.0
simulated_timesteps,450545.0


[34m[1mwandb[0m: Agent Starting Run: js973cl7 with config:
[34m[1mwandb[0m: 	gamma: 0.9337008226719842
[34m[1mwandb[0m: 	lr: 0.059858302505945815
[34m[1mwandb[0m: 	max_timesteps_per_episode: 200
[34m[1mwandb[0m: 	n_updates_per_iteration: 12



-------------------- Iteration #1 --------------------
Average Episodic Length: 133.65
Average Episodic Return: -218.04
Average Loss: 0.00124
Timesteps So Far: 6148
Iteration took: 21.41 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 105.74
Average Episodic Return: -199.91
Average Loss: -0.00031
Timesteps So Far: 11435
Iteration took: 35.18 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 99.77
Average Episodic Return: -158.6
Average Loss: -0.00067
Timesteps So Far: 16723
Iteration took: 34.63 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 109.23
Average Episodic Return: -172.32
Average Loss: -0.0011
Timesteps So Far: 21857
Iteration took: 34.7 secs
------------------------------------------------------


-

0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▁▁▁▂▂▃▄▃▄▆▆▅▇▇██████████████████
average_episode_rewards,▁▁▃▂▃▃▄▄▅▅▆▆▅▆▆▆▅▇▇▅▇███▇▆█▅▅▅▃▆▇▅▄▄▄▄▅▅
average_loss,█▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃
simulated_iterations,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██

0,1
average_episode_lengths,1600.0
average_episode_rewards,-57.45316
average_loss,-0.00117
simulated_iterations,44.0
simulated_timesteps,461464.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: da1vi2xg with config:
[34m[1mwandb[0m: 	gamma: 0.9075777897661136
[34m[1mwandb[0m: 	lr: 0.042695884390153815
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1000
[34m[1mwandb[0m: 	n_updates_per_iteration: 17



-------------------- Iteration #1 --------------------
Average Episodic Length: 113.61
Average Episodic Return: -383.08
Average Loss: -0.00195
Timesteps So Far: 5226
Iteration took: 13.28 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 125.24
Average Episodic Return: -307.96
Average Loss: -0.00191
Timesteps So Far: 10486
Iteration took: 37.11 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 144.23
Average Episodic Return: -263.59
Average Loss: -0.00213
Timesteps So Far: 16111
Iteration took: 39.1 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 133.45
Average Episodic Return: -222.29
Average Loss: -0.00218
Timesteps So Far: 21449
Iteration took: 36.96 secs
------------------------------------------------------

0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▂▁▃▂▂▂▃▅▄▅▅▆▇▇█████████████████▇
average_episode_rewards,▁▂▃▄▄▄▅▅▅▆▆▆▆▇▇▇▆▆▇▇▇▇██████▇▇▇▇▆▇▇▇▇▇▇▆
average_loss,▄▅▃▂▂▁▁▂▃▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▇████████
simulated_iterations,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇██

0,1
average_episode_lengths,1479.0
average_episode_rewards,-52.9281
average_loss,-0.00154
simulated_iterations,44.0
simulated_timesteps,460168.0


[34m[1mwandb[0m: Agent Starting Run: 8s0g359g with config:
[34m[1mwandb[0m: 	gamma: 0.9800556456598452
[34m[1mwandb[0m: 	lr: 0.07222313300120063
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1400
[34m[1mwandb[0m: 	n_updates_per_iteration: 11



-------------------- Iteration #1 --------------------
Average Episodic Length: 106.04
Average Episodic Return: -248.77
Average Loss: -0.00149
Timesteps So Far: 5408
Iteration took: 13.21 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 113.83
Average Episodic Return: -224.54
Average Loss: -0.00224
Timesteps So Far: 10872
Iteration took: 36.96 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 110.64
Average Episodic Return: -177.64
Average Loss: -0.00223
Timesteps So Far: 16072
Iteration took: 37.05 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 149.5
Average Episodic Return: -162.04
Average Loss: -0.00214
Timesteps So Far: 22351
Iteration took: 43.28 secs
------------------------------------------------------

0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▄▄▆▄▅▇▇▇█▇▇█▇▇████▇██
average_episode_rewards,▁▂▃▃▄▄▄▅▅▅▅▅▆▅▄▄▄▆▅▅▆▇▆▇▇█▇█▇▆▇▅▇▄▄▄▅▆▆▆
average_loss,█▃▃▄▃▂▁▁▂▁▁▂▂▂▂▂▂▃▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆
simulated_iterations,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇███

0,1
average_episode_lengths,1600.0
average_episode_rewards,-40.28866
average_loss,-0.00181
simulated_iterations,46.0
simulated_timesteps,450849.0


[34m[1mwandb[0m: Agent Starting Run: nz8rr8u1 with config:
[34m[1mwandb[0m: 	gamma: 0.941836778409553
[34m[1mwandb[0m: 	lr: 0.06305825753811355
[34m[1mwandb[0m: 	max_timesteps_per_episode: 800
[34m[1mwandb[0m: 	n_updates_per_iteration: 4



-------------------- Iteration #1 --------------------
Average Episodic Length: 105.66
Average Episodic Return: -295.73
Average Loss: -0.0015
Timesteps So Far: 5283
Iteration took: 13.5 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 115.82
Average Episodic Return: -274.95
Average Loss: -0.00255
Timesteps So Far: 10495
Iteration took: 36.69 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 116.36
Average Episodic Return: -178.59
Average Loss: -0.00245
Timesteps So Far: 15731
Iteration took: 37.15 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 126.33
Average Episodic Return: -179.73
Average Loss: -0.00264
Timesteps So Far: 21163
Iteration took: 37.38 secs
------------------------------------------------------


0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▁▁▂▂▃▄▅▆▆▇▇▇██▇████▇███████████▇
average_episode_rewards,▁▁▄▄▅▅▆▆▆▆▆▇▆▇▆▇▇▇▇▆█▇█▇█▇▇██▇▇▇▆▆▆▇▆▆▆▇
average_loss,█▂▂▁▁▁▁▂▂▃▃▃▃▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
simulated_iterations,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇██

0,1
average_episode_lengths,1491.0
average_episode_rewards,-31.96405
average_loss,-0.00151
simulated_iterations,44.0
simulated_timesteps,460745.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jkfvmcz7 with config:
[34m[1mwandb[0m: 	gamma: 0.9903269163765408
[34m[1mwandb[0m: 	lr: 0.0804439865288309
[34m[1mwandb[0m: 	max_timesteps_per_episode: 800
[34m[1mwandb[0m: 	n_updates_per_iteration: 13



-------------------- Iteration #1 --------------------
Average Episodic Length: 105.48
Average Episodic Return: -224.62
Average Loss: -0.00285
Timesteps So Far: 5063
Iteration took: 13.36 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 112.37
Average Episodic Return: -194.64
Average Loss: -0.00335
Timesteps So Far: 10232
Iteration took: 36.89 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 111.83
Average Episodic Return: -168.65
Average Loss: -0.0031
Timesteps So Far: 15600
Iteration took: 37.28 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 122.58
Average Episodic Return: -157.35
Average Loss: -0.00302
Timesteps So Far: 20871
Iteration took: 37.01 secs
------------------------------------------------------

0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▃▆▆▅█▆▅▆▆▇▅▇▆▅▅▄▅▆▇█▇▇▆██
average_episode_rewards,▁▂▂▃▄▄▄▅▆▅▅▆▅▅▆▅▇▆▆█▇▆▆▆█▅▆▅▅▆▄▄▃▄▃▅▅▄▆▅
average_loss,▃▁▂▂▁▂▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇████████
simulated_iterations,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██

0,1
average_episode_lengths,1425.5
average_episode_rewards,-64.50533
average_loss,-0.00145
simulated_iterations,45.0
simulated_timesteps,450422.0


[34m[1mwandb[0m: Agent Starting Run: skonqokv with config:
[34m[1mwandb[0m: 	gamma: 0.954257570099312
[34m[1mwandb[0m: 	lr: 0.07002971611014354
[34m[1mwandb[0m: 	max_timesteps_per_episode: 1800
[34m[1mwandb[0m: 	n_updates_per_iteration: 2



-------------------- Iteration #1 --------------------
Average Episodic Length: 109.92
Average Episodic Return: -292.9
Average Loss: -0.002
Timesteps So Far: 5276
Iteration took: 13.62 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 113.91
Average Episodic Return: -267.76
Average Loss: -0.00222
Timesteps So Far: 10516
Iteration took: 37.1 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 119.44
Average Episodic Return: -161.96
Average Loss: -0.00214
Timesteps So Far: 15891
Iteration took: 36.98 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 120.34
Average Episodic Return: -176.34
Average Loss: -0.0026
Timesteps So Far: 21186
Iteration took: 37.26 secs
------------------------------------------------------


-

0,1
average_episode_lengths,▁▁▁▁▁▁▁▁▁▁▁▂▄▄▅▄▇▅▆▇▇▆█▆▇▇██████▆▇█▇▆█▇▇
average_episode_rewards,▁▂▄▄▅▅▆▅▆▇▇▇▇▇▇▆▇▆▇▇▆▇█▇█▇▇▆▆▆▆▇▇▆▇▇▆▇▆▄
average_loss,▅▄▄▁▃▄▄▄▃▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█
simulated_iterations,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
simulated_timesteps,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇██

0,1
average_episode_lengths,1397.88889
average_episode_rewards,-142.36776
average_loss,-0.00154
simulated_iterations,43.0
simulated_timesteps,454080.0


[34m[1mwandb[0m: Agent Starting Run: tfix9bmd with config:
[34m[1mwandb[0m: 	gamma: 0.9205507111770382
[34m[1mwandb[0m: 	lr: 0.09122626625467364
[34m[1mwandb[0m: 	max_timesteps_per_episode: 600
[34m[1mwandb[0m: 	n_updates_per_iteration: 7



-------------------- Iteration #1 --------------------
Average Episodic Length: 108.16
Average Episodic Return: -184.09
Average Loss: -0.00351
Timesteps So Far: 5300
Iteration took: 14.28 secs
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Length: 150.52
Average Episodic Return: -192.38
Average Loss: -0.00368
Timesteps So Far: 11622
Iteration took: 37.4 secs
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Length: 111.57
Average Episodic Return: -131.71
Average Loss: -0.00311
Timesteps So Far: 16866
Iteration took: 34.16 secs
------------------------------------------------------


-------------------- Iteration #4 --------------------
Average Episodic Length: 117.6
Average Episodic Return: -131.73
Average Loss: -0.00304
Timesteps So Far: 22158
Iteration took: 32.96 secs
------------------------------------------------------
