In [1]:
# Optional cell: it only introduces the environment and may be skipped.
import gym

# Build the environment (the continuous-control variant is LunarLanderContinuous-v2).
env = gym.make('LunarLander-v2')

# The environment has to be reset once before it can be stepped.
env.reset()

# Collect a random action, the observation-space shape and a random observation,
# then print them in that order.
for label, value in (
    ("sample action:", env.action_space.sample()),
    ("observation space shape:", env.observation_space.shape),
    ("sample observation:", env.observation_space.sample()),
):
    print(label, value)

env.close()

# Actions are discrete: a single integer 0, 1, 2 or 3.
#   0 - do nothing
#   1 - fire the left engine
#   2 - fire the bottom engine
#   3 - fire the right engine

sample action: 2
observation space shape: (8,)
sample observation: [ 0.28221577 -0.21086727 -2.7981162   1.838166    0.85746425 -0.873713
  0.84509     0.15653042]


In [2]:
# Close the environment (repeats the close() call at the end of the previous cell).
env.close()

In [3]:
# RL using A2C 

from stable_baselines3 import A2C

In [4]:
# Build the A2C agent with an MLP policy and train it

model = A2C(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=10_000)    # train for 10000 environment steps


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  if not isinstance(terminated, (bool, np.bool8)):


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 114      |
|    ep_rew_mean        | -332     |
| time/                 |          |
|    fps                | 517      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.25    |
|    explained_variance | 0.0231   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -9.89    |
|    value_loss         | 92.2     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 99.6     |
|    ep_rew_mean        | -389     |
| time/                 |          |
|    fps                | 511      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x1a94ad1ae90>

In [11]:
# Testing the results with 10 episodes, printing the reward of every step
episodes = 10

for ep in range(episodes):
    obs, info = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs)
        # gym >= 0.26 returns (obs, reward, terminated, truncated, info).
        obs, rewards, terminated, truncated, info = env.step(action)
        # The episode is over when the environment reports a terminal state
        # OR when it was cut short (e.g. by the time limit). Checking only the
        # first flag would keep stepping a truncated episode forever.
        done = terminated or truncated
        # NOTE(review): with gym >= 0.26 render() only draws when the env was
        # created with a render_mode (e.g. gym.make(..., render_mode="human"));
        # otherwise it just emits a warning - confirm the intended setup.
        env.render()
        print(rewards)

  gym.logger.warn(


1.487112660606699
1.9111482244235287
2.047139648546421
1.7877037395256263
0.9395705378706236
0.8104819438681841
0.23228249415359528
-0.19039351214257635
-0.7615198342622864
-2.9874244809861623
-1.2452709118902578
-1.860363080372166
-0.2217131140731851
-2.431454239837477
0.3284059492499239
-3.324261863720352
-3.4912421006137606
1.0274870420688103
1.5205779945849656
-0.1898458099474169
0.28048265745765094
-3.507921457858457
0.43585634450640215
-2.81621507140744
-3.847991453791819
-4.426173468936099
1.166498991438982
-0.3617688172632995
-4.29499378268764
-4.199652569534151
-3.8157108780827955
2.088260347980179
-3.50973274364989
-2.152876448063256
-3.1339195003938856
-2.8339840744429794
-2.6124316462363297
-0.2806395094668119
-0.8454706703644661
-0.15077523711757407
-2.1052510441858656
-2.366475938293621
-3.5062008893154255
-1.090645705733549
-0.8379550685872619
-0.6385470998552012
-0.5226057567285511
-0.4762392914183511
-0.7099174559097594
-1.4069454598117261
0.0668832764226022
0.00343744

KeyboardInterrupt: 

In [None]:
# Close the environment after the test episodes.
env.close()

In [12]:
# 10000 steps apparently wasn't enough time to train the agent! Let's try 100000 steps

import gym
from stable_baselines3 import A2C

env = gym.make('LunarLander-v2')  # continuous: LunarLanderContinuous-v2
env.reset()

model = A2C('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=100000)       # 100000 steps for learning

# Test the trained agent for a few episodes, printing the reward of every step.
episodes = 5

for ep in range(episodes):
    # gym >= 0.26: reset() returns (obs, info), not just obs.
    obs, info = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs)
        # gym >= 0.26: step() returns (obs, reward, terminated, truncated, info).
        obs, rewards, terminated, truncated, info = env.step(action)
        # Stop on a terminal state or when the episode is truncated (e.g. time limit).
        done = terminated or truncated
        # NOTE(review): render() only draws when the env was created with a
        # render_mode; creating the *training* env with render_mode="human"
        # would slow learning down, so a separate test env may be preferable.
        env.render()
        print(rewards)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  if not isinstance(terminated, (bool, np.bool8)):


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 106      |
|    ep_rew_mean        | -269     |
| time/                 |          |
|    fps                | 458      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.628   |
|    explained_variance | -0.00133 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -8.51    |
|    value_loss         | 328      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 110      |
|    ep_rew_mean        | -368     |
| time/                 |          |
|    fps                | 451      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|

KeyboardInterrupt: 

In [None]:
# Close the environment after the A2C run.
env.close()

In [None]:
# Using a different algorithm (PPO instead of A2C)
import gym
from stable_baselines3 import PPO

env = gym.make('LunarLander-v2')  # continuous: LunarLanderContinuous-v2
env.reset()

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=100000)       # 100000 steps for learning

# Test the trained agent for a few episodes, printing the reward of every step.
episodes = 5

for ep in range(episodes):
    # gym >= 0.26: reset() returns (obs, info), not just obs.
    obs, info = env.reset()
    done = False
    while not done:
        action, _states = model.predict(obs)
        # gym >= 0.26: step() returns (obs, reward, terminated, truncated, info).
        obs, rewards, terminated, truncated, info = env.step(action)
        # Stop on a terminal state or when the episode is truncated (e.g. time limit).
        done = terminated or truncated
        # NOTE(review): render() only draws when the env was created with a
        # render_mode; creating the *training* env with render_mode="human"
        # would slow learning down, so a separate test env may be preferable.
        env.render()
        print(rewards)


In [None]:
# Close the environment after the PPO run.
env.close()

In [None]:
# Saving and loading models 

### When we trained for 10000 steps and then decided to try 100000 steps, we had to start all over. If we want to try 1,000,000 steps, we'll need to start over yet again.
### It makes a lot more sense to save models along the way and simply keep training until you're happy with the model, or until you want to change it in some way that requires starting over.

* With this in mind, how might we save and load models?

In [None]:
import gym
from stable_baselines3 import PPO
import os


models_dir = "models/PPO"   # Directory of models for PPO
logdir = "logs"             # Directory passed to the model as its tensorboard_log

# exist_ok=True makes the cell safe to re-run when the directories already exist.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

env = gym.make('LunarLander-v2')
env.reset()

model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

TIMESTEPS = 10000   # timesteps requested per learn() call
# Count iterations from 1 so that every checkpoint is named after the number of
# timesteps requested so far (10000, 20000, ..., 300000). Counting from 0 would
# name the model saved after the first 10000 steps "0".
for i in range(1, 31):
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
    # Note the reset_num_timesteps=False. This allows us to see the actual total number of timesteps for the model
    # rather than resetting every iteration. We're also setting a constant for however many timesteps we want to do
    # per iteration.
    model.save(f"{models_dir}/{TIMESTEPS*i}")

In [None]:
# To follow training, run `tensorboard --logdir=logs` in a terminal and open http://localhost:6006/


import gym
from stable_baselines3 import A2C
import os


models_dir = "models/A2C"   # Directory of models for A2C
logdir = "logs"             # Directory passed to the model as its tensorboard_log

# exist_ok=True makes the cell safe to re-run when the directories already exist.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

env = gym.make('LunarLander-v2')
env.reset()

model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

TIMESTEPS = 10000   # timesteps requested per learn() call
# Count iterations from 1 so that every checkpoint is named after the number of
# timesteps requested so far (10000, 20000, ..., 300000). Counting from 0 would
# name the model saved after the first 10000 steps "0".
for i in range(1, 31):
    # reset_num_timesteps=False keeps the timestep counter running across
    # learn() calls instead of restarting it on every iteration.
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="A2C")
    model.save(f"{models_dir}/{TIMESTEPS*i}")

### TensorBoard makes it convenient to compare the two runs