In [9]:
#!pip install stable-baselines3[extra]
# !conda install swig # needed to build Box2D in the pip install
# !pip install box2d-py # a repackaged version of pybox2d

In [10]:
import gym
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

In [11]:
# Create environment
env = gym.make('LunarLander-v2')
print(env.action_space.sample())
print(env.observation_space.sample())

0
[ 0.1569122   0.49098596 -1.2012377  -0.00393478  1.360848   -1.1556934
  2.4045413   0.96265286]


In [12]:
# Instantiate the agent
model = DQN('MlpPolicy', env,
            verbose=1,tensorboard_log='./logs',
            learning_rate=5e-4,policy_kwargs={
                'net_arch':[256,256]
            })

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [13]:
# Train the agent
model.learn(total_timesteps=int(5e5),tb_log_name='DQN')
# Save the agent
model.save("dqn_net256_lunar")
del model  # delete trained model to demonstrate loading

Logging to ./logs\DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90.2     |
|    ep_rew_mean      | -262     |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1378     |
|    time_elapsed     | 0        |
|    total_timesteps  | 361      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 104      |
|    ep_rew_mean      | -228     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1579     |
|    time_elapsed     | 0        |
|    total_timesteps  | 829      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | -233     |
|    exploration_rate | 0.977    |
| time/               |        

In [14]:
# Load the trained agent
# NOTE: if you have loading issue, you can pass `print_system_info=True`
# to compare the system on which the model was trained vs the current one
# model = DQN.load("dqn_lunar", env=env, print_system_info=True)
model = DQN.load("dqn_net256_lunar", env=env)

# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(),deterministic=True,render=True, n_eval_episodes=10)
print(mean_reward,std_reward)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
146.2599667 101.88847134290344


In [16]:
episode = 10
# Enjoy trained agent
obs = env.reset()
for i in range(episode):
    dones = False
    rewards = 0
    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, dones, info = env.step(action)
        env.render()
        rewards += reward
    print(rewards)

92.17756477768047
1.2028210934166144
-0.7710245432204264
-0.6132891415311608
1.1896247725657727
-0.8689089466011619
-0.8515214188520399
1.3546029603247807
-0.906917532456698
1.161606216906873
