Fix action scaling for warmup exploration (SAC/DDPG/TD3) (#584)
* Adding action scaling to and from tanh co-domain as a generic utility.

* Formatting

* Adding action squashing to tanh co-domain for DDPG, TD3 and SAC whenever actions are sampled at random from the action_space.

* Unifying other instances of action scaling within SAC, TD3 and DDPG. Adding a test.

* Adding info on fix to changelog.

* Flipping action scaling/unscaling due to confusion by parameter naming. Adding test checking involved algorithms.

* Changing names of local action variables to follow the naming of the action scaling methods used.

* Adding check on scaling inferred actions as well

* Considering learning_starts parameter of SAC and TD3 when checking action scaling.

* Removing misclick addition

* Adding to changelog

* Adding nick to bugfix.

* Removing asserts enforcing symmetric action space (DDPG, TD3, SAC).

* Changelog: non-symmetric action spaces info.

* Test Action Scaling: remove unnecessary wrapping of environment, make action space asymmetric.

* Adding comments

* Missing line break

* Removing unused import.
Antymon authored and araffin committed Nov 26, 2019
1 parent e315ebe commit 05c5717
Showing 10 changed files with 174 additions and 30 deletions.
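
In short, the fix makes warmup and random exploration consistent with the learned policy: actions sampled from the environment's action space are squashed into the tanh co-domain [-1, 1] before being stored in the replay buffer, and actions coming out of the tanh-squashed policy are rescaled back to the environment's bounds before stepping. A minimal sketch of that pattern, assuming any gym Box action space (Pendulum-v0 is only a stand-in here, not part of this commit; the actual helpers are added in stable_baselines/common/math_util.py below):

import gym
import numpy as np

env = gym.make("Pendulum-v0")  # stand-in environment; any Box action space works, symmetric or not
low, high = env.action_space.low, env.action_space.high
obs = env.reset()

# Warmup / random exploration: sample in [low, high], store the squashed version in [-1, 1]
unscaled_action = env.action_space.sample()
buffered_action = 2.0 * (unscaled_action - low) / (high - low) - 1.0  # what goes into the replay buffer

# Learned policy: outputs live in [-1, 1] (tanh), so rescale before stepping the environment
policy_action = np.tanh(np.random.randn(*env.action_space.shape))  # stand-in for a policy output
unscaled_action = low + 0.5 * (policy_action + 1.0) * (high - low)
obs, reward, done, info = env.step(unscaled_action)
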
2 changes: 2 additions & 0 deletions docs/misc/changelog.rst
@@ -27,13 +27,15 @@ New Features:
- Add parameter `exploration_initial_eps` to DQN. (@jdossgollin)
- Add type checking and PEP 561 compliance.
Note: most functions are still not annotated, this will be a gradual process.
- DDPG, TD3 and SAC accept non-symmetric action spaces. (@Antymon)

Bug Fixes:
^^^^^^^^^^
- Fix seeding, so it is now possible to have deterministic results on cpu
- Fix a bug in DDPG where `predict` method with `deterministic=False` would fail
- Fix a bug in TRPO: mean_losses was not initialized, causing the logger to crash when there were no gradients (@MarvineGothic)
- Fix a bug in `cmd_util` from API change in recent Gym versions
- Fix a bug in DDPG, TD3 and SAC where warmup and random exploration actions would end up scaled in the replay buffer (@Antymon)

Deprecations:
^^^^^^^^^^^^^
26 changes: 26 additions & 0 deletions stable_baselines/common/math_util.py
@@ -101,3 +101,29 @@ def discount_with_boundaries(rewards, episode_starts, gamma):
for step in range(n_samples - 2, -1, -1):
discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1])
return discounted_rewards


def scale_action(action_space, action):
"""
Rescale the action from [low, high] to [-1, 1]
(no need for symmetric action space)
:param action_space: (gym.spaces.box.Box)
:param action: (np.ndarray)
:return: (np.ndarray)
"""
low, high = action_space.low, action_space.high
return 2.0 * ((action - low) / (high - low)) - 1.0


def unscale_action(action_space, scaled_action):
"""
Rescale the action from [-1, 1] to [low, high]
(no need for symmetric action space)
:param action_space: (gym.spaces.box.Box)
:param scaled_action: (np.ndarray)
:return: (np.ndarray)
"""
low, high = action_space.low, action_space.high
return low + (0.5 * (scaled_action + 1.0) * (high - low))
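
A quick round-trip usage sketch for the two helpers above (the asymmetric Box and the tolerance are only illustrative):

import numpy as np
from gym.spaces import Box
from stable_baselines.common.math_util import scale_action, unscale_action

action_space = Box(low=np.array([-2000.0, 0.0]), high=np.array([1000.0, 10.0]), dtype=np.float32)

env_action = action_space.sample()                 # in [low, high]
squashed = scale_action(action_space, env_action)  # in [-1, 1]
restored = unscale_action(action_space, squashed)  # back in [low, high]

assert np.all(np.abs(squashed) <= 1.0)
assert np.allclose(restored, env_action, atol=1e-2)  # float32 round-trip tolerance
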
22 changes: 13 additions & 9 deletions stable_baselines/ddpg/ddpg.py
@@ -15,6 +15,7 @@
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.mpi_adam import MpiAdam
from stable_baselines.common.math_util import unscale_action, scale_action
from stable_baselines.ddpg.policies import DDPGPolicy
from stable_baselines.common.mpi_running_mean_std import RunningMeanStd
from stable_baselines.a2c.utils import total_episode_reward_logger
@@ -312,7 +313,7 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n
def _get_pretrain_placeholders(self):
policy = self.policy_tf
# Rescale
deterministic_action = self.actor_tf * np.abs(self.action_space.low)
deterministic_action = unscale_action(self.action_space, self.actor_tf)
return policy.obs_ph, self.actions, deterministic_action

def setup_model(self):
@@ -818,8 +819,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
self.tb_seen_steps = []

rank = MPI.COMM_WORLD.Get_rank()
# we assume symmetric actions.
assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high)

if self.verbose >= 2:
logger.log('Using agent with the following configuration:')
logger.log(str(self.__dict__.items()))
@@ -872,11 +872,15 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
# Randomly sample actions from a uniform distribution
# with a probability self.random_exploration (used in HER + DDPG)
if np.random.rand() < self.random_exploration:
rescaled_action = action = self.action_space.sample()
# Actions sampled from the action space lie in the environment-specific range,
# but the algorithm operates on tanh-squashed actions, so rescale them to [-1, 1]
unscaled_action = self.action_space.sample()
action = scale_action(self.action_space, unscaled_action)
else:
rescaled_action = action * np.abs(self.action_space.low)
# Inferred actions need to be rescaled to the environment's action space before stepping
unscaled_action = unscale_action(self.action_space, action)

new_obs, reward, done, info = self.env.step(rescaled_action)
new_obs, reward, done, info = self.env.step(unscaled_action)

if writer is not None:
ep_rew = np.array([reward]).reshape((1, -1))
@@ -955,8 +959,8 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
return self

eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True)
eval_obs, eval_r, eval_done, _ = self.eval_env.step(eval_action *
np.abs(self.action_space.low))
unscaled_action = unscale_action(self.action_space, eval_action)
eval_obs, eval_r, eval_done, _ = self.eval_env.step(unscaled_action)
if self.render_eval:
self.eval_env.render()
eval_episode_reward += eval_r
@@ -1041,7 +1045,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True):
observation = observation.reshape((-1,) + self.observation_space.shape)
actions, _, = self._policy(observation, apply_noise=not deterministic, compute_q=False)
actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape
actions = actions * np.abs(self.action_space.low) # scale the output for the prediction
actions = unscale_action(self.action_space, actions) # rescale to the environment's bounds

if not vectorized_env:
actions = actions[0]
1 change: 0 additions & 1 deletion stable_baselines/ddpg/policies.py
@@ -23,7 +23,6 @@ def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=Fals
super(DDPGPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale,
add_action_ph=True)
assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"
self.qvalue_fn = None
self.policy = None

1 change: 0 additions & 1 deletion stable_baselines/sac/policies.py
@@ -99,7 +99,6 @@ class SACPolicy(BasePolicy):
def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False):
super(SACPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale)
assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"

self.qf1 = None
self.qf2 = None
20 changes: 11 additions & 9 deletions stable_baselines/sac/sac.py
@@ -9,6 +9,7 @@
from stable_baselines.a2c.utils import total_episode_reward_logger
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.math_util import unscale_action, scale_action
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from stable_baselines.ppo2.ppo2 import safe_mean, get_schedule_fn
from stable_baselines.sac.policies import SACPolicy
@@ -139,7 +140,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000
def _get_pretrain_placeholders(self):
policy = self.policy_tf
# Rescale
deterministic_action = self.deterministic_action * np.abs(self.action_space.low)
deterministic_action = unscale_action(self.action_space, self.deterministic_action)
return policy.obs_ph, self.actions_ph, deterministic_action

def setup_model(self):
@@ -405,22 +406,23 @@ def learn(self, total_timesteps, callback=None,
# from a uniform distribution for better exploration.
# Afterwards, use the learned policy
# if random_exploration is set to 0 (normal setting)
if (self.num_timesteps < self.learning_starts
or np.random.rand() < self.random_exploration):
# No need to rescale when sampling random action
rescaled_action = action = self.env.action_space.sample()
if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
# Actions sampled from the action space lie in the environment-specific range,
# but the algorithm operates on tanh-squashed actions, so rescale them to [-1, 1]
unscaled_action = self.env.action_space.sample()
action = scale_action(self.action_space, unscaled_action)
else:
action = self.policy_tf.step(obs[None], deterministic=False).flatten()
# Add noise to the action (improve exploration,
# not needed in general)
if self.action_noise is not None:
action = np.clip(action + self.action_noise(), -1, 1)
# Rescale from [-1, 1] to the correct bounds
rescaled_action = action * np.abs(self.action_space.low)
# Inferred actions need to be rescaled to the environment's action space before stepping
unscaled_action = unscale_action(self.action_space, action)

assert action.shape == self.env.action_space.shape

new_obs, reward, done, info = self.env.step(rescaled_action)
new_obs, reward, done, info = self.env.step(unscaled_action)

# Store transition in the replay buffer.
self.replay_buffer.add(obs, action, reward, new_obs, float(done))
@@ -519,7 +521,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True):
observation = observation.reshape((-1,) + self.observation_space.shape)
actions = self.policy_tf.step(observation, deterministic=deterministic)
actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape
actions = actions * np.abs(self.action_space.low) # scale the output for the prediction
actions = unscale_action(self.action_space, actions) # rescale to the environment's bounds

if not vectorized_env:
actions = actions[0]
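
From the user's side the change is transparent: predict still returns actions in the environment's native bounds, while the replay buffer now holds the tanh-squashed versions. A small sketch of that contract, assuming the IdentityEnvBox helper used by the new test (the short learn call and the numbers are only illustrative, to fill the buffer):

import numpy as np
from stable_baselines import SAC
from stable_baselines.common.identity_env import IdentityEnvBox

env = IdentityEnvBox(-2000, 1000)
model = SAC("MlpPolicy", env, learning_starts=0)
model.learn(total_timesteps=200)

action, _ = model.predict(env.reset())
assert np.all(action >= env.action_space.low) and np.all(action <= env.action_space.high)

_, buffer_actions, _, _, _ = model.replay_buffer.sample(64)
assert np.all(np.abs(buffer_actions) <= 1.0)  # stored in the tanh co-domain
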
1 change: 0 additions & 1 deletion stable_baselines/td3/policies.py
@@ -23,7 +23,6 @@ class TD3Policy(BasePolicy):
def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False):
super(TD3Policy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale)
assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"

self.qf1 = None
self.qf2 = None
18 changes: 10 additions & 8 deletions stable_baselines/td3/td3.py
@@ -9,6 +9,7 @@
from stable_baselines.a2c.utils import total_episode_reward_logger
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.math_util import unscale_action, scale_action
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from stable_baselines.ppo2.ppo2 import safe_mean, get_schedule_fn
from stable_baselines.sac.sac import get_vars
@@ -120,7 +121,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000
def _get_pretrain_placeholders(self):
policy = self.policy_tf
# Rescale
policy_out = self.policy_out * np.abs(self.action_space.low)
policy_out = unscale_action(self.action_space, self.policy_out)
return policy.obs_ph, self.actions_ph, policy_out

def setup_model(self):
@@ -316,22 +317,23 @@ def learn(self, total_timesteps, callback=None,
# from a uniform distribution for better exploration.
# Afterwards, use the learned policy
# if random_exploration is set to 0 (normal setting)
if (self.num_timesteps < self.learning_starts
or np.random.rand() < self.random_exploration):
# No need to rescale when sampling random action
rescaled_action = action = self.env.action_space.sample()
if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
# Actions sampled from the action space lie in the environment-specific range,
# but the algorithm operates on tanh-squashed actions, so rescale them to [-1, 1]
unscaled_action = self.env.action_space.sample()
action = scale_action(self.action_space, unscaled_action)
else:
action = self.policy_tf.step(obs[None]).flatten()
# Add noise to the action, as the policy
# is deterministic, this is required for exploration
if self.action_noise is not None:
action = np.clip(action + self.action_noise(), -1, 1)
# Rescale from [-1, 1] to the correct bounds
rescaled_action = action * np.abs(self.action_space.low)
unscaled_action = unscale_action(self.action_space, action)

assert action.shape == self.env.action_space.shape

new_obs, reward, done, info = self.env.step(rescaled_action)
new_obs, reward, done, info = self.env.step(unscaled_action)

# Store transition in the replay buffer.
self.replay_buffer.add(obs, action, reward, new_obs, float(done))
@@ -435,7 +437,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True):
actions = np.clip(actions + self.action_noise(), -1, 1)

actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape
actions = actions * np.abs(self.action_space.low) # scale the output for the prediction
actions = unscale_action(self.action_space, actions) # rescale to the environment's bounds

if not vectorized_env:
actions = actions[0]
45 changes: 45 additions & 0 deletions tests/test_action_scaling.py
@@ -0,0 +1,45 @@
import pytest
import numpy as np

from stable_baselines import DDPG, TD3, SAC
from stable_baselines.common.identity_env import IdentityEnvBox

ROLLOUT_STEPS = 100

MODEL_LIST = [
(DDPG, dict(nb_train_steps=0, nb_rollout_steps=ROLLOUT_STEPS)),
(TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)),
(SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)),
(TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS)),
(SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS))
]


@pytest.mark.parametrize("model_class, model_kwargs", MODEL_LIST)
def test_buffer_actions_scaling(model_class, model_kwargs):
"""
Test that actions are scaled to the tanh co-domain before being stored in the replay buffer
for algorithms that use tanh squashing (DDPG, TD3, SAC).
:param model_class: (BaseRLModel) an RL model class
:param model_kwargs: (dict) named arguments passed to the given algorithm
"""

# Check both random and inferred actions, as they may follow different code paths
for random_coeff in [0.0, 1.0]:

env = IdentityEnvBox(-2000, 1000)

model = model_class("MlpPolicy", env, seed=1, random_exploration=random_coeff, **model_kwargs)
model.learn(total_timesteps=ROLLOUT_STEPS)

assert hasattr(model, 'replay_buffer')

buffer = model.replay_buffer

assert buffer.can_sample(ROLLOUT_STEPS)

_, actions, _, _, _ = buffer.sample(ROLLOUT_STEPS)

assert not np.any(actions > np.ones_like(actions))
assert not np.any(actions < -np.ones_like(actions))
68 changes: 67 additions & 1 deletion tests/test_math_util.py
@@ -1,6 +1,8 @@
import tensorflow as tf
import numpy as np
from gym.spaces.box import Box

from stable_baselines.common.math_util import discount_with_boundaries
from stable_baselines.common.math_util import discount_with_boundaries, scale_action, unscale_action


def test_discount_with_boundaries():
@@ -13,3 +15,67 @@ def test_discount_with_boundaries():
discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma)
assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4])
return


def test_scaling_action():
"""
Test scaling of scalars, 1d and 2d vectors of finite, non-NaN real numbers to and from the tanh co-domain (per component)
"""
test_ranges = [(-1, 1), (-10, 10), (-10, 5), (-10, 0), (-10, -5), (0, 10), (5, 10)]

# scalars
for (range_low, range_high) in test_ranges:
check_scaled_actions_from_range(range_low, range_high, scalar=True)

# 1d vectors: wrapped scalars
for test_range in test_ranges:
check_scaled_actions_from_range(*test_range)

# 2d vectors: all combinations of ranges above
for (r1_low, r1_high) in test_ranges:
for (r2_low, r2_high) in test_ranges:
check_scaled_actions_from_range(np.array([r1_low, r2_low], dtype=np.float),
np.array([r1_high, r2_high], dtype=np.float))


def check_scaled_actions_from_range(low, high, scalar=False):
"""
Helper that creates a dummy action space spanning the respective components of low and high,
then checks scaling to and from the tanh co-domain for the low, middle and high values of that space.
:param low: (np.ndarray), (int) or (float)
:param high: (np.ndarray), (int) or (float)
:param scalar: (bool) Whether to treat the range as a scalar or wrap it into a 1d vector
"""

if scalar and (isinstance(low, float) or isinstance(low, int)):
ones = 1.
action_space = Box(low, high, shape=(1,))
else:
low = np.atleast_1d(low)
high = np.atleast_1d(high)
ones = np.ones_like(low)
action_space = Box(low, high)

mid = 0.5 * (low + high)

expected_mapping = [(low, -ones), (mid, 0. * ones), (high, ones)]

for (not_scaled, scaled) in expected_mapping:
assert np.allclose(scale_action(action_space, not_scaled), scaled)
assert np.allclose(unscale_action(action_space, scaled), not_scaled)


def test_batch_shape_invariant_to_scaling():
"""
Test that scaling preserves batch shape for both TensorFlow tensors and NumPy matrices
"""
action_space = Box(np.array([-10., -5., -1.]), np.array([10., 3., 2.]))

tensor = tf.constant(1., shape=[2, 3])
matrix = np.ones((2, 3))

assert scale_action(action_space, tensor).shape == (2, 3)
assert scale_action(action_space, matrix).shape == (2, 3)

assert unscale_action(action_space, tensor).shape == (2, 3)
assert unscale_action(action_space, matrix).shape == (2, 3)
