Fix action scaling for warmup exploration (SAC/DDPG/TD3) (#584)
* Adding action scaling to and from tanh co-domain as a generic utility.

* Formatting

* Adding action squashing to tanh co-domain for DDPG, TD3 and SAC whenever actions are sampled at random from the action_space.

* Unifying other instances of action scaling within SAC, TD3 and DDPG. Adding a test.

* Adding info on fix to changelog.

* Flipping action scaling/unscaling due to confusion by parameter naming. Adding test checking involved algorithms.

* Changing names of local action variables to follow the naming of the action scaling methods used.

* Adding check on scaling inferred actions as well

* Considering learning_starts parameter of SAC and TD3 when checking action scaling.

* Removing misclick addition

* Adding to changelog

* Adding nick to bugfix.

* Removing asserts enforcing symmetric action space (DDPG, TD3, SAC).

* Changelog: non-symmetric action spaces info.

* Test Action Scaling: remove unnecessary wrapping of environment, make action space asymmetric.

* Adding comments

* Missing line break

* Removing unused import.
Antymon authored and araffin committed Nov 26, 2019
1 parent e315ebe commit 05c5717
Showing 10 changed files with 174 additions and 30 deletions.
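
In short, the fix makes warmup and random exploration consistent with the learned policy: actions sampled from the environment's action space are squashed into the tanh co-domain [-1, 1] before being stored in the replay buffer, and actions coming out of the tanh-squashed policy are rescaled back to the environment's bounds before stepping. A minimal sketch of that pattern, assuming any gym Box action space (Pendulum-v0 is only a stand-in here, not part of this commit; the actual helpers are added in stable_baselines/common/math_util.py below):

import gym
import numpy as np

env = gym.make("Pendulum-v0")  # stand-in environment; any Box action space works, symmetric or not
low, high = env.action_space.low, env.action_space.high
obs = env.reset()

# Warmup / random exploration: sample in [low, high], store the squashed version in [-1, 1]
unscaled_action = env.action_space.sample()
buffered_action = 2.0 * (unscaled_action - low) / (high - low) - 1.0  # what goes into the replay buffer

# Learned policy: outputs live in [-1, 1] (tanh), so rescale before stepping the environment
policy_action = np.tanh(np.random.randn(*env.action_space.shape))  # stand-in for a policy output
unscaled_action = low + 0.5 * (policy_action + 1.0) * (high - low)
obs, reward, done, info = env.step(unscaled_action)
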
2 changes: 2 additions & 0 deletions docs/misc/changelog.rst
@@ -27,13 +27,15 @@ New Features:
- Add parameter `exploration_initial_eps` to DQN. (@jdossgollin)
- Add type checking and PEP 561 compliance.
Note: most functions are still not annotated, this will be a gradual process.
- DDPG, TD3 and SAC accept non-symmetric action spaces. (@Antymon)

Bug Fixes:
^^^^^^^^^^
- Fix seeding, so it is now possible to have deterministic results on cpu
- Fix a bug in DDPG where `predict` method with `deterministic=False` would fail
- Fix a bug in TRPO: mean_losses was not initialized, causing the logger to crash when there were no gradients (@MarvineGothic)
- Fix a bug in `cmd_util` from API change in recent Gym versions
- Fix a bug in DDPG, TD3 and SAC where warmup and random exploration actions would end up scaled in the replay buffer (@Antymon)

Deprecations:
^^^^^^^^^^^^^
26 changes: 26 additions & 0 deletions stable_baselines/common/math_util.py
@@ -101,3 +101,29 @@ def discount_with_boundaries(rewards, episode_starts, gamma):
for step in range(n_samples - 2, -1, -1):
discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1])
return discounted_rewards


def scale_action(action_space, action):
"""
Rescale the action from [low, high] to [-1, 1]
(no need for symmetric action space)
:param action_space: (gym.spaces.box.Box)
:param action: (np.ndarray)
:return: (np.ndarray)
"""
low, high = action_space.low, action_space.high
return 2.0 * ((action - low) / (high - low)) - 1.0


def unscale_action(action_space, scaled_action):
"""
Rescale the action from [-1, 1] to [low, high]
(no need for symmetric action space)
:param action_space: (gym.spaces.box.Box)
:param scaled_action: (np.ndarray)
:return: (np.ndarray)
"""
low, high = action_space.low, action_space.high
return low + (0.5 * (scaled_action + 1.0) * (high - low))
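
A quick round-trip usage sketch for the two helpers above (the asymmetric Box and the tolerance are only illustrative):

import numpy as np
from gym.spaces import Box
from stable_baselines.common.math_util import scale_action, unscale_action

action_space = Box(low=np.array([-2000.0, 0.0]), high=np.array([1000.0, 10.0]), dtype=np.float32)

env_action = action_space.sample()                 # in [low, high]
squashed = scale_action(action_space, env_action)  # in [-1, 1]
restored = unscale_action(action_space, squashed)  # back in [low, high]

assert np.all(np.abs(squashed) <= 1.0)
assert np.allclose(restored, env_action, atol=1e-2)  # float32 round-trip tolerance
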
22 changes: 13 additions & 9 deletions stable_baselines/ddpg/ddpg.py
@@ -15,6 +15,7 @@
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.mpi_adam import MpiAdam
from stable_baselines.common.math_util import unscale_action, scale_action
from stable_baselines.ddpg.policies import DDPGPolicy
from stable_baselines.common.mpi_running_mean_std import RunningMeanStd
from stable_baselines.a2c.utils import total_episode_reward_logger
@@ -312,7 +313,7 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n
def _get_pretrain_placeholders(self):
policy = self.policy_tf
# Rescale
deterministic_action = self.actor_tf * np.abs(self.action_space.low)
deterministic_action = unscale_action(self.action_space, self.actor_tf)
return policy.obs_ph, self.actions, deterministic_action

def setup_model(self):
@@ -818,8 +819,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
self.tb_seen_steps = []

rank = MPI.COMM_WORLD.Get_rank()
# we assume symmetric actions.
assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high)

if self.verbose >= 2:
logger.log('Using agent with the following configuration:')
logger.log(str(self.__dict__.items()))
@@ -872,11 +872,15 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
# Randomly sample actions from a uniform distribution
# with a probability self.random_exploration (used in HER + DDPG)
if np.random.rand() < self.random_exploration:
rescaled_action = action = self.action_space.sample()
# Actions sampled from the action space lie in the environment-specific range,
# but the algorithm operates on tanh-squashed actions, so rescale them to [-1, 1]
unscaled_action = self.action_space.sample()
action = scale_action(self.action_space, unscaled_action)
else:
rescaled_action = action * np.abs(self.action_space.low)
# Inferred actions need to be rescaled to the environment's action space before stepping
unscaled_action = unscale_action(self.action_space, action)

new_obs, reward, done, info = self.env.step(rescaled_action)
new_obs, reward, done, info = self.env.step(unscaled_action)

if writer is not None:
ep_rew = np.array([reward]).reshape((1, -1))
@@ -955,8 +959,8 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
return self

eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True)
eval_obs, eval_r, eval_done, _ = self.eval_env.step(eval_action *
np.abs(self.action_space.low))
unscaled_action = unscale_action(self.action_space, eval_action)
eval_obs, eval_r, eval_done, _ = self.eval_env.step(unscaled_action)
if self.render_eval:
self.eval_env.render()
eval_episode_reward += eval_r
@@ -1041,7 +1045,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True):
observation = observation.reshape((-1,) + self.observation_space.shape)
actions, _, = self._policy(observation, apply_noise=not deterministic, compute_q=False)
actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape
actions = actions * np.abs(self.action_space.low) # scale the output for the prediction
actions = unscale_action(self.action_space, actions) # rescale to the environment's bounds

if not vectorized_env:
actions = actions[0]
1 change: 0 additions & 1 deletion stable_baselines/ddpg/policies.py
@@ -23,7 +23,6 @@ def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=Fals
super(DDPGPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale,
add_action_ph=True)
assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"
self.qvalue_fn = None
self.policy = None

1 change: 0 additions & 1 deletion stable_baselines/sac/policies.py
@@ -99,7 +99,6 @@ class SACPolicy(BasePolicy):
def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False):
super(SACPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale)
assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"

self.qf1 = None
self.qf2 = None
20 changes: 11 additions & 9 deletions stable_baselines/sac/sac.py
@@ -9,6 +9,7 @@
from stable_baselines.a2c.utils import total_episode_reward_logger
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.math_util import unscale_action, scale_action
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from stable_baselines.ppo2.ppo2 import safe_mean, get_schedule_fn
from stable_baselines.sac.policies import SACPolicy
@@ -139,7 +140,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000
def _get_pretrain_placeholders(self):
policy = self.policy_tf
# Rescale
deterministic_action = self.deterministic_action * np.abs(self.action_space.low)
deterministic_action = unscale_action(self.action_space, self.deterministic_action)
return policy.obs_ph, self.actions_ph, deterministic_action

def setup_model(self):
@@ -405,22 +406,23 @@ def learn(self, total_timesteps, callback=None,
# from a uniform distribution for better exploration.
# Afterwards, use the learned policy
# if random_exploration is set to 0 (normal setting)
if (self.num_timesteps < self.learning_starts
or np.random.rand() < self.random_exploration):
# No need to rescale when sampling random action
rescaled_action = action = self.env.action_space.sample()
if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
# Actions sampled from the action space lie in the environment-specific range,
# but the algorithm operates on tanh-squashed actions, so rescale them to [-1, 1]
unscaled_action = self.env.action_space.sample()
action = scale_action(self.action_space, unscaled_action)
else:
action = self.policy_tf.step(obs[None], deterministic=False).flatten()
# Add noise to the action (improve exploration,
# not needed in general)
if self.action_noise is not None:
action = np.clip(action + self.action_noise(), -1, 1)
# Rescale from [-1, 1] to the correct bounds
rescaled_action = action * np.abs(self.action_space.low)
# Inferred actions need to be rescaled to the environment's action space before stepping
unscaled_action = unscale_action(self.action_space, action)

assert action.shape == self.env.action_space.shape

new_obs, reward, done, info = self.env.step(rescaled_action)
new_obs, reward, done, info = self.env.step(unscaled_action)

# Store transition in the replay buffer.
self.replay_buffer.add(obs, action, reward, new_obs, float(done))
@@ -519,7 +521,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True):
observation = observation.reshape((-1,) + self.observation_space.shape)
actions = self.policy_tf.step(observation, deterministic=deterministic)
actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape
actions = actions * np.abs(self.action_space.low) # scale the output for the prediction
actions = unscale_action(self.action_space, actions) # rescale to the environment's bounds

if not vectorized_env:
actions = actions[0]
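
From the user's side the change is transparent: predict still returns actions in the environment's native bounds, while the replay buffer now holds the tanh-squashed versions. A small sketch of that contract, assuming the IdentityEnvBox helper used by the new test (the short learn call and the numbers are only illustrative, to fill the buffer):

import numpy as np
from stable_baselines import SAC
from stable_baselines.common.identity_env import IdentityEnvBox

env = IdentityEnvBox(-2000, 1000)
model = SAC("MlpPolicy", env, learning_starts=0)
model.learn(total_timesteps=200)

action, _ = model.predict(env.reset())
assert np.all(action >= env.action_space.low) and np.all(action <= env.action_space.high)

_, buffer_actions, _, _, _ = model.replay_buffer.sample(64)
assert np.all(np.abs(buffer_actions) <= 1.0)  # stored in the tanh co-domain
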
1 change: 0 additions & 1 deletion stable_baselines/td3/policies.py
@@ -23,7 +23,6 @@ class TD3Policy(BasePolicy):
def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False):
super(TD3Policy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale)
assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"

self.qf1 = None
self.qf2 = None
18 changes: 10 additions & 8 deletions stable_baselines/td3/td3.py
@@ -9,6 +9,7 @@
from stable_baselines.a2c.utils import total_episode_reward_logger
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.math_util import unscale_action, scale_action
from stable_baselines.deepq.replay_buffer import ReplayBuffer
from stable_baselines.ppo2.ppo2 import safe_mean, get_schedule_fn
from stable_baselines.sac.sac import get_vars
@@ -120,7 +121,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000
def _get_pretrain_placeholders(self):
policy = self.policy_tf
# Rescale
policy_out = self.policy_out * np.abs(self.action_space.low)
policy_out = unscale_action(self.action_space, self.policy_out)
return policy.obs_ph, self.actions_ph, policy_out

def setup_model(self):
@@ -316,22 +317,23 @@ def learn(self, total_timesteps, callback=None,
# from a uniform distribution for better exploration.
# Afterwards, use the learned policy
# if random_exploration is set to 0 (normal setting)
if (self.num_timesteps < self.learning_starts
or np.random.rand() < self.random_exploration):
# No need to rescale when sampling random action
rescaled_action = action = self.env.action_space.sample()
if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
# Actions sampled from the action space lie in the environment-specific range,
# but the algorithm operates on tanh-squashed actions, so rescale them to [-1, 1]
unscaled_action = self.env.action_space.sample()
action = scale_action(self.action_space, unscaled_action)
else:
action = self.policy_tf.step(obs[None]).flatten()
# Add noise to the action, as the policy
# is deterministic, this is required for exploration
if self.action_noise is not None:
action = np.clip(action + self.action_noise(), -1, 1)
# Rescale from [-1, 1] to the correct bounds
rescaled_action = action * np.abs(self.action_space.low)
unscaled_action = unscale_action(self.action_space, action)

assert action.shape == self.env.action_space.shape

new_obs, reward, done, info = self.env.step(rescaled_action)
new_obs, reward, done, info = self.env.step(unscaled_action)

# Store transition in the replay buffer.
self.replay_buffer.add(obs, action, reward, new_obs, float(done))
@@ -435,7 +437,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True):
actions = np.clip(actions + self.action_noise(), -1, 1)

actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape
actions = actions * np.abs(self.action_space.low) # scale the output for the prediction
actions = unscale_action(self.action_space, actions) # rescale to the environment's bounds

if not vectorized_env:
actions = actions[0]
45 changes: 45 additions & 0 deletions tests/test_action_scaling.py
@@ -0,0 +1,45 @@
import pytest
import numpy as np

from stable_baselines import DDPG, TD3, SAC
from stable_baselines.common.identity_env import IdentityEnvBox

ROLLOUT_STEPS = 100

MODEL_LIST = [
(DDPG, dict(nb_train_steps=0, nb_rollout_steps=ROLLOUT_STEPS)),
(TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)),
(SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)),
(TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS)),
(SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS))
]


@pytest.mark.parametrize("model_class, model_kwargs", MODEL_LIST)
def test_buffer_actions_scaling(model_class, model_kwargs):
"""
Test that actions are scaled to the tanh co-domain before being stored in the replay buffer
for algorithms that use tanh squashing (DDPG, TD3, SAC).
:param model_class: (BaseRLModel) an RL model class
:param model_kwargs: (dict) named arguments passed to the given algorithm
"""

# Check both random and inferred actions, as they may follow different code paths
for random_coeff in [0.0, 1.0]:

env = IdentityEnvBox(-2000, 1000)

model = model_class("MlpPolicy", env, seed=1, random_exploration=random_coeff, **model_kwargs)
model.learn(total_timesteps=ROLLOUT_STEPS)

assert hasattr(model, 'replay_buffer')

buffer = model.replay_buffer

assert buffer.can_sample(ROLLOUT_STEPS)

_, actions, _, _, _ = buffer.sample(ROLLOUT_STEPS)

assert not np.any(actions > np.ones_like(actions))
assert not np.any(actions < -np.ones_like(actions))
68 changes: 67 additions & 1 deletion tests/test_math_util.py
@@ -1,6 +1,8 @@
import tensorflow as tf
import numpy as np
from gym.spaces.box import Box

from stable_baselines.common.math_util import discount_with_boundaries
from stable_baselines.common.math_util import discount_with_boundaries, scale_action, unscale_action


def test_discount_with_boundaries():
@@ -13,3 +15,67 @@ def test_discount_with_boundaries():
discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma)
assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4])
return


def test_scaling_action():
"""
Test scaling of scalars, 1d and 2d vectors of finite, non-NaN real numbers to and from the tanh co-domain (per component)
"""
test_ranges = [(-1, 1), (-10, 10), (-10, 5), (-10, 0), (-10, -5), (0, 10), (5, 10)]

# scalars
for (range_low, range_high) in test_ranges:
check_scaled_actions_from_range(range_low, range_high, scalar=True)

# 1d vectors: wrapped scalars
for test_range in test_ranges:
check_scaled_actions_from_range(*test_range)

# 2d vectors: all combinations of ranges above
for (r1_low, r1_high) in test_ranges:
for (r2_low, r2_high) in test_ranges:
check_scaled_actions_from_range(np.array([r1_low, r2_low], dtype=np.float),
np.array([r1_high, r2_high], dtype=np.float))


def check_scaled_actions_from_range(low, high, scalar=False):
"""
Helper that creates a dummy action space spanning the respective components of low and high,
then checks scaling to and from the tanh co-domain for the low, middle and high values of that space.
:param low: (np.ndarray), (int) or (float)
:param high: (np.ndarray), (int) or (float)
:param scalar: (bool) Whether to treat the range as a scalar or wrap it into a 1d vector
"""

if scalar and (isinstance(low, float) or isinstance(low, int)):
ones = 1.
action_space = Box(low, high, shape=(1,))
else:
low = np.atleast_1d(low)
high = np.atleast_1d(high)
ones = np.ones_like(low)
action_space = Box(low, high)

mid = 0.5 * (low + high)

expected_mapping = [(low, -ones), (mid, 0. * ones), (high, ones)]

for (not_scaled, scaled) in expected_mapping:
assert np.allclose(scale_action(action_space, not_scaled), scaled)
assert np.allclose(unscale_action(action_space, scaled), not_scaled)


def test_batch_shape_invariant_to_scaling():
"""
Test that scaling preserves batch shape for both TensorFlow tensors and NumPy matrices
"""
action_space = Box(np.array([-10., -5., -1.]), np.array([10., 3., 2.]))

tensor = tf.constant(1., shape=[2, 3])
matrix = np.ones((2, 3))

assert scale_action(action_space, tensor).shape == (2, 3)
assert scale_action(action_space, matrix).shape == (2, 3)

assert unscale_action(action_space, tensor).shape == (2, 3)
assert unscale_action(action_space, matrix).shape == (2, 3)
