Support custom recurrent policies (#244)
* Make PPO2 support custom recurrent policies

* BasePolicy: make initial_state a property

* Use stateful class attribute rather than issubclass check

* Introduce StatefulActorCriticPolicy to standardize masks_ph/states_ph

* Dead import removal

* Make placeholders and tensors publicly exposed properties

* docs

* DQN bugfix: don't set initial_state property

* Make BasePolicy use abstractmethods

* Update policies.py

* Rename masks_ph -> dones_ph

* Support >1D states

* Bugfix: state shape

* Unit test to verify LSTM policy training works in an environment requiring memory

* Linting

* Use tuples for shapes

* Merge conftest

* Clean-up test cases

* Docs improvements

* Update docs/misc/changelog.rst

Co-Authored-By: AdamGleave <adam@gleave.me>

* Update tests/test_lstm_policy.py

Co-Authored-By: AdamGleave <adam@gleave.me>

* Rename stateful to recurrent

* Doc improvements by araffin

* Update tests/test_lstm_policy.py

Co-Authored-By: AdamGleave <adam@gleave.me>

* Avoid Codacy linting error

* Add missing abstract method

Trying to fix codacy warning
AdamGleave authored and araffin committed Apr 30, 2019
1 parent 0eac3f5 commit baa4aa5
Showing 14 changed files with 294 additions and 90 deletions.
18 changes: 13 additions & 5 deletions conftest.py
@@ -1,14 +1,22 @@
+"""Configures pytest to ignore certain unit tests unless the appropriate flag is passed.
+--rungpu: tests that require GPU.
+--expensive: tests that take a long time to run (e.g. training an RL algorithm for many timesteps)."""
+
 import pytest
 
 
 def pytest_addoption(parser):
     parser.addoption("--rungpu", action="store_true", default=False, help="run gpu tests")
+    parser.addoption("--expensive", action="store_true",
+                     help="run expensive tests (which are otherwise skipped).")
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--rungpu"):
-        return
-    skip_gpu = pytest.mark.skip(reason="need --rungpu option to run")
+    flags = {'gpu': '--rungpu', 'expensive': '--expensive'}
+    skips = {keyword: pytest.mark.skip(reason="need {} option to run".format(flag))
+             for keyword, flag in flags.items() if not config.getoption(flag)}
     for item in items:
-        if "gpu" in item.keywords:
-            item.add_marker(skip_gpu)
+        for keyword, skip in skips.items():
+            if keyword in item.keywords:
+                item.add_marker(skip)
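The new hook keys off pytest markers whose names match the keys of the flags dict ('gpu' and 'expensive'). A minimal sketch of a test module that opts into this skipping logic — the test functions below are made up for illustration and are not part of the commit:

import pytest


@pytest.mark.gpu
def test_needs_cuda_device():
    # Skipped unless pytest is invoked with --rungpu.
    assert True


@pytest.mark.expensive
def test_long_training_run():
    # Skipped unless pytest is invoked with --expensive.
    assert True

Running pytest --expensive would then collect the expensive test while still skipping the GPU-marked one, and vice versa for --rungpu.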
2 changes: 2 additions & 0 deletions docs/misc/changelog.rst
@@ -22,6 +22,8 @@ Pre-Release 2.5.1a0 (WIP)
   ``gail.dataset.record_expert`` supports returning in-memory rather than saving to file.
 - fixed bug where result plotter would crash on very short runs (@Pastafarianist)
 - added option to not trim output of result plotter by number of timesteps (@Pastafarianist)
+- clarified the public interface of ``BasePolicy`` and ``ActorCriticPolicy``. **Breaking change** when using custom policies: ``masks_ph`` is now called ``dones_ph``.
+- support for custom stateful policies.
 
 
 Release 2.5.0 (2019-03-28)
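For context on the "custom stateful policies" entry, here is a minimal sketch of the kind of custom recurrent policy this enables, along the lines of what the new tests/test_lstm_policy.py exercises. The architecture values, the CartPole environment and the timestep budget are arbitrary illustration choices, not taken from the commit:

from stable_baselines import PPO2
from stable_baselines.common.policies import LstmPolicy


class CustomLSTMPolicy(LstmPolicy):
    """LSTM policy with a hand-picked net_arch; the layer sizes here are illustrative only."""
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs):
        super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                         net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])],
                         feature_extraction="mlp", **_kwargs)


# Recurrent policies require the same number of environments at train and test time,
# and PPO2 additionally needs nminibatches to divide the number of environments.
model = PPO2(CustomLSTMPolicy, 'CartPole-v1', nminibatches=1, verbose=0)
model.learn(total_timesteps=1000)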
3 changes: 3 additions & 0 deletions docs/modules/policies.rst
@@ -35,6 +35,9 @@ If you need more control on the policy architecture, you can also create a custo
 Base Classes
 ------------
 
+.. autoclass:: BasePolicy
+  :members:
+
 .. autoclass:: ActorCriticPolicy
   :members:
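The newly documented base classes are also where the publicly exposed placeholder and tensor properties from this commit live. A rough sketch of the resulting public surface, assuming a PPO2 model that keeps its sampling policy in act_model (as this codebase does at the time); the attribute names follow the commit messages above:

from stable_baselines import PPO2
from stable_baselines.common.policies import MlpLstmPolicy

model = PPO2(MlpLstmPolicy, 'CartPole-v1', nminibatches=1)
policy = model.act_model                  # policy instance used for sampling actions

print(policy.obs_ph)                      # observation placeholder (BasePolicy property)
print(policy.initial_state)               # zero recurrent state used at the start of a rollout
print(policy.recurrent)                   # True here; class attribute replacing the issubclass(LstmPolicy) check
print(policy.states_ph, policy.dones_ph)  # recurrent-only placeholders (dones_ph was masks_ph before this commit)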
8 changes: 4 additions & 4 deletions stable_baselines/a2c/a2c.py
@@ -7,7 +7,7 @@
 
 from stable_baselines import logger
 from stable_baselines.common import explained_variance, tf_util, ActorCriticRLModel, SetVerbosity, TensorboardWriter
-from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
+from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy
 from stable_baselines.common.runners import AbstractEnvRunner
 from stable_baselines.a2c.utils import discount_with_dones, Scheduler, find_trainable_variables, mse, \
     total_episode_reward_logger
@@ -105,7 +105,7 @@ def setup_model(self):
 
                 n_batch_step = None
                 n_batch_train = None
-                if issubclass(self.policy, LstmPolicy):
+                if issubclass(self.policy, RecurrentActorCriticPolicy):
                     n_batch_step = self.n_envs
                     n_batch_train = self.n_envs * self.n_steps
 
@@ -126,7 +126,7 @@ def setup_model(self):
                     neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
                     self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                     self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
-                    self.vf_loss = mse(tf.squeeze(train_model._value), self.rewards_ph)
+                    self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph)
                     # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                     # and https://github.com/dennybritz/reinforcement-learning/issues/34
                     # suggest to add an entropy component in order to improve exploration.
@@ -194,7 +194,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ
                   self.rewards_ph: rewards, self.learning_rate_ph: cur_lr}
         if states is not None:
             td_map[self.train_model.states_ph] = states
-            td_map[self.train_model.masks_ph] = masks
+            td_map[self.train_model.dones_ph] = masks
 
         if writer is not None:
             # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
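Apart from the rename, the masks array fed into dones_ph still plays the same role: it flags environments whose episode ended at the previous step so the recurrent state can be reset. A toy NumPy sketch of that gating (the actual reset happens inside the policy's LSTM cell via state * (1 - mask), not in A2C itself; the numbers below are made up):

import numpy as np

n_envs, state_dim = 4, 8
state = np.random.randn(n_envs, state_dim).astype(np.float32)   # carried recurrent state, one row per env
dones = np.array([0., 1., 0., 1.], dtype=np.float32)            # episodes in envs 1 and 3 just ended

# Zero the carried state wherever an episode boundary was crossed.
state = state * (1.0 - dones)[:, None]
assert np.allclose(state[[1, 3]], 0.0)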
10 changes: 5 additions & 5 deletions stable_baselines/acer/acer_simple.py
@@ -10,7 +10,7 @@
 from stable_baselines.acer.buffer import Buffer
 from stable_baselines.common import ActorCriticRLModel, tf_util, SetVerbosity, TensorboardWriter
 from stable_baselines.common.runners import AbstractEnvRunner
-from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
+from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy
 
 
 def strip(var, n_envs, n_steps, flat=False):
@@ -187,7 +187,7 @@ def setup_model(self):
                 self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph)
 
                 n_batch_step = None
-                if issubclass(self.policy, LstmPolicy):
+                if issubclass(self.policy, RecurrentActorCriticPolicy):
                     n_batch_step = self.n_envs
                 n_batch_train = self.n_envs * (self.n_steps + 1)
 
@@ -229,7 +229,7 @@ def custom_getter(getter, name, *args, **kwargs):
                     # (var)_i = variable index by action at step i
                     # shape is [n_envs * (n_steps + 1)]
                     if continuous:
-                        value = train_model.value_fn[:, 0]
+                        value = train_model.value_flat
                     else:
                         value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1)
 
@@ -436,9 +436,9 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w
 
         if states is not None:
            td_map[self.train_model.states_ph] = states
-            td_map[self.train_model.masks_ph] = masks
+            td_map[self.train_model.dones_ph] = masks
             td_map[self.polyak_model.states_ph] = states
-            td_map[self.polyak_model.masks_ph] = masks
+            td_map[self.polyak_model.dones_ph] = masks
 
         if writer is not None:
             # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
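The value_flat property used here (and in the A2C diff above) appears to be the flattened view of the critic output that the old value_fn[:, 0] indexing produced by hand. A tiny NumPy sketch of the shape difference it papers over, with made-up numbers:

import numpy as np

value_fn = np.arange(6.0).reshape(6, 1)   # critic output, shape (n_batch, 1)
value_flat = value_fn[:, 0]               # flattened view, shape (n_batch,) -- what value_flat exposes
assert value_fn.shape == (6, 1)
assert value_flat.shape == (6,)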
6 changes: 3 additions & 3 deletions stable_baselines/acktr/acktr_disc.py
@@ -15,7 +15,7 @@
 from stable_baselines.a2c.utils import Scheduler, find_trainable_variables, calc_entropy, mse, \
     total_episode_reward_logger
 from stable_baselines.acktr import kfac
-from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
+from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy
 from stable_baselines.ppo2.ppo2 import safe_mean
 
 
@@ -123,7 +123,7 @@ def setup_model(self):
 
                 n_batch_step = None
                 n_batch_train = None
-                if issubclass(self.policy, LstmPolicy):
+                if issubclass(self.policy, RecurrentActorCriticPolicy):
                     n_batch_step = self.n_envs
                     n_batch_train = self.n_envs * self.n_steps
 
@@ -229,7 +229,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ
                   self.pg_lr_ph: cur_lr}
         if states is not None:
             td_map[self.train_model.states_ph] = states
-            td_map[self.train_model.masks_ph] = masks
+            td_map[self.train_model.dones_ph] = masks
 
         if writer is not None:
             # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
4 changes: 2 additions & 2 deletions stable_baselines/common/base_class.py
@@ -9,7 +9,7 @@
 import tensorflow as tf
 
 from stable_baselines.common import set_global_seeds
-from stable_baselines.common.policies import LstmPolicy, get_policy_from_name, ActorCriticPolicy
+from stable_baselines.common.policies import get_policy_from_name, ActorCriticPolicy
 from stable_baselines.common.vec_env import VecEnvWrapper, VecEnv, DummyVecEnv
 from stable_baselines import logger
 
@@ -98,7 +98,7 @@ def set_env(self, env):
             assert isinstance(env, VecEnv), \
                 "Error: the environment passed is not a vectorized environment, however {} requires it".format(
                     self.__class__.__name__)
-            assert not issubclass(self.policy, LstmPolicy) or self.n_envs == env.num_envs, \
+            assert not self.policy.recurrent or self.n_envs == env.num_envs, \
                 "Error: the environment passed must have the same number of environments as the model was trained on." \
                 "This is due to the Lstm policy not being capable of changing the number of environments."
             self.n_envs = env.num_envs
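The new self.policy.recurrent check relies on a plain class attribute rather than the class hierarchy, per the "Use stateful class attribute rather than issubclass check" commit above. A standalone sketch of the pattern (not the library source; class and function names below are invented for illustration):

class BasePolicySketch:
    """Stand-in for BasePolicy: non-recurrent by default."""
    recurrent = False


class RecurrentPolicySketch(BasePolicySketch):
    """Stand-in for RecurrentActorCriticPolicy and its subclasses."""
    recurrent = True


def check_env_count(policy_cls, trained_n_envs, new_n_envs):
    # Mirrors the assert in set_env: only recurrent policies pin the number of environments.
    assert not policy_cls.recurrent or trained_n_envs == new_n_envs, \
        "Recurrent policies cannot change the number of environments."


check_env_count(BasePolicySketch, trained_n_envs=4, new_n_envs=1)       # fine: feedforward policy
check_env_count(RecurrentPolicySketch, trained_n_envs=4, new_n_envs=4)  # fine: same env count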
