Support custom recurrent policies (#244)
* Make PPO2 support custom recurrent policies

* BasePolicy: make initial_state a property

* Use stateful class attribute rather than issubclass check

* Introduce StatefulActorCriticPolicy to standardize masks_ph/states_ph

* Dead import removal

* Make placeholders and tensors publicly exposed properties

* docs

* DQN bugfix: don't set initial_state property

* Make BasePolicy use abstractmethods

* Update policies.py

* Rename masks_ph -> dones_ph

* Support >1D states

* Bugfix: state shape

* Unit test to verify LSTM policy training works in an environment requiring memory

* Linting

* Use tuples for shapes

* Merge conftest

* Clean-up test cases

* Docs improvements

* Update docs/misc/changelog.rst

Co-Authored-By: AdamGleave <adam@gleave.me>

* Update tests/test_lstm_policy.py

Co-Authored-By: AdamGleave <adam@gleave.me>

* Rename stateful to recurrent

* Doc improvements by araffin

* Update tests/test_lstm_policy.py

Co-Authored-By: AdamGleave <adam@gleave.me>

* Avoid Codacy linting error

* Add missing abstract method

Trying to fix codacy warning
AdamGleave authored and araffin committed Apr 30, 2019
1 parent 0eac3f5 commit baa4aa5
Showing 14 changed files with 294 additions and 90 deletions.
18 changes: 13 additions & 5 deletions conftest.py
@@ -1,14 +1,22 @@
+"""Configures pytest to ignore certain unit tests unless the appropriate flag is passed.
+--rungpu: tests that require GPU.
+--expensive: tests that take a long time to run (e.g. training an RL algorithm for many timesteps)."""
+
 import pytest
 
 
 def pytest_addoption(parser):
     parser.addoption("--rungpu", action="store_true", default=False, help="run gpu tests")
+    parser.addoption("--expensive", action="store_true",
+                     help="run expensive tests (which are otherwise skipped).")
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--rungpu"):
-        return
-    skip_gpu = pytest.mark.skip(reason="need --rungpu option to run")
+    flags = {'gpu': '--rungpu', 'expensive': '--expensive'}
+    skips = {keyword: pytest.mark.skip(reason="need {} option to run".format(flag))
+             for keyword, flag in flags.items() if not config.getoption(flag)}
     for item in items:
-        if "gpu" in item.keywords:
-            item.add_marker(skip_gpu)
+        for keyword, skip in skips.items():
+            if keyword in item.keywords:
+                item.add_marker(skip)
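The new hook keys off pytest markers whose names match the keys of the flags dict ('gpu' and 'expensive'). A minimal sketch of a test module that opts into this skipping logic — the test functions below are made up for illustration and are not part of the commit:

import pytest


@pytest.mark.gpu
def test_needs_cuda_device():
    # Skipped unless pytest is invoked with --rungpu.
    assert True


@pytest.mark.expensive
def test_long_training_run():
    # Skipped unless pytest is invoked with --expensive.
    assert True

Running pytest --expensive would then collect the expensive test while still skipping the GPU-marked one, and vice versa for --rungpu.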
2 changes: 2 additions & 0 deletions docs/misc/changelog.rst
@@ -22,6 +22,8 @@ Pre-Release 2.5.1a0 (WIP)
   ``gail.dataset.record_expert`` supports returning in-memory rather than saving to file.
 - fixed bug where result plotter would crash on very short runs (@Pastafarianist)
 - added option to not trim output of result plotter by number of timesteps (@Pastafarianist)
+- clarified the public interface of ``BasePolicy`` and ``ActorCriticPolicy``. **Breaking change** when using custom policies: ``masks_ph`` is now called ``dones_ph``.
+- support for custom stateful policies.
 
 
 Release 2.5.0 (2019-03-28)
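For context on the "custom stateful policies" entry, here is a minimal sketch of the kind of custom recurrent policy this enables, along the lines of what the new tests/test_lstm_policy.py exercises. The architecture values, the CartPole environment and the timestep budget are arbitrary illustration choices, not taken from the commit:

from stable_baselines import PPO2
from stable_baselines.common.policies import LstmPolicy


class CustomLSTMPolicy(LstmPolicy):
    """LSTM policy with a hand-picked net_arch; the layer sizes here are illustrative only."""
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs):
        super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                         net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])],
                         feature_extraction="mlp", **_kwargs)


# Recurrent policies require the same number of environments at train and test time,
# and PPO2 additionally needs nminibatches to divide the number of environments.
model = PPO2(CustomLSTMPolicy, 'CartPole-v1', nminibatches=1, verbose=0)
model.learn(total_timesteps=1000)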
3 changes: 3 additions & 0 deletions docs/modules/policies.rst
@@ -35,6 +35,9 @@ If you need more control on the policy architecture, you can also create a custo
 Base Classes
 ------------
 
+.. autoclass:: BasePolicy
+  :members:
+
 .. autoclass:: ActorCriticPolicy
   :members:
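The newly documented base classes are also where the publicly exposed placeholder and tensor properties from this commit live. A rough sketch of the resulting public surface, assuming a PPO2 model that keeps its sampling policy in act_model (as this codebase does at the time); the attribute names follow the commit messages above:

from stable_baselines import PPO2
from stable_baselines.common.policies import MlpLstmPolicy

model = PPO2(MlpLstmPolicy, 'CartPole-v1', nminibatches=1)
policy = model.act_model                  # policy instance used for sampling actions

print(policy.obs_ph)                      # observation placeholder (BasePolicy property)
print(policy.initial_state)               # zero recurrent state used at the start of a rollout
print(policy.recurrent)                   # True here; class attribute replacing the issubclass(LstmPolicy) check
print(policy.states_ph, policy.dones_ph)  # recurrent-only placeholders (dones_ph was masks_ph before this commit)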
8 changes: 4 additions & 4 deletions stable_baselines/a2c/a2c.py
@@ -7,7 +7,7 @@
 
 from stable_baselines import logger
 from stable_baselines.common import explained_variance, tf_util, ActorCriticRLModel, SetVerbosity, TensorboardWriter
-from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
+from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy
 from stable_baselines.common.runners import AbstractEnvRunner
 from stable_baselines.a2c.utils import discount_with_dones, Scheduler, find_trainable_variables, mse, \
     total_episode_reward_logger
@@ -105,7 +105,7 @@ def setup_model(self):
 
                 n_batch_step = None
                 n_batch_train = None
-                if issubclass(self.policy, LstmPolicy):
+                if issubclass(self.policy, RecurrentActorCriticPolicy):
                     n_batch_step = self.n_envs
                     n_batch_train = self.n_envs * self.n_steps
 
@@ -126,7 +126,7 @@ def setup_model(self):
                     neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
                     self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                     self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
-                    self.vf_loss = mse(tf.squeeze(train_model._value), self.rewards_ph)
+                    self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph)
                     # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                     # and https://github.com/dennybritz/reinforcement-learning/issues/34
                     # suggest to add an entropy component in order to improve exploration.
@@ -194,7 +194,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ
                   self.rewards_ph: rewards, self.learning_rate_ph: cur_lr}
         if states is not None:
             td_map[self.train_model.states_ph] = states
-            td_map[self.train_model.masks_ph] = masks
+            td_map[self.train_model.dones_ph] = masks
 
         if writer is not None:
             # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
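Apart from the rename, the masks array fed into dones_ph still plays the same role: it flags environments whose episode ended at the previous step so the recurrent state can be reset. A toy NumPy sketch of that gating (the actual reset happens inside the policy's LSTM cell via state * (1 - mask), not in A2C itself; the numbers below are made up):

import numpy as np

n_envs, state_dim = 4, 8
state = np.random.randn(n_envs, state_dim).astype(np.float32)   # carried recurrent state, one row per env
dones = np.array([0., 1., 0., 1.], dtype=np.float32)            # episodes in envs 1 and 3 just ended

# Zero the carried state wherever an episode boundary was crossed.
state = state * (1.0 - dones)[:, None]
assert np.allclose(state[[1, 3]], 0.0)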
10 changes: 5 additions & 5 deletions stable_baselines/acer/acer_simple.py
@@ -10,7 +10,7 @@
 from stable_baselines.acer.buffer import Buffer
 from stable_baselines.common import ActorCriticRLModel, tf_util, SetVerbosity, TensorboardWriter
 from stable_baselines.common.runners import AbstractEnvRunner
-from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
+from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy
 
 
 def strip(var, n_envs, n_steps, flat=False):
@@ -187,7 +187,7 @@ def setup_model(self):
                 self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph)
 
                 n_batch_step = None
-                if issubclass(self.policy, LstmPolicy):
+                if issubclass(self.policy, RecurrentActorCriticPolicy):
                     n_batch_step = self.n_envs
                 n_batch_train = self.n_envs * (self.n_steps + 1)
 
@@ -229,7 +229,7 @@ def custom_getter(getter, name, *args, **kwargs):
                     # (var)_i = variable index by action at step i
                     # shape is [n_envs * (n_steps + 1)]
                     if continuous:
-                        value = train_model.value_fn[:, 0]
+                        value = train_model.value_flat
                     else:
                         value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1)
 
@@ -436,9 +436,9 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w
 
         if states is not None:
            td_map[self.train_model.states_ph] = states
-            td_map[self.train_model.masks_ph] = masks
+            td_map[self.train_model.dones_ph] = masks
             td_map[self.polyak_model.states_ph] = states
-            td_map[self.polyak_model.masks_ph] = masks
+            td_map[self.polyak_model.dones_ph] = masks
 
         if writer is not None:
             # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
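The value_flat property used here (and in the A2C diff above) appears to be the flattened view of the critic output that the old value_fn[:, 0] indexing produced by hand. A tiny NumPy sketch of the shape difference it papers over, with made-up numbers:

import numpy as np

value_fn = np.arange(6.0).reshape(6, 1)   # critic output, shape (n_batch, 1)
value_flat = value_fn[:, 0]               # flattened view, shape (n_batch,) -- what value_flat exposes
assert value_fn.shape == (6, 1)
assert value_flat.shape == (6,)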
6 changes: 3 additions & 3 deletions stable_baselines/acktr/acktr_disc.py
@@ -15,7 +15,7 @@
 from stable_baselines.a2c.utils import Scheduler, find_trainable_variables, calc_entropy, mse, \
     total_episode_reward_logger
 from stable_baselines.acktr import kfac
-from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
+from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy
 from stable_baselines.ppo2.ppo2 import safe_mean
 
 
@@ -123,7 +123,7 @@ def setup_model(self):
 
                 n_batch_step = None
                 n_batch_train = None
-                if issubclass(self.policy, LstmPolicy):
+                if issubclass(self.policy, RecurrentActorCriticPolicy):
                     n_batch_step = self.n_envs
                     n_batch_train = self.n_envs * self.n_steps
 
@@ -229,7 +229,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ
                   self.pg_lr_ph: cur_lr}
         if states is not None:
             td_map[self.train_model.states_ph] = states
-            td_map[self.train_model.masks_ph] = masks
+            td_map[self.train_model.dones_ph] = masks
 
         if writer is not None:
             # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
4 changes: 2 additions & 2 deletions stable_baselines/common/base_class.py
@@ -9,7 +9,7 @@
 import tensorflow as tf
 
 from stable_baselines.common import set_global_seeds
-from stable_baselines.common.policies import LstmPolicy, get_policy_from_name, ActorCriticPolicy
+from stable_baselines.common.policies import get_policy_from_name, ActorCriticPolicy
 from stable_baselines.common.vec_env import VecEnvWrapper, VecEnv, DummyVecEnv
 from stable_baselines import logger
 
@@ -98,7 +98,7 @@ def set_env(self, env):
             assert isinstance(env, VecEnv), \
                 "Error: the environment passed is not a vectorized environment, however {} requires it".format(
                     self.__class__.__name__)
-            assert not issubclass(self.policy, LstmPolicy) or self.n_envs == env.num_envs, \
+            assert not self.policy.recurrent or self.n_envs == env.num_envs, \
                 "Error: the environment passed must have the same number of environments as the model was trained on." \
                 "This is due to the Lstm policy not being capable of changing the number of environments."
             self.n_envs = env.num_envs
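The new self.policy.recurrent check relies on a plain class attribute rather than the class hierarchy, per the "Use stateful class attribute rather than issubclass check" commit above. A standalone sketch of the pattern (not the library source; class and function names below are invented for illustration):

class BasePolicySketch:
    """Stand-in for BasePolicy: non-recurrent by default."""
    recurrent = False


class RecurrentPolicySketch(BasePolicySketch):
    """Stand-in for RecurrentActorCriticPolicy and its subclasses."""
    recurrent = True


def check_env_count(policy_cls, trained_n_envs, new_n_envs):
    # Mirrors the assert in set_env: only recurrent policies pin the number of environments.
    assert not policy_cls.recurrent or trained_n_envs == new_n_envs, \
        "Recurrent policies cannot change the number of environments."


check_env_count(BasePolicySketch, trained_n_envs=4, new_n_envs=1)       # fine: feedforward policy
check_env_count(RecurrentPolicySketch, trained_n_envs=4, new_n_envs=4)  # fine: same env count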
