ActorCriticRLModel: Don't unnecessarily reinitialize Runner (#639)
* Runner reset easy

* [f] Import thing

* [pickable] PPO1 progress

* Revert PPO1 progres

* Revert "Revert PPO1 progres"

This reverts commit 64c7bd2b6cbdffd74cdde8b4e8cd0ee0494ec38c.

* Tests are passing now that we rm TRPO-based models

* [pickable] Simplify total_episode_reward_logger

* Make runner initialization lazy

* Update tests

* Merge ActorCriticRLModel{,WithRunner} classes

* Fix acer typing by using `self.` rather than `self.runner.*`

* fix typo

* Fix lint, docs, etc

* Update changelog

* learn: Reuse `self.episode_reward`

* Fix bad merge

* Revert "learn: Reuse `self.episode_reward`"

This reverts commit c7592d08b095f0b4ea77a186e90c4523a2f3167e.

* Update stable_baselines/common/identity_env.py

Co-Authored-By: Adam Gleave <adam@gleave.me>

* Update tests/test_multiple_learn.py

Co-Authored-By: Adam Gleave <adam@gleave.me>

* Address comments

* Fix bad merge

* ActorCriticRLModel: Always reset Runner on `set_env()`

For consistency with logging statistic resets in BaseRLModel.
Responsibility for avoiding resets due to `set_env()` should lie
with the caller.

* IdentityEnv: Allow dim=None and space=None

This way we can satisfy both constraints:
(1) Subclasses don't have to set `dim=None` when they call
`super().__init__(space=space, ...)`.
(2) `test_envs.py` can call `IdentityEnv()` with no arguments.

* test_envs: Check all identity test envs

* identity_env: Fix typo

* lint

* Update stable_baselines/common/identity_env.py

Co-Authored-By: Adam Gleave <adam@gleave.me>

* Update stable_baselines/common/identity_env.py

Co-Authored-By: Adam Gleave <adam@gleave.me>

Co-authored-by: Adam Gleave <adam@gleave.me>
shwang and AdamGleave committed Jan 22, 2020
1 parent d000b87 commit 4fada47
Showing 9 changed files with 188 additions and 107 deletions.
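
The practical effect of this change is that repeated `learn()` calls on PPO2, ACER, A2C, and ACKTR reuse one lazily-built Runner (so the environment is no longer reset at the start of every call), while `set_env()` still invalidates the cached Runner. A minimal sketch of the new behaviour (not part of this commit; the CartPole `DummyVecEnv` and hyperparameters are purely illustrative):

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO2("MlpPolicy", env, n_steps=16)

model.learn(total_timesteps=64)
runner = model.runner              # built lazily on first use and cached in `self._runner`
model.learn(total_timesteps=64)    # reuses the cached Runner; the env is not reset again
assert model.runner is runner

model.set_env(env)                 # `set_env()` invalidates the cached Runner ...
assert model.runner is not runner  # ... so the next access builds a fresh one
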
7 changes: 7 additions & 0 deletions docs/misc/changelog.rst
@@ -26,13 +26,20 @@ Bug Fixes:
- Fixed Docker GPU run script, `scripts/run_docker_gpu.sh`, to work with new NVidia Container Toolkit.
- Repeated calls to `RLModel.learn()` now preserve internal counters for some episode
logging statistics that used to be zeroed at the start of every call.
- Fixed a bug in PPO2, ACER, A2C, and ACKTR where repeated calls to `learn(total_timesteps)` reset
the environment on every call, potentially biasing samples toward early episode timesteps.
(@shwang)

- Fixed by adding lazy property `ActorCriticRLModel.runner`. Subclasses now use lazily-generated
`self.runner` instead of reinitializing a new Runner every time `learn()` is called.

Deprecations:
^^^^^^^^^^^^^

Others:
^^^^^^^
- Removed redundant return value from `a2c.utils::total_episode_reward_logger`. (@shwang)
- Cleanup and refactoring in `common/identity_env.py` (@shwang)

Documentation:
^^^^^^^^^^^^^^
18 changes: 8 additions & 10 deletions stable_baselines/a2c/a2c.py
@@ -49,10 +49,6 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None):

super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
self.vf_coef = vf_coef
@@ -65,8 +61,6 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
self.tensorboard_log = tensorboard_log
self.full_tensorboard_log = full_tensorboard_log

self.graph = None
self.sess = None
self.learning_rate_ph = None
self.n_batch = None
self.actions_ph = None
@@ -75,21 +69,26 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
self.pg_loss = None
self.vf_loss = None
self.entropy = None
self.params = None
self.apply_backprop = None
self.train_model = None
self.step_model = None
self.step = None
self.proba_step = None
self.value = None
self.initial_state = None
self.learning_rate_schedule = None
self.summary = None

super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

# if we are loading, it is possible the environment is not known, however the obs and action space are known
if _init_setup_model:
self.setup_model()

def _make_runner(self) -> AbstractEnvRunner:
return A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)

def _get_pretrain_placeholders(self):
policy = self.train_model
if isinstance(self.action_space, gym.spaces.Discrete):
@@ -233,11 +232,10 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)

runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
t_start = time.time()
for update in range(1, total_timesteps // self.n_batch + 1):
# true_reward is the reward without discount
obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
obs, states, rewards, masks, actions, values, ep_infos, true_reward = self.runner.run()
self.ep_info_buf.extend(ep_infos)
_, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
self.num_timesteps // self.n_batch, writer)
54 changes: 26 additions & 28 deletions stable_baselines/acer/acer_simple.py
@@ -112,9 +112,10 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0
_init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)
if num_procs is not None:
warnings.warn("num_procs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
n_cpu_tf_sess = num_procs

self.n_steps = n_steps
self.replay_ratio = replay_ratio
@@ -135,35 +136,33 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0
self.tensorboard_log = tensorboard_log
self.full_tensorboard_log = full_tensorboard_log

if num_procs is not None:
warnings.warn("num_procs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = num_procs

self.graph = None
self.sess = None
self.action_ph = None
self.done_ph = None
self.reward_ph = None
self.mu_ph = None
self.learning_rate_ph = None
self.params = None
self.polyak_model = None
self.learning_rate_schedule = None
self.run_ops = None
self.names_ops = None
self.train_model = None
self.step_model = None
self.step = None
self.proba_step = None
self.initial_state = None
self.n_act = None
self.n_batch = None
self.summary = None

super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)


if _init_setup_model:
self.setup_model()

def _make_runner(self) -> AbstractEnvRunner:
return _Runner(env=self.env, model=self, n_steps=self.n_steps)

def _get_pretrain_placeholders(self):
policy = self.step_model
action_ph = policy.pdtype.sample_placeholder([None])
@@ -486,7 +485,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A

episode_stats = EpisodeStats(self.n_steps, self.n_envs)

runner = _Runner(env=self.env, model=self, n_steps=self.n_steps)
if self.replay_ratio > 0:
buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size)
else:
@@ -496,7 +494,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A

# n_batch samples, 1 on_policy call and multiple off-policy calls
for steps in range(0, total_timesteps, self.n_batch):
enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
enc_obs, obs, actions, rewards, mus, dones, masks = self.runner.run()
episode_stats.feed(rewards, dones)

if buffer is not None:
@@ -509,12 +507,12 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
writer, self.num_timesteps)

# reshape stuff correctly
obs = obs.reshape(runner.batch_ob_shape)
actions = actions.reshape([runner.n_batch])
rewards = rewards.reshape([runner.n_batch])
mus = mus.reshape([runner.n_batch, runner.n_act])
dones = dones.reshape([runner.n_batch])
masks = masks.reshape([runner.batch_ob_shape[0]])
obs = obs.reshape(self.runner.batch_ob_shape)
actions = actions.reshape([self.n_batch])
rewards = rewards.reshape([self.n_batch])
mus = mus.reshape([self.n_batch, self.n_act])
dones = dones.reshape([self.n_batch])
masks = masks.reshape([self.runner.batch_ob_shape[0]])

names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks,
self.num_timesteps, writer)
@@ -525,7 +523,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
if callback(locals(), globals()) is False:
break

if self.verbose >= 1 and (int(steps / runner.n_batch) % log_interval == 0):
if self.verbose >= 1 and (int(steps / self.n_batch) % log_interval == 0):
logger.record_tabular("total_timesteps", self.num_timesteps)
logger.record_tabular("fps", int(steps / (time.time() - t_start)))
# IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
@@ -546,12 +544,12 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
obs, actions, rewards, mus, dones, masks = buffer.get()

# reshape stuff correctly
obs = obs.reshape(runner.batch_ob_shape)
actions = actions.reshape([runner.n_batch])
rewards = rewards.reshape([runner.n_batch])
mus = mus.reshape([runner.n_batch, runner.n_act])
dones = dones.reshape([runner.n_batch])
masks = masks.reshape([runner.batch_ob_shape[0]])
obs = obs.reshape(self.runner.batch_ob_shape)
actions = actions.reshape([self.n_batch])
rewards = rewards.reshape([self.n_batch])
mus = mus.reshape([self.n_batch, self.n_act])
dones = dones.reshape([self.n_batch])
masks = masks.reshape([self.runner.batch_ob_shape[0]])

self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks,
self.num_timesteps)
40 changes: 19 additions & 21 deletions stable_baselines/acktr/acktr.py
@@ -59,9 +59,10 @@ def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.
tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, kfac_update=1,
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)
if nprocs is not None:
warnings.warn("nprocs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
n_cpu_tf_sess = nprocs

self.n_steps = n_steps
self.gamma = gamma
@@ -73,19 +74,12 @@ def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule

if nprocs is not None:
warnings.warn("nprocs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = nprocs

self.tensorboard_log = tensorboard_log
self.async_eigen_decomp = async_eigen_decomp
self.full_tensorboard_log = full_tensorboard_log
self.kfac_update = kfac_update
self.gae_lambda = gae_lambda

self.graph = None
self.sess = None
self.actions_ph = None
self.advs_ph = None
self.rewards_ph = None
@@ -98,13 +92,11 @@ def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.
self.pg_fisher = None
self.vf_fisher = None
self.joint_fisher = None
self.params = None
self.grads_check = None
self.optim = None
self.train_op = None
self.q_runner = None
self.learning_rate_schedule = None
self.step = None
self.proba_step = None
self.value = None
self.initial_state = None
@@ -113,9 +105,21 @@ def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.
self.trained = False
self.continuous_actions = False

super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

if _init_setup_model:
self.setup_model()

def _make_runner(self):
if self.gae_lambda is not None:
return PPO2Runner(
env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.gae_lambda)
else:
return A2CRunner(
self.env, self, n_steps=self.n_steps, gamma=self.gamma)

def _get_pretrain_placeholders(self):
policy = self.train_model
if isinstance(self.action_space, Discrete):
@@ -316,12 +320,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A

self.trained = True

# Use GAE
if self.gae_lambda is not None:
runner = PPO2Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.gae_lambda)
else:
runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)

t_start = time.time()
coord = tf.train.Coordinator()
if self.q_runner is not None:
@@ -332,11 +330,11 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
for update in range(1, total_timesteps // self.n_batch + 1):
# pytype:disable=bad-unpacking
# true_reward is the reward without discount
if isinstance(runner, PPO2Runner):
if isinstance(self.runner, PPO2Runner):
# We are using GAE
obs, returns, masks, actions, values, _, states, ep_infos, true_reward = runner.run()
obs, returns, masks, actions, values, _, states, ep_infos, true_reward = self.runner.run()
else:
obs, states, returns, masks, actions, values, ep_infos, true_reward = runner.run()
obs, states, returns, masks, actions, values, ep_infos, true_reward = self.runner.run()
# pytype:enable=bad-unpacking

self.ep_info_buf.extend(ep_infos)
19 changes: 19 additions & 0 deletions stable_baselines/common/base_class.py
@@ -15,6 +15,7 @@
from stable_baselines.common.misc_util import set_global_seeds
from stable_baselines.common.save_util import data_to_json, json_to_data, params_to_bytes, bytes_to_params
from stable_baselines.common.policies import get_policy_from_name, ActorCriticPolicy
from stable_baselines.common.runners import AbstractEnvRunner
from stable_baselines.common.vec_env import VecEnvWrapper, VecEnv, DummyVecEnv
from stable_baselines import logger

@@ -747,6 +748,24 @@ def __init__(self, policy, env, _init_setup_model, verbose=0, policy_base=ActorC
self.step = None
self.proba_step = None
self.params = None
self._runner = None

def _make_runner(self) -> AbstractEnvRunner:
"""Builds a new Runner.
Lazily called whenever `self.runner` is accessed and `self._runner is None`.
"""
raise NotImplementedError("This model is not configured to use a Runner")

@property
def runner(self) -> AbstractEnvRunner:
if self._runner is None:
self._runner = self._make_runner()
return self._runner

def set_env(self, env):
self._runner = None # New environment invalidates `self._runner`.
super().set_env(env)

@abstractmethod
def setup_model(self):
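
The `identity_env.py` refactoring mentioned in the commit message and changelog is among the nine changed files, but its diff is not expanded above. Below is a hedged sketch of how a constructor could satisfy both constraints from the "IdentityEnv: Allow dim=None and space=None" commit; the class name and details are illustrative assumptions, not the file's actual contents:

from gym import Env
from gym.spaces import Discrete


class IdentityEnvSketch(Env):
    def __init__(self, dim=None, space=None, ep_length=100):
        # Constraint (2): calling with no arguments works, defaulting to a small discrete space.
        if space is None:
            if dim is None:
                dim = 1
            space = Discrete(dim)
        else:
            # Constraint (1): subclasses pass `space=...` and never need to set `dim=None`.
            assert dim is None, "provide at most one of `dim` and `space`"
        self.action_space = self.observation_space = space
        self.ep_length = ep_length

env_default = IdentityEnvSketch()                  # no-argument construction, as in test_envs.py
env_custom = IdentityEnvSketch(space=Discrete(5))  # subclass-style construction via `space`
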
