Simplify total_episode_reward_logger (#635)
* [pickable] Simplify total_episode_reward_logger

* Update changelog

* [ci skip] Update version

* Update version number

Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org>
shwang and araffin committed Dec 30, 2019
1 parent 98e9ee9 commit 6bdb7ce
Showing 14 changed files with 58 additions and 39 deletions.
23 changes: 23 additions & 0 deletions docs/misc/changelog.rst
@@ -6,6 +6,29 @@ Changelog
 For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.
 
 
+Pre-Release 2.10.0a0 (WIP)
+--------------------------
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+
+New Features:
+^^^^^^^^^^^^^
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+
+Others:
+^^^^^^^
+- Removed redundant return value from `a2c.utils::total_episode_reward_logger`. (@shwang)
+
+Documentation:
+^^^^^^^^^^^^^^
+
+
 Release 2.9.0 (2019-12-20)
 --------------------------
 
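The changelog entry under `Others:` is the core of the commit: `total_episode_reward_logger` mutates the `rew_acc` array it receives, so returning that array gave callers nothing new. Below is a minimal stand-alone sketch of that in-place behaviour; `accumulate_in_place` and the toy arrays are illustrative stand-ins, not stable-baselines code.

```python
import numpy as np

def accumulate_in_place(rew_acc, rewards):
    # Updates go through the array reference, so the caller's accumulator changes
    # without any value being returned.
    for env_idx in range(rewards.shape[0]):
        rew_acc[env_idx] += rewards[env_idx].sum()

episode_reward = np.zeros(2)              # caller-side accumulator, one slot per env
batch_rewards = np.array([[1.0, 0.5],     # env 0: rewards for two steps
                          [0.0, 2.0]])    # env 1: rewards for two steps
accumulate_in_place(episode_reward, batch_rewards)
print(episode_reward)                     # [1.5 2. ] -- updated with no reassignment
```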
2 changes: 1 addition & 1 deletion setup.py
@@ -146,7 +146,7 @@
       license="MIT",
       long_description=long_description,
       long_description_content_type='text/markdown',
-      version="2.9.0",
+      version="2.10.0a0",
       )
 
 # python setup.py sdist
2 changes: 1 addition & 1 deletion stable_baselines/__init__.py
@@ -20,4 +20,4 @@
 from stable_baselines.trpo_mpi import TRPO
 del mpi4py
 
-__version__ = "2.9.0"
+__version__ = "2.10.0a0"
8 changes: 4 additions & 4 deletions stable_baselines/a2c/a2c.py
@@ -251,10 +251,10 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
             fps = int((update * self.n_batch) / n_seconds)
 
             if writer is not None:
-                self.episode_reward = total_episode_reward_logger(self.episode_reward,
-                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
-                                                                  masks.reshape((self.n_envs, self.n_steps)),
-                                                                  writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward,
+                                            true_reward.reshape((self.n_envs, self.n_steps)),
+                                            masks.reshape((self.n_envs, self.n_steps)),
+                                            writer, self.num_timesteps)
 
             self.num_timesteps += self.n_batch
 
4 changes: 0 additions & 4 deletions stable_baselines/a2c/utils.py
@@ -567,8 +567,6 @@ def total_episode_reward_logger(rew_acc, rewards, masks, writer, steps):
     :param masks: (np.array bool) the end of episodes
     :param writer: (TensorFlow Session.writer) the writer to log to
     :param steps: (int) the current timestep
-    :return: (np.array float) the updated total running reward
-    :return: (np.array float) the updated total running reward
     """
     with tf.variable_scope("environment_info", reuse=True):
        for env_idx in range(rewards.shape[0]):
@@ -585,5 +583,3 @@ def total_episode_reward_logger(rew_acc, rewards, masks, writer, steps):
                     summary = tf.Summary(value=[tf.Summary.Value(tag="episode_reward", simple_value=rew_acc[env_idx])])
                     writer.add_summary(summary, steps + dones_idx[k, 0])
                 rew_acc[env_idx] = sum(rewards[env_idx, dones_idx[-1, 0]:])
-
-    return rew_acc
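For reference, here is a plain-NumPy sketch of the accumulation logic visible in the hunk above, with the TensorBoard summary replaced by a `print`; `log_episode_rewards` and the toy inputs are illustrative, not the library function.

```python
import numpy as np

def log_episode_rewards(rew_acc, rewards, masks, steps):
    # Mirror of the loop above: accumulate per-env rewards and emit a total each
    # time a done flag is encountered, mutating rew_acc in place.
    for env_idx in range(rewards.shape[0]):
        dones_idx = np.sort(np.argwhere(masks[env_idx]))
        if len(dones_idx) == 0:
            # No episode boundary in this batch: keep accumulating.
            rew_acc[env_idx] += sum(rewards[env_idx])
        else:
            # Close the running episode at the first boundary, log every episode
            # finished inside the batch, then restart the accumulator afterwards.
            rew_acc[env_idx] += sum(rewards[env_idx, :dones_idx[0, 0]])
            print("episode_reward", rew_acc[env_idx], "at step", steps + dones_idx[0, 0])
            for k in range(1, len(dones_idx)):
                rew_acc[env_idx] = sum(rewards[env_idx, dones_idx[k - 1, 0]:dones_idx[k, 0]])
                print("episode_reward", rew_acc[env_idx], "at step", steps + dones_idx[k, 0])
            rew_acc[env_idx] = sum(rewards[env_idx, dones_idx[-1, 0]:])

rew_acc = np.zeros(1)
rewards = np.array([[1.0, 1.0, 1.0, 2.0]])
masks = np.array([[False, False, True, False]])   # episode boundary at index 2
log_episode_rewards(rew_acc, rewards, masks, steps=100)
print(rew_acc)   # [3.] -- rewards from the boundary onward seed the next episode
```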
8 changes: 4 additions & 4 deletions stable_baselines/acer/acer_simple.py
@@ -505,10 +505,10 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
                 buffer.put(enc_obs, actions, rewards, mus, dones, masks)
 
             if writer is not None:
-                self.episode_reward = total_episode_reward_logger(self.episode_reward,
-                                                                  rewards.reshape((self.n_envs, self.n_steps)),
-                                                                  dones.reshape((self.n_envs, self.n_steps)),
-                                                                  writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward,
+                                            rewards.reshape((self.n_envs, self.n_steps)),
+                                            dones.reshape((self.n_envs, self.n_steps)),
+                                            writer, self.num_timesteps)
 
             # reshape stuff correctly
             obs = obs.reshape(runner.batch_ob_shape)
8 changes: 4 additions & 4 deletions stable_baselines/acktr/acktr.py
@@ -354,10 +354,10 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
             fps = int((update * self.n_batch) / n_seconds)
 
             if writer is not None:
-                self.episode_reward = total_episode_reward_logger(self.episode_reward,
-                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
-                                                                  masks.reshape((self.n_envs, self.n_steps)),
-                                                                  writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward,
+                                            true_reward.reshape((self.n_envs, self.n_steps)),
+                                            masks.reshape((self.n_envs, self.n_steps)),
+                                            writer, self.num_timesteps)
 
             if callback is not None:
                 # Only stop training if return value is False, not when it is None. This is for backwards
4 changes: 2 additions & 2 deletions stable_baselines/ddpg/ddpg.py
@@ -885,8 +885,8 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
             if writer is not None:
                 ep_rew = np.array([reward]).reshape((1, -1))
                 ep_done = np.array([done]).reshape((1, -1))
-                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
-                                                                  writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
+                                            writer, self.num_timesteps)
             step += 1
             total_steps += 1
             self.num_timesteps += 1
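DDPG, and the DQN, SAC, and TD3 hunks below, run a single environment, so they wrap the scalar reward and done flag into `(1, -1)` arrays before reusing the multi-env logger. A tiny stand-alone demo of that call pattern (the in-place update at the end stands in for the real logger call):

```python
import numpy as np

reward, done = 1.5, False                      # one step from a single environment
ep_rew = np.array([reward]).reshape((1, -1))   # shape (1, 1): one env row, one step column
ep_done = np.array([done]).reshape((1, -1))    # shape (1, 1)

episode_reward = np.zeros(1)                   # one-slot accumulator
# The real call would be total_episode_reward_logger(episode_reward, ep_rew, ep_done,
# writer, num_timesteps); with no done flag set it reduces to in-place accumulation:
episode_reward[0] += ep_rew[0].sum()
print(ep_rew.shape, ep_done.shape, episode_reward)   # (1, 1) (1, 1) [1.5]
```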
4 changes: 2 additions & 2 deletions stable_baselines/deepq/dqn.py
@@ -224,8 +224,8 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
             if writer is not None:
                 ep_rew = np.array([rew]).reshape((1, -1))
                 ep_done = np.array([done]).reshape((1, -1))
-                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
-                                                                  self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
+                                            self.num_timesteps)
 
             episode_rewards[-1] += rew
             if done:
8 changes: 4 additions & 4 deletions stable_baselines/ppo1/pposgd_simple.py
@@ -250,10 +250,10 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="P
 
             # true_rew is the reward without discount
             if writer is not None:
-                self.episode_reward = total_episode_reward_logger(self.episode_reward,
-                                                                  seg["true_rewards"].reshape((self.n_envs, -1)),
-                                                                  seg["dones"].reshape((self.n_envs, -1)),
-                                                                  writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward,
+                                            seg["true_rewards"].reshape((self.n_envs, -1)),
+                                            seg["dones"].reshape((self.n_envs, -1)),
+                                            writer, self.num_timesteps)
 
             # predicted value function before udpate
             vpredbefore = seg["vpred"]
8 changes: 4 additions & 4 deletions stable_baselines/ppo2/ppo2.py
@@ -378,10 +378,10 @@ def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO
             fps = int(self.n_batch / (t_now - t_start))
 
             if writer is not None:
-                self.episode_reward = total_episode_reward_logger(self.episode_reward,
-                                                                  true_reward.reshape((self.n_envs, self.n_steps)),
-                                                                  masks.reshape((self.n_envs, self.n_steps)),
-                                                                  writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward,
+                                            true_reward.reshape((self.n_envs, self.n_steps)),
+                                            masks.reshape((self.n_envs, self.n_steps)),
+                                            writer, self.num_timesteps)
 
             if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                 explained_var = explained_variance(values, returns)
4 changes: 2 additions & 2 deletions stable_baselines/sac/sac.py
@@ -437,8 +437,8 @@ def learn(self, total_timesteps, callback=None,
                 # Write reward per episode to tensorboard
                 ep_reward = np.array([reward]).reshape((1, -1))
                 ep_done = np.array([done]).reshape((1, -1))
-                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
-                                                                  ep_done, writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward, ep_reward,
+                                            ep_done, writer, self.num_timesteps)
 
             if step % self.train_freq == 0:
                 mb_infos_vals = []
4 changes: 2 additions & 2 deletions stable_baselines/td3/td3.py
@@ -348,8 +348,8 @@ def learn(self, total_timesteps, callback=None,
                 # Write reward per episode to tensorboard
                 ep_reward = np.array([reward]).reshape((1, -1))
                 ep_done = np.array([done]).reshape((1, -1))
-                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward,
-                                                                  ep_done, writer, self.num_timesteps)
+                total_episode_reward_logger(self.episode_reward, ep_reward,
+                                            ep_done, writer, self.num_timesteps)
 
             if step % self.train_freq == 0:
                 mb_infos_vals = []
10 changes: 5 additions & 5 deletions stable_baselines/trpo_mpi/trpo_mpi.py
@@ -340,11 +340,11 @@ def fisher_vector_product(vec):
 
                 # true_rew is the reward without discount
                 if writer is not None:
-                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
-                                                                      seg["true_rewards"].reshape(
-                                                                          (self.n_envs, -1)),
-                                                                      seg["dones"].reshape((self.n_envs, -1)),
-                                                                      writer, self.num_timesteps)
+                    total_episode_reward_logger(self.episode_reward,
+                                                seg["true_rewards"].reshape(
+                                                    (self.n_envs, -1)),
+                                                seg["dones"].reshape((self.n_envs, -1)),
+                                                writer, self.num_timesteps)
 
                 args = seg["observations"], seg["observations"], seg["actions"], atarg
                 # Subsampling: see p40-42 of John Schulman thesis
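PPO1 and TRPO reshape the flat rollout segment to one row per environment before calling the logger. A short sketch of that reshape, assuming an env-major buffer layout (the `true_rewards` array here is synthetic, not from the library):

```python
import numpy as np

n_envs, n_steps = 2, 4
true_rewards = np.arange(n_envs * n_steps, dtype=float)   # stand-in flat rollout buffer
per_env = true_rewards.reshape((n_envs, -1))              # one row per environment
print(per_env.shape)   # (2, 4)
print(per_env)         # [[0. 1. 2. 3.]
                       #  [4. 5. 6. 7.]]
```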
