RLModel.learn: Reuse logging statistics after each call (#649)
* learn: Reuse `self.episode_reward`

(cherry picked from commit c7592d08b095f0b4ea77a186e90c4523a2f3167e)

* Also preserve ep_info_buf
shwang authored and araffin committed Jan 12, 2020
1 parent c7084c8 commit 483960a
Showing 12 changed files with 33 additions and 57 deletions.
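
For reference, the behavior change is easiest to see from the caller's side. Below is a minimal usage sketch (the algorithm, environment id, and timestep counts are illustrative, not part of this commit): previously the rolling buffer behind `ep_reward_mean` / `ep_len_mean` was re-created at the top of every `learn()` call, so a second call started logging from scratch; with this change, `episode_reward` and `ep_info_buf` live on the model and carry over between calls.

from stable_baselines import PPO2

# Any algorithm inheriting from BaseRLModel behaves the same way; PPO2 is just an example.
model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)

model.learn(total_timesteps=10000)
# Second call: ep_reward_mean / ep_len_mean continue from the rolling buffer
# filled during the first call instead of restarting from an empty deque.
model.learn(total_timesteps=10000)
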
2 changes: 2 additions & 0 deletions docs/misc/changelog.rst
@@ -22,6 +22,8 @@ Bug Fixes:
^^^^^^^^^^

- Fixed Docker build script, `scripts/build_docker.sh`, to pass `USE_GPU` build argument.
- Repeated calls to `RLModel.learn()` now preserve the internal episode logging statistics
  (`episode_reward` and the `ep_info_buf` rolling buffer) that used to be zeroed at the start of every call.

Deprecations:
^^^^^^^^^^^^^
14 changes: 4 additions & 10 deletions stable_baselines/a2c/a2c.py
@@ -1,5 +1,4 @@
import time
from collections import deque

import gym
import numpy as np
@@ -86,7 +85,6 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
self.initial_state = None
self.learning_rate_schedule = None
self.summary = None
self.episode_reward = None

# if we are loading, it is possible the environment is not known, however the obs and action space are known
if _init_setup_model:
@@ -236,15 +234,11 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
schedule=self.lr_schedule)

runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)
self.episode_reward = np.zeros((self.n_envs,))
# Training stats (when using Monitor wrapper)
ep_info_buf = deque(maxlen=100)

t_start = time.time()
for update in range(1, total_timesteps // self.n_batch + 1):
# true_reward is the reward without discount
obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run()
ep_info_buf.extend(ep_infos)
self.ep_info_buf.extend(ep_infos)
_, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values,
self.num_timesteps // self.n_batch, writer)
n_seconds = time.time() - t_start
@@ -272,9 +266,9 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(explained_var))
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
logger.dump_tabular()

return self
2 changes: 0 additions & 2 deletions stable_baselines/acer/acer_simple.py
@@ -160,7 +160,6 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0
self.n_act = None
self.n_batch = None
self.summary = None
self.episode_reward = None

if _init_setup_model:
self.setup_model()
@@ -488,7 +487,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
episode_stats = EpisodeStats(self.n_steps, self.n_envs)

runner = _Runner(env=self.env, model=self, n_steps=self.n_steps)
self.episode_reward = np.zeros((self.n_envs,))
if self.replay_ratio > 0:
buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size)
else:
15 changes: 4 additions & 11 deletions stable_baselines/acktr/acktr.py
@@ -1,6 +1,5 @@
import time
import warnings
from collections import deque

import numpy as np
import tensorflow as tf
@@ -111,7 +110,6 @@ def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.
self.initial_state = None
self.n_batch = None
self.summary = None
self.episode_reward = None
self.trained = False
self.continuous_actions = False

@@ -324,18 +322,13 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
else:
runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma)

self.episode_reward = np.zeros((self.n_envs,))

t_start = time.time()
coord = tf.train.Coordinator()
if self.q_runner is not None:
enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True)
else:
enqueue_threads = []

# Training stats (when using Monitor wrapper)
ep_info_buf = deque(maxlen=100)

for update in range(1, total_timesteps // self.n_batch + 1):
# pytype:disable=bad-unpacking
# true_reward is the reward without discount
@@ -346,7 +339,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
obs, states, returns, masks, actions, values, ep_infos, true_reward = runner.run()
# pytype:enable=bad-unpacking

ep_info_buf.extend(ep_infos)
self.ep_info_buf.extend(ep_infos)
policy_loss, value_loss, policy_entropy = self._train_step(obs, states, returns, masks, actions, values,
self.num_timesteps // (self.n_batch + 1),
writer)
@@ -374,9 +367,9 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A
logger.record_tabular("policy_loss", float(policy_loss))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(explained_var))
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
logger.dump_tabular()

self.num_timesteps += self.n_batch + 1
11 changes: 11 additions & 0 deletions stable_baselines/common/base_class.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from collections import deque
import os
import glob
import warnings
@@ -57,6 +58,8 @@ def __init__(self, policy, env, verbose=0, *, requires_vec_env, policy_base,
self.seed = seed
self._param_load_ops = None
self.n_cpu_tf_sess = n_cpu_tf_sess
self.episode_reward = None
self.ep_info_buf = None

if env is not None:
if isinstance(env, str):
@@ -138,6 +141,10 @@ def set_env(self, env):

self.env = env

# Invalidated by environment change.
self.episode_reward = None
self.ep_info_buf = None

def _init_num_timesteps(self, reset_num_timesteps=True):
"""
Initialize and resets num_timesteps (total timesteps since beginning of training)
@@ -189,6 +196,10 @@ def _setup_learn(self):
if self.env is None:
raise ValueError("Error: cannot train the model without a valid environment, please set an environment with"
"set_env(self, env) method.")
if self.episode_reward is None:
self.episode_reward = np.zeros((self.n_envs,))
if self.ep_info_buf is None:
self.ep_info_buf = deque(maxlen=100)

@abstractmethod
def get_parameter_list(self):
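
The `set_env()` hook above exists because `episode_reward` is shaped by the number of parallel environments, which can change with the new env; dropping the stale buffers lets `_setup_learn()` re-create them on the next `learn()` call. A rough sketch of the resulting behavior, assuming a non-recurrent policy so the number of environments is allowed to change (`make_env` is a hypothetical factory used only for illustration):

import gym
from stable_baselines import A2C
from stable_baselines.common.vec_env import DummyVecEnv

def make_env():
    # Hypothetical helper: any Gym env works, only the vec-env sizes matter here.
    return gym.make('CartPole-v1')

model = A2C('MlpPolicy', DummyVecEnv([make_env] * 4))
model.learn(total_timesteps=5000)            # _setup_learn() creates episode_reward with shape (4,)

model.set_env(DummyVecEnv([make_env] * 8))   # invalidates episode_reward and ep_info_buf
model.learn(total_timesteps=5000)            # _setup_learn() re-creates episode_reward with shape (8,)
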
2 changes: 0 additions & 2 deletions stable_baselines/ddpg/ddpg.py
@@ -300,7 +300,6 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n
self.adaptive_param_noise_actor = None
self.params = None
self.summary = None
self.episode_reward = None
self.tb_seen_steps = None

self.target_params = None
@@ -826,7 +825,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D

eval_episode_rewards_history = deque(maxlen=100)
episode_rewards_history = deque(maxlen=100)
self.episode_reward = np.zeros((1,))
episode_successes = []
with self.sess.as_default(), self.graph.as_default():
# Prepare everything.
2 changes: 0 additions & 2 deletions stable_baselines/deepq/dqn.py
@@ -99,7 +99,6 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=5000
self.exploration = None
self.params = None
self.summary = None
self.episode_reward = None

if _init_setup_model:
self.setup_model()
@@ -187,7 +186,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D
episode_successes = []
obs = self.env.reset()
reset = True
self.episode_reward = np.zeros((1,))

for _ in range(total_timesteps):
if callback is not None:
3 changes: 0 additions & 3 deletions stable_baselines/ppo1/pposgd_simple.py
@@ -82,7 +82,6 @@ def __init__(self, policy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_p
self.proba_step = None
self.initial_state = None
self.summary = None
self.episode_reward = None

if _init_setup_model:
self.setup_model()
@@ -221,8 +220,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="P
# rolling buffer for episode rewards
rewbuffer = deque(maxlen=100)

self.episode_reward = np.zeros((self.n_envs,))

while True:
if callback is not None:
# Only stop training if return value is False, not when it is None. This is for backwards
13 changes: 4 additions & 9 deletions stable_baselines/ppo2/ppo2.py
@@ -1,6 +1,4 @@
import time
import sys
from collections import deque

import gym
import numpy as np
@@ -98,7 +96,6 @@ def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01, learning
self.initial_state = None
self.n_batch = None
self.summary = None
self.episode_reward = None

if _init_setup_model:
self.setup_model()
@@ -317,9 +314,7 @@ def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO
self._setup_learn()

runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam)
self.episode_reward = np.zeros((self.n_envs,))

ep_info_buf = deque(maxlen=100)
t_first_start = time.time()

n_updates = total_timesteps // self.n_batch
@@ -338,7 +333,7 @@ def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO
# true_reward is the reward without discount
obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()
self.num_timesteps += self.n_batch
ep_info_buf.extend(ep_infos)
self.ep_info_buf.extend(ep_infos)
mb_loss_vals = []
if states is None: # nonrecurrent version
update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1
@@ -390,9 +385,9 @@ def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO
logger.logkv("total_timesteps", self.num_timesteps)
logger.logkv("fps", fps)
logger.logkv("explained_variance", float(explained_var))
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
logger.logkv('time_elapsed', t_start - t_first_start)
for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
logger.logkv(loss_name, loss_val)
12 changes: 4 additions & 8 deletions stable_baselines/sac/sac.py
@@ -1,6 +1,5 @@
import sys
import time
from collections import deque
import warnings

import numpy as np
@@ -105,7 +104,6 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000
self.value_fn = None
self.graph = None
self.replay_buffer = None
self.episode_reward = None
self.sess = None
self.tensorboard_log = tensorboard_log
self.verbose = verbose
@@ -390,8 +388,6 @@ def learn(self, total_timesteps, callback=None,
if self.action_noise is not None:
self.action_noise.reset()
obs = self.env.reset()
self.episode_reward = np.zeros((1,))
ep_info_buf = deque(maxlen=100)
n_updates = 0
infos_values = []

@@ -431,7 +427,7 @@ def learn(self, total_timesteps, callback=None,
# Retrieve reward and episode length if using Monitor wrapper
maybe_ep_info = info.get('episode')
if maybe_ep_info is not None:
ep_info_buf.extend([maybe_ep_info])
self.ep_info_buf.extend([maybe_ep_info])

if writer is not None:
# Write reward per episode to tensorboard
@@ -487,9 +483,9 @@ def learn(self, total_timesteps, callback=None,
fps = int(step / (time.time() - start_time))
logger.logkv("episodes", num_episodes)
logger.logkv("mean 100 episode reward", mean_reward)
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
logger.logkv("n_updates", n_updates)
logger.logkv("current_lr", current_lr)
logger.logkv("fps", fps)
12 changes: 4 additions & 8 deletions stable_baselines/td3/td3.py
@@ -1,6 +1,5 @@
import sys
import time
from collections import deque
import warnings

import numpy as np
@@ -87,7 +86,6 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000

self.graph = None
self.replay_buffer = None
self.episode_reward = None
self.sess = None
self.tensorboard_log = tensorboard_log
self.verbose = verbose
@@ -301,8 +299,6 @@ def learn(self, total_timesteps, callback=None,
if self.action_noise is not None:
self.action_noise.reset()
obs = self.env.reset()
self.episode_reward = np.zeros((1,))
ep_info_buf = deque(maxlen=100)
n_updates = 0
infos_values = []

@@ -342,7 +338,7 @@ def learn(self, total_timesteps, callback=None,
# Retrieve reward and episode length if using Monitor wrapper
maybe_ep_info = info.get('episode')
if maybe_ep_info is not None:
ep_info_buf.extend([maybe_ep_info])
self.ep_info_buf.extend([maybe_ep_info])

if writer is not None:
# Write reward per episode to tensorboard
@@ -398,9 +394,9 @@ def learn(self, total_timesteps, callback=None,
fps = int(step / (time.time() - start_time))
logger.logkv("episodes", num_episodes)
logger.logkv("mean 100 episode reward", mean_reward)
if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
logger.logkv("n_updates", n_updates)
logger.logkv("current_lr", current_lr)
logger.logkv("fps", fps)
2 changes: 0 additions & 2 deletions stable_baselines/trpo_mpi/trpo_mpi.py
@@ -97,7 +97,6 @@ def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.0
self.initial_state = None
self.params = None
self.summary = None
self.episode_reward = None

if _init_setup_model:
self.setup_model()
@@ -287,7 +286,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="T
t_start = time.time()
len_buffer = deque(maxlen=40) # rolling buffer for episode lengths
reward_buffer = deque(maxlen=40) # rolling buffer for episode rewards
self.episode_reward = np.zeros((self.n_envs,))

true_reward_buffer = None
if self.using_gail:
