Fix Determinism (#492)
* Add seed to distributions

* Test if we can have reproducible results

* Set random seed at graph creation

* Remove doc

* Try harder (remove parallelism)

* Update test

* Remove seed param from learn method

* Bug fixes

* Make results deterministic

* Reduce number of training steps

* Update version

* Try separating tests

* Remove unused import

* Typos

* Improve VecEnv seeding

* Save seed and default to None

* Update docs for seed parameters

* Documentation about reproducibility
araffin committed Oct 11, 2019
1 parent feeaff4 commit 8a8baf1
Showing 31 changed files with 353 additions and 216 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
@@ -34,6 +34,9 @@ jobs:
- name: "Unit Tests sb-z"
env: TEST_GLOB="{s[b-z]*,[t-z]*}"

- name: "Unit Tests determinism"
env: TEST_GLOB="0deterministic.py"

- name: "Sphinx Documentation"
script:
- 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pip install .[docs] && pushd docs/ && make clean && make html"'
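The new CI job above runs the determinism tests in their own Travis entry (selected via `TEST_GLOB`). The test file itself is not part of this excerpt, so the following is only a sketch of the kind of check such a test plausibly performs; the environment, step count, and test name are assumptions:

```python
# Hypothetical sketch of a determinism test; the real test file matched by the
# glob above is not shown in this diff, so names and details are assumptions.
import gym
import numpy as np
import pytest

from stable_baselines import A2C
from stable_baselines.common.vec_env import DummyVecEnv


@pytest.mark.parametrize("seed", [0, 42])
def test_deterministic_training(seed):
    def train():
        env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
        # The recipe documented by this PR: seed in the constructor, single TF thread
        model = A2C("MlpPolicy", env, seed=seed, n_cpu_tf_sess=1)
        model.learn(total_timesteps=500)
        return model.get_parameters()

    params_a, params_b = train(), train()
    for name in params_a:
        np.testing.assert_allclose(params_a[name], params_b[name])
```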
23 changes: 23 additions & 0 deletions docs/guide/algos.rst
@@ -53,3 +53,26 @@ Actions ``gym.spaces``:

Some logging values (like `ep_rewmean`, `eplenmean`) are only available when using a Monitor wrapper.
See `Issue #339 <https://github.com/hill-a/stable-baselines/issues/339>`_ for more info.


Reproducibility
---------------

Completely reproducible results are not guaranteed across Tensorflow releases or different platforms.
Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds.

To make computations deterministic on CPU, for your specific problem on one specific platform,
you need to pass a `seed` argument at the creation of a model and set `n_cpu_tf_sess=1` (the number of threads used by the TensorFlow session).
If you pass an environment to the model using `set_env()`, then you also need to seed the environment first.

.. note::

Because of the current limitations of TensorFlow 1.x, we cannot ensure reproducible results on the GPU yet. We hope to solve that issue with TensorFlow 2.x support (cf `Issue #366 <https://github.com/hill-a/stable-baselines/issues/366>`_).


.. note::

TD3 sometimes fails to produce reproducible results for unclear reasons, even when following the steps above (cf `PR #492 <https://github.com/hill-a/stable-baselines/pull/492>`_). If you find the reason, please open an issue ;)


Credit: part of the *Reproducibility* section comes from `PyTorch Documentation <https://pytorch.org/docs/stable/notes/randomness.html>`_
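As a usage illustration of the recipe described in the new documentation above (seed at model creation, `n_cpu_tf_sess=1`, and seeding any environment passed later through `set_env()`), here is a minimal sketch; the environment and the number of timesteps are arbitrary choices:

```python
import gym

from stable_baselines import A2C
from stable_baselines.common.vec_env import DummyVecEnv

# Seed the pseudo-random generators at model creation and force a
# single-threaded TensorFlow session for deterministic CPU results
model = A2C("MlpPolicy", "CartPole-v1", seed=0, n_cpu_tf_sess=1)
model.learn(total_timesteps=10000)

# When attaching another environment with set_env(), seed it beforehand
def make_env():
    env = gym.make("CartPole-v1")
    env.seed(0)
    return env

model.set_env(DummyVecEnv([make_env]))
model.learn(total_timesteps=10000)
```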
11 changes: 9 additions & 2 deletions docs/misc/changelog.rst
@@ -6,20 +6,27 @@ Changelog
For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.


Pre-Release 2.8.1a0 (WIP)
Pre-Release 2.9.0a0 (WIP)
--------------------------

Breaking Changes:
^^^^^^^^^^^^^^^^^
- The `seed` argument has been moved from the `learn()` method to the model constructor
in order to have reproducible results

New Features:
^^^^^^^^^^^^^
- Add `n_cpu_tf_sess` to the model constructor to choose the number of threads used by TensorFlow

Bug Fixes:
^^^^^^^^^^
- Fix seeding, so it is now possible to have deterministic results on CPU
- Fix a bug in DDPG where the `predict` method with `deterministic=False` would fail

Deprecations:
^^^^^^^^^^^^^
- `nprocs` (ACKTR) and `num_procs` (ACER) are deprecated in favor of `n_cpu_tf_sess`, which is now common
to all algorithms

Others:
^^^^^^^
@@ -508,4 +515,4 @@ In random order...
Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol
@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs
@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150
@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp
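To illustrate the breaking change listed above, a minimal before/after sketch of where the seed now goes (environment and timestep count are arbitrary):

```python
from stable_baselines import A2C

# Up to 2.8.0, the seed was passed to learn():
#   model = A2C("MlpPolicy", "CartPole-v1")
#   model.learn(total_timesteps=10000, seed=0)

# From 2.9.0 on, the seed is passed to the constructor instead,
# together with the new n_cpu_tf_sess argument
model = A2C("MlpPolicy", "CartPole-v1", seed=0, n_cpu_tf_sess=1)
model.learn(total_timesteps=10000)
```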
2 changes: 1 addition & 1 deletion setup.py
@@ -140,7 +140,7 @@
license="MIT",
long_description=long_description,
long_description_content_type='text/markdown',
version="2.8.0",
version="2.9.0a0",
)

# python setup.py sdist
2 changes: 1 addition & 1 deletion stable_baselines/__init__.py
@@ -20,4 +20,4 @@
from stable_baselines.trpo_mpi import TRPO
del mpi4py

__version__ = "2.8.0"
__version__ = "2.9.0a0"
23 changes: 16 additions & 7 deletions stable_baselines/a2c/a2c.py
@@ -38,14 +38,21 @@ class A2C(ActorCriticRLModel):
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
If None (default), use random seed. Note that if you want completely deterministic
results, you must set `n_cpu_tf_sess` to 1.
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
If None, the number of CPUs of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5,
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0, tensorboard_log=None,
_init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False):
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0,
tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None):

super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
@@ -99,7 +106,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(graph=self.graph)
self.set_random_seed(self.seed)
self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

self.n_batch = self.n_envs * self.n_steps

@@ -216,15 +224,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ

return policy_loss, value_loss, policy_entropy

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)

self._setup_learn()
self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)

@@ -288,6 +295,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
"n_cpu_tf_sess": self.n_cpu_tf_sess,
"seed": self.seed,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
}
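The `setup_model()` hunk above calls `self.set_random_seed(self.seed)` before creating the session; the helper itself lives in the shared base class and is not part of this excerpt. Below is a plausible sketch of what such a helper does, written as a standalone function; it is an assumption, and the real implementation may differ in detail:

```python
import random

import numpy as np
import tensorflow as tf


def set_random_seed(seed, env=None, action_space=None):
    """Plausible sketch of the base-class seeding helper (assumption, not the actual code)."""
    if seed is None:
        return
    random.seed(seed)         # python's built-in RNG
    np.random.seed(seed)      # numpy RNG
    tf.set_random_seed(seed)  # graph-level TensorFlow seed
    if env is not None:
        env.seed(seed)        # seed the (vectorized) training environment
    if action_space is not None:
        action_space.seed(seed)  # seed random action sampling
```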
4 changes: 2 additions & 2 deletions stable_baselines/a2c/run_atari.py
@@ -30,8 +30,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):

env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

model = A2C(policy_fn, env, lr_schedule=lr_schedule)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()


33 changes: 25 additions & 8 deletions stable_baselines/acer/acer_simple.py
@@ -69,6 +69,10 @@ class ACER(ActorCriticRLModel):
:param n_steps: (int) The number of steps to run for each environment per update
(i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
:param num_procs: (int) The number of threads for TensorFlow operations
.. deprecated:: 2.9.0
Use `n_cpu_tf_sess` instead.
:param q_coef: (float) The weight for the loss on the Q value
:param ent_coef: (float) The weight for the entropic loss
:param max_grad_norm: (float) The clipping value for the maximum gradient
@@ -93,16 +97,23 @@
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
If None (default), use random seed. Note that if you want completely deterministic
results, you must set `n_cpu_tf_sess` to 1.
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
If None, the number of CPUs of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000,
replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True,
alpha=0.99, delta=1, verbose=0, tensorboard_log=None,
_init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False):
_init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.replay_ratio = replay_ratio
@@ -120,10 +131,14 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5,
self.rprop_epsilon = rprop_epsilon
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule
self.num_procs = num_procs
self.tensorboard_log = tensorboard_log
self.full_tensorboard_log = full_tensorboard_log

if num_procs is not None:
warnings.warn("num_procs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = num_procs

self.graph = None
self.sess = None
self.action_ph = None
@@ -184,8 +199,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph)

self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)
self.set_random_seed(self.seed)
n_batch_step = None
if issubclass(self.policy, RecurrentActorCriticPolicy):
n_batch_step = self.n_envs
@@ -457,14 +472,14 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w

return self.names_ops, step_return[1:] # strip off _train

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)
self._setup_learn()

self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)
@@ -562,6 +577,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
'n_cpu_tf_sess': self.n_cpu_tf_sess,
'seed': self.seed,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
}
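Since `seed` and `n_cpu_tf_sess` are now part of the dictionary serialized by `save()`, a reloaded model keeps track of how it was seeded and how many TensorFlow threads to use. A short sketch (path, environment, and step count are arbitrary):

```python
from stable_baselines import ACER

model = ACER("MlpPolicy", "CartPole-v1", seed=0, n_cpu_tf_sess=1)
model.learn(total_timesteps=1000)
model.save("acer_cartpole")  # seed and n_cpu_tf_sess are stored with the model

loaded = ACER.load("acer_cartpole")
print(loaded.seed, loaded.n_cpu_tf_sess)  # expected to print: 0 1
```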
4 changes: 2 additions & 2 deletions stable_baselines/acer/run_atari.py
@@ -28,8 +28,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
warnings.warn("Policy {} not implemented".format(policy))
return

model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()
# Free memory
del model
34 changes: 26 additions & 8 deletions stable_baselines/acktr/acktr.py
@@ -1,4 +1,5 @@
import time
import warnings
from collections import deque

import numpy as np
@@ -24,6 +25,10 @@ class ACKTR(ActorCriticRLModel):
:param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
:param gamma: (float) Discount factor
:param nprocs: (int) The number of threads for TensorFlow operations
.. deprecated:: 2.9.0
Use `n_cpu_tf_sess` instead.
:param n_steps: (int) The number of steps to run for each environment
:param ent_coef: (float) The weight for the entropic loss
:param vf_coef: (float) The weight for the loss on the value function
@@ -43,15 +48,21 @@
If None (default), then the classic advantage will be used instead of GAE
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
If None (default), use random seed. Note that if you want completely deterministic
results, you must set `n_cpu_tf_sess` to 1.
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
If None, the number of CPUs of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, kfac_update=1,
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False):
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
@@ -62,7 +73,12 @@ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
self.max_grad_norm = max_grad_norm
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule
self.nprocs = nprocs

if nprocs is not None:
warnings.warn("nprocs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = nprocs

self.tensorboard_log = tensorboard_log
self.async_eigen_decomp = async_eigen_decomp
self.full_tensorboard_log = full_tensorboard_log
@@ -119,7 +135,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)
self.set_random_seed(self.seed)
self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

n_batch_step = None
n_batch_train = None
@@ -264,14 +281,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ

return policy_loss, value_loss, policy_entropy

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)
self._setup_learn()
self.n_batch = self.n_envs * self.n_steps

self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
@@ -371,7 +388,6 @@ def save(self, save_path, cloudpickle=False):
data = {
"gamma": self.gamma,
"gae_lambda": self.gae_lambda,
"nprocs": self.nprocs,
"n_steps": self.n_steps,
"vf_coef": self.vf_coef,
"ent_coef": self.ent_coef,
Expand All @@ -385,6 +401,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
"n_cpu_tf_sess": self.n_cpu_tf_sess,
"seed": self.seed,
"kfac_update": self.kfac_update,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
4 changes: 2 additions & 2 deletions stable_baselines/acktr/run_atari.py
@@ -14,8 +14,8 @@ def train(env_id, num_timesteps, seed, num_cpu):
:param num_cpu: (int) The number of cpu to train on
"""
env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = ACKTR(CnnPolicy, env, nprocs=num_cpu, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()
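Note that this example still passes `nprocs`, which after this PR is only kept for backward compatibility: the constructor forwards it to `n_cpu_tf_sess` and emits a `DeprecationWarning` (see the ACKTR `__init__` hunk above). A short sketch of the old and new spellings (environment is arbitrary):

```python
from stable_baselines import ACKTR

# Deprecated spelling: still accepted, but warns and is forwarded internally
model_old = ACKTR("MlpPolicy", "CartPole-v1", nprocs=2)

# Preferred spelling from 2.9.0 on, common to all algorithms
model_new = ACKTR("MlpPolicy", "CartPole-v1", n_cpu_tf_sess=2, seed=0)
```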


