Commit
Merge pull request #13 from hill-a/more-docs
More docs
hill-a committed Aug 30, 2018
2 parents a20d30e + 2720508 commit f85571c
Showing 26 changed files with 139 additions and 55 deletions.
2 changes: 1 addition & 1 deletion docs/changelog.rst
@@ -9,7 +9,7 @@ For download links, please look at `Github release page <https://github.com/hill
Master version 1.0.8.a0 (WIP)
=============================

Nothing new for now...
- added more documentation (some modules from common).

Tensorboard support in progress (see ``tensorboard`` branch)

7 changes: 7 additions & 0 deletions docs/common/cmd_utils.rst
@@ -0,0 +1,7 @@
.. _cmd_utils:

Command Utils
=========================

.. automodule:: stable_baselines.common.cmd_util
:members:
24 changes: 24 additions & 0 deletions docs/common/distributions.rst
@@ -0,0 +1,24 @@
.. _distributions:

Probability Distributions
=========================

Probability distributions used for the different action spaces:

- ``CategoricalProbabilityDistribution`` -> Discrete
- ``DiagGaussianProbabilityDistribution`` -> Box (continuous actions)
- ``MultiCategoricalProbabilityDistribution`` -> MultiDiscrete
- ``BernoulliProbabilityDistribution`` -> MultiBinary

The policy networks output parameters for the distributions (named ``flat`` in the methods).
Actions are then sampled from those distributions.

For instance, in the case of discrete actions, the policy network outputs the probability
of taking each action. The ``CategoricalProbabilityDistribution`` allows sampling from it,
computing the entropy and the negative log probability (``neglogp``), and backpropagating the gradient.

In the case of continuous actions, a Gaussian distribution is used. The policy network outputs
the mean and the (log) standard deviation of the distribution (assumed to be a ``DiagGaussianProbabilityDistribution``).

.. automodule:: stable_baselines.common.distributions
:members:
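As a rough illustration of how these distribution classes fit together, here is a minimal sketch (not part of this commit; it assumes the ``CategoricalProbabilityDistribution`` constructor takes the unnormalized logits produced by the policy network)::

    import tensorflow as tf
    from stable_baselines.common.distributions import CategoricalProbabilityDistribution

    # Hypothetical "flat" policy output: unnormalized logits for a Discrete(4)
    # action space, with a batch size of 1
    logits = tf.constant([[1.0, 2.0, 0.5, -1.0]])
    dist = CategoricalProbabilityDistribution(logits)

    action = dist.sample()          # sample an action from the distribution
    entropy = dist.entropy()        # entropy of the distribution
    neglogp = dist.neglogp(action)  # negative log probability of the sampled action

    with tf.Session() as sess:
        print(sess.run([action, entropy, neglogp]))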
11 changes: 11 additions & 0 deletions docs/common/schedules.rst
@@ -0,0 +1,11 @@
.. _schedules:

Schedules
=========

Schedules are used as hyperparameters for most of the algorithms,
in order to change the value of a parameter over time (usually the learning rate).


.. automodule:: stable_baselines.common.schedules
:members:
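As a small usage sketch (not part of this commit), a ``LinearSchedule`` can be queried with ``value(step)`` to get the annealed value at a given timestep::

    from stable_baselines.common.schedules import LinearSchedule

    # Anneal a parameter (e.g. an exploration rate) from 1.0 to 0.02 over 10000 timesteps
    exploration = LinearSchedule(schedule_timesteps=10000, final_p=0.02, initial_p=1.0)

    print(exploration.value(0))      # 1.0 at the start of training
    print(exploration.value(5000))   # 0.51, halfway through the annealing
    print(exploration.value(20000))  # 0.02, clamped to final_p after schedule_timesteps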
7 changes: 7 additions & 0 deletions docs/common/tf_utils.rst
@@ -0,0 +1,7 @@
.. _tf_utils:

TensorFlow Utils
=========================

.. automodule:: stable_baselines.common.tf_util
:members:
5 changes: 3 additions & 2 deletions docs/conf.py
@@ -28,11 +28,12 @@ def __getattr__(cls, name):
# Mock modules that require C modules
# Note: because of that we cannot test examples using CI
MOCK_MODULES = ['joblib', 'scipy', 'scipy.signal',
'numpy', 'pandas', 'mpi4py', 'mujoco-py', 'cv2', 'tensorflow',
'pandas', 'mpi4py', 'mujoco-py', 'cv2', 'tensorflow',
'tensorflow.contrib', 'tensorflow.contrib.layers',
'tensorflow.python', 'tensorflow.python.client', 'tensorflow.python.ops',
'tqdm', 'cloudpickle', 'matplotlib',
'seaborn', 'gym', 'gym.spaces', 'zmq']
'seaborn', 'gym', 'gym.spaces', 'gym.core',
'gym.wrappers', 'zmq']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

import stable_baselines
10 changes: 10 additions & 0 deletions docs/index.rst
@@ -56,12 +56,22 @@ This toolset is a fork of OpenAI Baselines, with a major structural refactoring,
modules/ppo2
modules/trpo

.. toctree::
:maxdepth: 1
:caption: Common

common/distributions
common/tf_utils
common/cmd_utils
common/schedules

.. toctree::
:maxdepth: 1
:caption: Misc

changelog


Citing Stable Baselines
-----------------------
To cite this project in publications:
1 change: 1 addition & 0 deletions docs/modules/a2c.rst
@@ -75,3 +75,4 @@ Parameters

.. autoclass:: A2C
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/acer.rst
@@ -72,3 +72,4 @@ Parameters

.. autoclass:: ACER
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/acktr.rst
@@ -72,3 +72,4 @@ Parameters

.. autoclass:: ACKTR
:members:
:inherited-members:
2 changes: 1 addition & 1 deletion docs/modules/base.rst
@@ -6,7 +6,7 @@
Base RL Class
=============

Common interface for all the RL algorithms.
Common interface for all the RL algorithms

.. autoclass:: BaseRLModel
:members:
1 change: 1 addition & 0 deletions docs/modules/ddpg.rst
@@ -72,3 +72,4 @@ Parameters

.. autoclass:: DDPG
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/dqn.rst
@@ -97,3 +97,4 @@ Parameters

.. autoclass:: DeepQ
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/gail.rst
@@ -93,3 +93,4 @@ Parameters

.. autoclass:: GAIL
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/her.rst
@@ -65,3 +65,4 @@ Parameters

.. autoclass:: HER
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/ppo1.rst
@@ -80,3 +80,4 @@ Parameters

.. autoclass:: PPO1
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/ppo2.rst
@@ -87,3 +87,4 @@ Parameters

.. autoclass:: PPO2
:members:
:inherited-members:
1 change: 1 addition & 0 deletions docs/modules/trpo.rst
@@ -71,3 +71,4 @@ Parameters

.. autoclass:: TRPO
:members:
:inherited-members:
8 changes: 5 additions & 3 deletions stable_baselines/a2c/a2c.py
@@ -17,13 +17,15 @@ class A2C(BaseRLModel):
:param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...)
:param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
:param gamma: (float) Discount factor
:param n_steps: (int) The number of steps to run for each environment
:param n_steps: (int) The number of steps to run for each environment per update
(i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
:param vf_coef: (float) Value function coefficient for the loss calculation
:param ent_coef: (float) Entropy coefficient for the loss calculation
:param max_grad_norm: (float) The maximum value for the gradient clipping
:param learning_rate: (float) The learning rate
:param alpha: (float) RMS prop optimizer decay
:param epsilon: (float) RMS prop optimizer epsilon
:param alpha: (float) RMSProp decay parameter (default: 0.99)
:param epsilon: (float) RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update)
(default: 1e-5)
:param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
'double_linear_con', 'middle_drop' or 'double_middle_drop')
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
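To make the ``n_steps``/batch-size relationship above concrete, a minimal sketch (not part of this commit; the ``MlpPolicy`` and ``DummyVecEnv`` import paths and the use of ``CartPole-v1`` are assumptions for illustration)::

    import gym
    from stable_baselines import A2C
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    # 4 environment copies running in parallel, 5 steps each per update,
    # so every update is computed on a batch of n_steps * n_env = 20 transitions
    env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)])
    model = A2C(policy=MlpPolicy, env=env, n_steps=5, ent_coef=0.01)
    model.learn(total_timesteps=25000)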
15 changes: 9 additions & 6 deletions stable_baselines/acer/acer_simple.py
@@ -66,24 +66,27 @@ class ACER(BaseRLModel):
:param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...)
:param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
:param gamma: (float) The discount value
:param n_steps: (int) The number of steps to run for each environment
:param n_steps: (int) The number of steps to run for each environment per update
(i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
:param num_procs: (int) The number of threads for TensorFlow operations
:param q_coef: (float) The weight for the loss on the Q value
:param ent_coef: (float) The weight for the entropic loss
:param max_grad_norm: (float) The clipping value for the maximum gradient
:param learning_rate: (float) The initial learning rate for the RMS prop optimizer
:param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
'double_linear_con', 'middle_drop' or 'double_middle_drop')
:param rprop_epsilon: (float) RMS prop optimizer epsilon
:param rprop_alpha: (float) RMS prop optimizer decay
:param rprop_epsilon: (float) RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update)
(default: 1e-5)
:param rprop_alpha: (float) RMSProp decay parameter (default: 0.99)
:param buffer_size: (int) The buffer size in number of steps
:param replay_ratio: (float) The number of replay learning steps per on-policy learning step, on average
(sampled from a Poisson distribution)
:param replay_start: (int) The minimum number of steps in the buffer, before learning replay
:param correction_term: (float) The correction term for the weights
:param trust_region: (bool) Enable Trust region policy optimization loss
:param correction_term: (float) Importance weight clipping factor (default: 10)
:param trust_region: (bool) Whether or not the algorithm estimates the gradient of the KL divergence
between the old and updated policy and uses it to determine the step size (default: True)
:param alpha: (float) The decay rate for the Exponential moving average of the parameters
:param delta: (float) trust region delta value
:param delta: (float) Max KL divergence between the old policy and the updated policy (default: 1)
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
"""
3 changes: 2 additions & 1 deletion stable_baselines/common/distributions.py
@@ -87,7 +87,8 @@ def probability_distribution_class(self):

def proba_distribution_from_flat(self, flat):
"""
returns the probability distribution from flat probabilities
Returns the probability distribution from flat probabilities
flat: flattened vector of parameters of probability distribution
:param flat: ([float]) the flat probabilities
:return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated
74 changes: 40 additions & 34 deletions stable_baselines/common/schedules.py
@@ -1,5 +1,6 @@
"""This file is used for specifying various schedules that evolve over
time throughout the execution of the algorithm, such as:
- learning rate for the optimizer
- exploration epsilon for the epsilon greedy exploration strategy
- beta parameter for prioritized replay
@@ -21,12 +22,13 @@ def value(self, step):


class ConstantSchedule(Schedule):
def __init__(self, value):
"""
Value remains constant over time.
"""
Value remains constant over time.
:param value: (float) Constant value of the schedule
"""
:param value: (float) Constant value of the schedule
"""

def __init__(self, value):
self._value = value

def value(self, step):
@@ -35,36 +37,39 @@ def value(self, step):

def linear_interpolation(left, right, alpha):
"""
Linear interpolation between `left` and `right`
Linear interpolation between `left` and `right`.
:param left: (float) left boundary
:param right: (float) right boundary
:param alpha: (float) coeff in [0, 1]
:return: (float)
"""

return left + alpha * (right - left)


class PiecewiseSchedule(Schedule):
"""
Piecewise schedule.
:param endpoints: ([(int, int)])
list of pairs `(time, value)` meaning that the schedule should output
`value` when `t==time`. All the values for time must be sorted in
an increasing order. When t is between two times, e.g. `(time_a, value_a)`
and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
`interpolation(value_a, value_b, alpha)` where alpha is a fraction of
time passed between `time_a` and `time_b` for time `t`.
:param interpolation: (lambda (float, float, float): float)
a function that takes value to the left and to the right of t according
to the `endpoints`. Alpha is the fraction of distance from left endpoint to
right endpoint that t has covered. See linear_interpolation for example.
:param outside_value: (float)
if the value is requested outside of all the intervals specified in
`endpoints`, this value is returned. If None, an AssertionError is
raised when an outside value is requested.
"""

def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
"""
Piecewise schedule.
:param endpoints: ([(int, int)])
list of pairs `(time, value)` meanining that schedule should output
`value` when `t==time`. All the values for time must be sorted in
an increasing order. When t is between two times, e.g. `(time_a, value_a)`
and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
`interpolation(value_a, value_b, alpha)` where alpha is a fraction of
time passed between `time_a` and `time_b` for time `t`.
:param interpolation: (lambda (float, float, float): float)
a function that takes value to the left and to the right of t according
to the `endpoints`. Alpha is the fraction of distance from left endpoint to
right endpoint that t has covered. See linear_interpolation for example.
:param outside_value: (float)
if the value is requested outside of all the intervals sepecified in
`endpoints` this value is returned. If None then AssertionError is
raised when outside value is requested.
"""
idxes = [e[0] for e in endpoints]
assert idxes == sorted(idxes)
self._interpolation = interpolation
@@ -83,16 +88,17 @@ def value(self, step):


class LinearSchedule(Schedule):
def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
"""
Linear interpolation between initial_p and final_p over
schedule_timesteps. After this many timesteps pass final_p is
returned.
"""
Linear interpolation between initial_p and final_p over
schedule_timesteps. After this many timesteps pass final_p is
returned.
:param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p
:param initial_p: (float) initial output value
:param final_p: (float) final output value
"""
:param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p
:param initial_p: (float) initial output value
:param final_p: (float) final output value
"""

def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
self.schedule_timesteps = schedule_timesteps
self.final_p = final_p
self.initial_p = initial_p
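To make the ``endpoints`` semantics of ``PiecewiseSchedule`` concrete, a small sketch (not part of this commit), using the constructor signature shown above::

    from stable_baselines.common.schedules import PiecewiseSchedule

    # Output 1.0 up to step 100, interpolate linearly down to 0.1 at step 400,
    # and return 0.05 for any step outside the specified intervals
    schedule = PiecewiseSchedule(endpoints=[(100, 1.0), (400, 0.1)],
                                 outside_value=0.05)

    print(schedule.value(50))   # 0.05 (before the first endpoint -> outside_value)
    print(schedule.value(100))  # 1.0
    print(schedule.value(250))  # 0.55 (halfway between the two endpoints)
    print(schedule.value(500))  # 0.05 (after the last endpoint -> outside_value)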
6 changes: 3 additions & 3 deletions stable_baselines/common/tf_util.py
@@ -445,7 +445,7 @@ def load_state(fname, sess=None, var_list=None):
:param fname: (str) the graph name
:param sess: (TensorFlow Session) the session, if None: get_default_session()
:param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject,
:param var_list: ([TensorFlow Tensor] or dict(str: TensorFlow Tensor)) A list of Variable/SaveableObject,
or a dictionary mapping names to SaveableObjects. If None, defaults to the list of all saveable objects.
"""
if sess is None:
@@ -464,8 +464,8 @@ def save_state(fname, sess=None, var_list=None):
Save a TensorFlow model
:param fname: (str) the graph name
:param sess: (TensorFlow Session) the session, if None: get_default_session()
:param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject,
:param sess: (TensorFlow Session) The tf session, if None, get_default_session()
:param var_list: ([TensorFlow Tensor] or dict(str: TensorFlow Tensor)) A list of Variable/SaveableObject,
or a dictionary mapping names to SaveableObjects. If None, defaults to the list of all saveable objects.
"""
if sess is None:
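As a rough usage sketch for ``save_state`` and ``load_state`` (not part of this commit; the graph and the checkpoint path are hypothetical)::

    import tensorflow as tf
    from stable_baselines.common import tf_util

    # A trivial graph with a single variable, for illustration only
    counter = tf.Variable(0.0, name='counter')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Save all saveable objects of the current graph to a checkpoint
        tf_util.save_state('/tmp/tf_util_demo/model.ckpt', sess=sess)
        # ... and restore them later into the same graph
        tf_util.load_state('/tmp/tf_util_demo/model.ckpt', sess=sess)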
2 changes: 1 addition & 1 deletion stable_baselines/ppo1/pposgd_simple.py
@@ -31,7 +31,7 @@ class PPO1(BaseRLModel):
:param lam: (float) advantage estimation discounting factor (lambda for the Generalized Advantage Estimator)
:param adam_epsilon: (float) the epsilon value for the adam optimizer
:param schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
'double_linear_con', 'middle_drop' or 'double_middle_drop')
'double_linear_con', 'middle_drop' or 'double_middle_drop')
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
"""
Expand Down
6 changes: 4 additions & 2 deletions stable_baselines/ppo2/ppo2.py
@@ -20,13 +20,15 @@ class PPO2(BaseRLModel):
:param policy: (ActorCriticPolicy) The policy model to use (MLP, CNN, LSTM, ...)
:param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
:param gamma: (float) Discount factor
:param n_steps: (int) The number of steps to run for each environment
:param n_steps: (int) The number of steps to run for each environment per update
(i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
:param ent_coef: (float) Entropy coefficient for the loss calculation
:param learning_rate: (float or callable) The learning rate, it can be a function
:param vf_coef: (float) Value function coefficient for the loss calculation
:param max_grad_norm: (float) The maximum value for the gradient clipping
:param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator
:param nminibatches: (int) Number of minibatches for the policies
:param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
this should be smaller than or equal to the number of environments run in parallel.
:param noptepochs: (int) Number of epochs when optimizing the surrogate
:param cliprange: (float or callable) Clipping parameter, it can be a function
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
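A minimal configuration sketch (not part of this commit) showing how ``n_steps``, the number of parallel environments and ``nminibatches`` relate; the ``MlpPolicy``/``DummyVecEnv`` import paths and the environment are assumptions for illustration::

    import gym
    from stable_baselines import PPO2
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    # 8 parallel environments with 128 steps each -> 1024 transitions per update,
    # split into 4 minibatches of 256 transitions for noptepochs epochs of SGD
    env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(8)])
    model = PPO2(policy=MlpPolicy, env=env, n_steps=128,
                 nminibatches=4, noptepochs=4, learning_rate=2.5e-4)
    model.learn(total_timesteps=100000)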
2 changes: 1 addition & 1 deletion tests/test_identity.py
@@ -19,7 +19,7 @@
n_steps=1, replay_ratio=1).learn(total_timesteps=10000, seed=0),
lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000, seed=0),
lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1,
exploration_fraction=0.001, env=e).learn(total_timesteps=30000, seed=0),
exploration_fraction=0.001, env=e).learn(total_timesteps=40000, seed=0),
lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7,
optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3,
