Merge pull request #15 from hill-a/doc-env
Documentation Custom Environments
hill-a committed Sep 2, 2018
2 parents 765bbf5 + 50ea0ea commit 0b128fd
Showing 33 changed files with 231 additions and 134 deletions.
25 changes: 25 additions & 0 deletions docs/README.md
@@ -0,0 +1,25 @@
## Stable Baselines Documentation

This folder contains documentation for the RL baselines.


### Build the Documentation

#### Install Sphinx and Theme

```
pip install sphinx sphinx-autobuild sphinx-rtd-theme
```

#### Building the Docs

In the `docs/` folder:
```
make html
```

If you want to rebuild the documentation automatically each time a file is changed:

```
sphinx-autobuild . _build/html
```
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -31,7 +31,7 @@ def __getattr__(cls, name):
'pandas', 'mpi4py', 'mujoco-py', 'cv2', 'tensorflow',
'tensorflow.contrib', 'tensorflow.contrib.layers',
'tensorflow.python', 'tensorflow.python.client', 'tensorflow.python.ops',
'tqdm', 'cloudpickle', 'matplotlib',
'tqdm', 'cloudpickle', 'matplotlib', 'matplotlib.pyplot',
'seaborn', 'gym', 'gym.spaces', 'gym.core',
'gym.wrappers', 'zmq']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
57 changes: 57 additions & 0 deletions docs/guide/custom_env.rst
@@ -0,0 +1,57 @@
.. _custom_env:

Using Custom Environments
==========================

To use the RL baselines with custom environments, they just need to follow the *gym* interface.
That is to say, your environment must implement the following methods (and inherit from the OpenAI Gym class):


.. code-block:: python

  import gym
  import numpy as np
  from gym import spaces


  class CustomEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, arg1, arg2, ...):
      super(CustomEnv, self).__init__()
      # Define action and observation space
      # They must be gym.spaces objects
      # Example when using discrete actions:
      self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
      # Example for using image as input:
      self.observation_space = spaces.Box(low=0, high=255,
                                          shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)

    def step(self, action):
      ...

    def reset(self):
      ...

    def render(self, mode='human', close=False):
      ...

Then you can define and train an RL agent with:

.. code-block:: python

  from stable_baselines import A2C
  from stable_baselines.common.policies import CnnPolicy
  from stable_baselines.common.vec_env import DummyVecEnv

  # Instantiate and wrap the env
  env = DummyVecEnv([lambda: CustomEnv(arg1, ...)])
  # Define and train the agent
  model = A2C(CnnPolicy, env).learn(total_timesteps=1000)

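Once trained, you can run the agent in the wrapped environment for a few steps (a minimal sketch, not part of the original guide):

.. code-block:: python

  obs = env.reset()
  for _ in range(1000):
      # predict() returns the action and the next recurrent state (unused here)
      action, _states = model.predict(obs)
      obs, rewards, dones, info = env.step(action)
      env.render()
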
You can find a `complete guide online <https://github.com/openai/gym/tree/master/gym/envs#how-to-create-new-environments-for-gym>`_
on creating a custom Gym environment.


Optionally, you can also register the environment with gym,
which will allow you to create the RL agent in one line (and use ``gym.make()`` to instantiate the env).
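
For instance, a minimal registration sketch (the environment id and module path below are illustrative placeholders, not names used by the project):

.. code-block:: python

  import gym
  from gym.envs.registration import register

  # 'CustomEnv-v0' and 'my_module:CustomEnv' are hypothetical names
  register(
      id='CustomEnv-v0',
      entry_point='my_module:CustomEnv',
  )

  env = gym.make('CustomEnv-v0')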


In the project, for testing purposes, we use a custom environment named ``IdentityEnv``
defined `in this file <https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/identity_env.py>`_.
An example of how to use it can be found `here <https://github.com/hill-a/stable-baselines/blob/master/tests/test_identity.py>`_.
1 change: 0 additions & 1 deletion docs/guide/install.rst
@@ -1,6 +1,5 @@
.. _install:

============
Installation
============

11 changes: 8 additions & 3 deletions docs/index.rst
@@ -7,7 +7,7 @@ Welcome to Stable Baselines docs!
============================================

`Stable Baselines <https://github.com/hill-a/stable-baselines>`_ is a set of improved implementations
of reinforcement learning algorithms based on OpenAI `Baselines <https://github.com/openai/baselines>`_.
of Reinforcement Learning (RL) algorithms based on OpenAI `Baselines <https://github.com/openai/baselines>`_.

Github repository: https://github.com/hill-a/stable-baselines

@@ -37,6 +37,7 @@ This toolset is a fork of OpenAI Baselines, with a major structural refactoring,
guide/algos
guide/examples
guide/vec_envs
guide/custom_env


.. toctree::
@@ -69,7 +70,8 @@ This toolset is a fork of OpenAI Baselines, with a major structural refactoring,
:maxdepth: 1
:caption: Misc

changelog
misc/changelog
misc/results_plotter


Citing Stable Baselines
@@ -90,7 +92,10 @@ To cite this project in publications:
Contributing
------------

To any interested in making the baselines better, there is still some documentation/improvements that needs to be done.
To anyone interested in making the RL baselines better, there are still some improvements
that need to be done: good-to-have features like Tensorboard integration or support for continuous actions (ACER),
and more documentation on the RL algorithms.

If you want to contribute, please open an issue first and then propose your pull request on Github at
https://github.com/hill-a/stable-baselines.

10 changes: 5 additions & 5 deletions docs/changelog.rst → docs/misc/changelog.rst
@@ -1,21 +1,21 @@
.. _changelog:

==========
Changelog
==========

For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.

Master version 1.0.8.a0 (WIP)
=============================
-----------------------------

- added more documentation (some modules from common).
- added doc about using custom env

Tensorboard support in progress (see ``tensorboard`` branch)


Release 1.0.7 (2018-08-29)
===========================
--------------------------

**Bug fixes and documentation**

@@ -26,7 +26,7 @@ Release 1.0.7 (2018-08-29)


Release 1.0.1 (2018-08-20)
==========================
--------------------------

**Refactored Stable Baselines**

@@ -60,7 +60,7 @@ Future Work :


Release 0.1.6 (2018-07-27)
==========================
--------------------------

**Deobfuscation of the code base + pep8 and fixes**

8 changes: 8 additions & 0 deletions docs/misc/results_plotter.rst
@@ -0,0 +1,8 @@
.. _results_plotter:


Plotting Results
================

.. automodule:: stable_baselines.results_plotter
:members:
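
A hedged usage sketch, assuming the module keeps the interface of the OpenAI Baselines ``results_plotter`` (a ``plot_results`` function and the ``X_TIMESTEPS`` constant) and that the log folder contains ``Monitor`` files:

.. code-block:: python

  import matplotlib.pyplot as plt
  from stable_baselines import results_plotter

  # '/tmp/gym/' is a placeholder folder containing Monitor logs
  results_plotter.plot_results(['/tmp/gym/'], int(1e6),
                               results_plotter.X_TIMESTEPS, 'My Task')
  plt.show()
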
1 change: 0 additions & 1 deletion docs/modules/her.rst
@@ -65,4 +65,3 @@ Parameters

.. autoclass:: HER
:members:
:inherited-members:
16 changes: 8 additions & 8 deletions stable_baselines/acer/acer_simple.py
@@ -462,10 +462,10 @@ def predict(self, observation, state=None, mask=None):
"""
Get the model's action from an observation
:param observation: (numpy Number) the input observation
:param state: (numpy Number) The last states (can be None, used in reccurent policies)
:param mask: (numpy Number) The last masks (can be None, used in reccurent policies)
:return: (numpy Number, numpy Number) the model's action and the next state (used in reccurent policies)
:param observation: (np.ndarray) the input observation
:param state: (np.ndarray) The last states (can be None, used in recurrent policies)
:param mask: (np.ndarray) The last masks (can be None, used in recurrent policies)
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
"""
if state is None:
state = self.initial_state
@@ -481,10 +481,10 @@ def action_probability(self, observation, state=None, mask=None):
"""
Get the model's action probability distribution from an observation
:param observation: (numpy Number) the input observation
:param state: (numpy Number) The last states (can be None, used in reccurent policies)
:param mask: (numpy Number) The last masks (can be None, used in reccurent policies)
:return: (numpy Number) the model's action probability distribution
:param observation: (np.ndarray) the input observation
:param state: (np.ndarray) The last states (can be None, used in recurrent policies)
:param mask: (np.ndarray) The last masks (can be None, used in recurrent policies)
:return: (np.ndarray) the model's action probability distribution
"""
if state is None:
state = self.initial_state
2 changes: 1 addition & 1 deletion stable_baselines/acer/buffer.py
@@ -123,7 +123,7 @@ def take(self, arr, idx, envx):
"""
Reads a frame from a list and index for the asked environment ids
:param arr: (numpy array) the array that is read
:param arr: (np.ndarray) the array that is read
:param idx: ([int]) the idx that are read
:param envx: ([int]) the idx for the environments
:return: ([float]) the asked frames from the list
2 changes: 1 addition & 1 deletion stable_baselines/common/atari_wrappers.py
@@ -248,7 +248,7 @@ def __init__(self, frames):
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
This object should only be converted to np.ndarray before being passed to the model.
:param frames: ([int] or [float]) environment frames
"""
16 changes: 8 additions & 8 deletions stable_baselines/common/base_class.py
@@ -146,10 +146,10 @@ def predict(self, observation, state=None, mask=None):
"""
Get the model's action from an observation
:param observation: (numpy Number) the input observation
:param state: (numpy Number) The last states (can be None, used in reccurent policies)
:param mask: (numpy Number) The last masks (can be None, used in reccurent policies)
:return: (numpy Number, numpy Number) the model's action and the next state (used in reccurent policies)
:param observation: (np.ndarray) the input observation
:param state: (np.ndarray) The last states (can be None, used in recurrent policies)
:param mask: (np.ndarray) The last masks (can be None, used in recurrent policies)
:return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies)
"""
pass

@@ -158,10 +158,10 @@ def action_probability(self, observation, state=None, mask=None):
"""
Get the model's action probability distribution from an observation
:param observation: (numpy Number) the input observation
:param state: (numpy Number) The last states (can be None, used in reccurent policies)
:param mask: (numpy Number) The last masks (can be None, used in reccurent policies)
:return: (numpy Number) the model's action probability distribution
:param observation: (np.ndarray) the input observation
:param state: (np.ndarray) The last states (can be None, used in recurrent policies)
:param mask: (np.ndarray) The last masks (can be None, used in recurrent policies)
:return: (np.ndarray) the model's action probability distribution
"""
pass

18 changes: 9 additions & 9 deletions stable_baselines/common/filters.py
@@ -7,7 +7,7 @@

class Filter(object):
"""
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
"""
@@ -34,7 +34,7 @@ class IdentityFilter(Filter):
"""
A filter that implements an identity function
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
"""
@@ -50,7 +50,7 @@ def __init__(self, functions):
"""
A filter that implements a composition with other functions
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
@@ -77,7 +77,7 @@ def __init__(self, shape, demean=True, destd=True, clip=10.0):
y = (x-mean)/std
using running estimates of mean,std
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
@@ -112,7 +112,7 @@ def __init__(self):
"""
A filter that appends a counter to the input
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
"""
@@ -132,7 +132,7 @@ class FlattenFilter(Filter):
"""
A filter that flattens the input
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
"""
@@ -148,7 +148,7 @@ def __init__(self, n_cat):
"""
A filter that turns indices to onehot encoding
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
@@ -170,7 +170,7 @@ def __init__(self, divisor):
"""
A filter that divides the input from a value
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value
@@ -190,7 +190,7 @@ def __init__(self, length):
"""
A filter that runs a stacking of a 'length' inputs
takes a value 'x' (numpy Number), applies the filter, and returns the new value.
takes a value 'x' (np.ndarray), applies the filter, and returns the new value.
Can pass kwarg: 'update' (bool) if the filter can update from the value