Refactor reward signals into separate class #2144

Merged: 144 commits, Jul 3, 2019
Changes shown are from 127 commits.

Commits
eb4abf2
New version of GAIL
awjuliani Oct 9, 2018
d0852ac
Move Curiosity to separate class
awjuliani Oct 12, 2018
4b15b80
Curiosity fully working under new system
awjuliani Oct 12, 2018
ad9381b
Begin implementing GAIL
awjuliani Oct 12, 2018
8bf8302
fix discrete curiosity
vincentpierre Oct 12, 2018
d3e244e
Add expert demonstration
awjuliani Oct 13, 2018
a5b95f7
Remove notebook
awjuliani Oct 13, 2018
dc2fcaa
Record intrinsic rewards properly
awjuliani Oct 13, 2018
49cff40
Add gail model updating
awjuliani Oct 13, 2018
48d3769
Code cleanup
awjuliani Oct 15, 2018
6eeb565
Nested structure for intrinsic rewards
awjuliani Oct 15, 2018
8ca7728
Rename files
awjuliani Oct 15, 2018
226b5c7
Update models so files
awjuliani Oct 15, 2018
3386aa7
fix typo
awjuliani Oct 15, 2018
6799756
Add reward strength parameter
awjuliani Oct 15, 2018
468c407
Use dictionary of reward signals
awjuliani Oct 17, 2018
519e2d3
Remove reward manager
awjuliani Oct 17, 2018
7df1a69
Extrinsic reward just another type
awjuliani Oct 17, 2018
99237cd
Clean up imports
awjuliani Oct 17, 2018
9fa51c1
All reward signals use strength to scale output
awjuliani Oct 17, 2018
7f24677
produce scaled and unscaled reward
awjuliani Oct 18, 2018
4a714d0
Remove unused dictionary
awjuliani Oct 18, 2018
3e2671d
Current trainer config
awjuliani Oct 18, 2018
77211d8
Add discrete control and pyramid experimentation
awjuliani Oct 19, 2018
2334de8
Minor changes to GAIL
awjuliani Oct 20, 2018
439387e
Add relevant strength parameters
awjuliani Oct 21, 2018
ba793a3
Replace string
awjuliani Oct 21, 2018
a52ba0b
Add support for visual observations w/ GAIL
awjuliani Oct 31, 2018
5b2ef22
Finish implementing visual obs for GAIL
awjuliani Nov 1, 2018
13542b4
Include demo files
awjuliani Nov 1, 2018
ae7a8b0
Fix for RNN w/ GAIL
awjuliani Nov 1, 2018
bf89082
Keep track of reward streams separately
awjuliani Nov 2, 2018
360482b
Bootstrap value estimates separately
awjuliani Nov 2, 2018
c78639d
Add value head
awjuliani Nov 14, 2018
3b2485d
Use separate value streams for each reward
awjuliani Nov 15, 2018
40bc9ba
Add VAIL
awjuliani Nov 15, 2018
c6e1504
Use adaptive B
awjuliani Nov 16, 2018
60d9ff7
Comments improvements
vincentpierre Jan 10, 2019
49ec682
Added comments and refactored a piece of the code
vincentpierre Jan 10, 2019
d9847e0
Added Comments
vincentpierre Jan 10, 2019
dc7620b
Fix on Curiosity
vincentpierre Jan 11, 2019
28e0bd5
Fixed typo
vincentpierre Jan 11, 2019
0257d2b
Added a forgotten comment
vincentpierre Jan 11, 2019
fd55c00
Stabilized Vail learning. Still no learning for Walker
vincentpierre Jan 14, 2019
2343b3f
Fixing typo on curiosity when using visual input
vincentpierre Jan 17, 2019
c74ad19
Added some comments
vincentpierre Jan 17, 2019
2dd7c61
modified the hyperparameters
vincentpierre Jan 17, 2019
42429a5
Fixed some of the tests, will need to refactor the reward signals in …
vincentpierre Jan 19, 2019
ec0e106
Putting the has_updated flags inside each reward signal
vincentpierre Jan 22, 2019
6ae1c2f
Added comments for the GAIL update method
vincentpierre Jan 22, 2019
ef65bc2
initial commit
vincentpierre Jan 24, 2019
8cbdbf4
No more normalization after pre-training
vincentpierre Jan 24, 2019
3f35d45
Fixed large bug in Vail
vincentpierre Jan 30, 2019
3be9be7
BUG FIX VAIL : The noise dimension was wrong and the discriminator sc…
vincentpierre Feb 1, 2019
9e9b4ff
implemented discrete control pretraining
vincentpierre Feb 2, 2019
d537a6b
bug fixing
vincentpierre Feb 3, 2019
713263c
Bug fix, still not tested for recurrent
vincentpierre Feb 6, 2019
ca5b948
Fixing beta in GAIL so it will change properly
vincentpierre Mar 6, 2019
671629e
Allow for not specifying an extrinsic reward
Apr 19, 2019
a31c8a5
Rough implementation of annealed BC
Apr 24, 2019
93cb4ff
Fixes for rebase onto v0.8
Apr 24, 2019
6534291
Moved BC trainer out of reward_signals and code cleanup
Apr 25, 2019
700b478
Rename folder to "components"
Apr 25, 2019
71eedf5
Fix renaming in Curiosity
Apr 25, 2019
83b4603
Remove demo_aided as a required param
May 2, 2019
9e4b4e2
Make old BC compatible
May 2, 2019
f814432
Fix visual obs for curiosity
May 3, 2019
e10194f
Tweaks all around
May 9, 2019
fdcfb30
Add reward normalization and bug fix
May 9, 2019
cb5e927
Load multiple .demo files. Fix bug with csv nans
May 30, 2019
2c5c853
Remove reward normalization
May 30, 2019
e66a343
Rename demo_aided to pretraining
May 30, 2019
0a98289
Fix bc configs
May 30, 2019
cd6e498
Increase small val to prevent NaNs
May 30, 2019
d23f6f3
Fix init in components
May 31, 2019
d93e36e
Merge remote-tracking branch 'origin/develop' into develop-irl-ervin
May 31, 2019
1bf68c7
Fix PPO tests
May 31, 2019
9da6e6c
Refactor components into common location
May 31, 2019
4a57a32
Minor code cleanup
Jun 3, 2019
11cc6f9
Preliminary RNN support
Jun 5, 2019
e66a6f7
Revert regression with NaNs for LSTMs
Jun 6, 2019
bea2bc7
Better LSTM support for BC
Jun 6, 2019
6302a55
Code cleanup and black reformat
Jun 6, 2019
d1cded9
Remove demo_helper and reformat signal
Jun 6, 2019
2b98f3b
Tests for GAIL and curiosity
Jun 6, 2019
440146b
Fix Black again...
Jun 6, 2019
98f9160
Tests for BCModule and visual tests for RewardSignals
Jun 6, 2019
5c923cb
Refactor to new structure and use class generator
Jun 7, 2019
e7ce888
Generalize reward_signal interface and stats
Jun 8, 2019
858194f
Fix incorrect environment reward reporting
Jun 10, 2019
28bceba
Rename reward signals for consistency. clean up comments
Jun 10, 2019
248cae4
Default trainer config (for cloud testing)
Jun 10, 2019
744df94
Remove "curiosity_enc_size" from the regular params
Jun 10, 2019
31dabfc
Fix PushBlock config
Jun 10, 2019
a557f84
Revert Pyramids environment
Jun 10, 2019
d4dbddb
Fix indexing issue with add_experiences
Jun 11, 2019
ddb673b
Fix tests
Jun 11, 2019
975e05b
Change to BCModule
Jun 11, 2019
a83fd5d
Merge branch 'develop' into develop-irl-ervin
Jun 12, 2019
fae7646
Remove the bools for reward signals
Jun 12, 2019
5cf98ac
Make update take in a mini buffer rather than the
Jun 13, 2019
d1afc9b
Always reference reward signals name and not index
Jun 13, 2019
80f2c75
More code cleanup
Jun 13, 2019
394b25a
Clean up reward_signal abstract class
Jun 13, 2019
a9724a3
Fix issue with recording values
Jun 13, 2019
66fef61
Add use_actions to GAIL
Jun 17, 2019
0e3be1d
Add documentation for Reward Signals
Jun 17, 2019
015f50d
Add documentation for GAIL
Jun 17, 2019
7c3059b
Remove unused variables in BCModel
Jun 17, 2019
16c3c06
Remove Entropy Reward Signal
Jun 17, 2019
1fbfa5d
Change tests to use safe_load
Jun 17, 2019
f9a3808
Don't use mutable default
Jun 17, 2019
ce551bf
Set defaults in parent __init__ (Reward Signals)
Jun 17, 2019
3e7ea5b
Remove unnecessary lines
Jun 17, 2019
a40d8be
Remove new features
Jun 17, 2019
abc66cc
Add learning rate option to Curiosity
Jun 17, 2019
1aa0fc5
Correct docs for Reward Signals
Jun 17, 2019
3bccf7f
Revert trainer configs to develop ver
Jun 17, 2019
aab7165
Clean up BC files
Jun 17, 2019
bbbb2e9
Revert BC model
Jun 17, 2019
b5ca952
Revert some changes to trainer
Jun 17, 2019
31cf875
Some more trainer_config cleanup
Jun 17, 2019
53a472d
Make new trainer compatible with old BC
Jun 17, 2019
29e93f2
Merge branch 'develop' into develop-rewardsignalsrefactor
Jun 17, 2019
133a258
Fix black formats
Jun 17, 2019
bae045d
Fixes to typos and unnecessary enumerate()
Jun 18, 2019
b03de8f
Use NamedTuple and more code cleanup
Jun 18, 2019
3499d60
Recursive printing of hyperparams
Jun 18, 2019
411db72
Black format
Jun 18, 2019
84478bf
Doc fixes
Jun 19, 2019
a5f148c
Fixed comment for evaluate
Jun 19, 2019
3733a68
More doc tweaks
Jun 19, 2019
32815f5
Make PPO prints more generic
Jun 19, 2019
70f7407
fix crawler dynamic hyperparams
Jun 20, 2019
0d02b24
Clean up doc formatting
Jun 20, 2019
bba9d7d
Change setup.py so all packages are installed
Jun 20, 2019
de2b5d5
Tweak pyramids hyperparams
Jun 21, 2019
adc9915
More tweaks to Pyramids
Jun 21, 2019
6a1d8d1
curiosity doc section
Jul 1, 2019
554b1c2
Merge remote-tracking branch 'origin/develop' into develop-rewardsign…
Jul 1, 2019
52d3974
get mypy passing
Jul 1, 2019
471b489
Tweak Pyramids hyperparameters
Jul 1, 2019
ed5e84e
Merge branch 'develop-rewardsignalsrefactor' of github.com:Unity-Tech…
Jul 1, 2019
87d77b4
Call static function rather than class function
Jul 3, 2019
54 changes: 36 additions & 18 deletions config/trainer_config.yaml
@@ -4,7 +4,6 @@ default:
beta: 5.0e-3
buffer_size: 10240
epsilon: 0.2
gamma: 0.99
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
@@ -17,14 +16,15 @@ default:
sequence_length: 64
summary_freq: 1000
use_recurrent: false
use_curiosity: false
curiosity_strength: 0.01
curiosity_enc_size: 128
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

BananaLearning:
normalize: false
batch_size: 1024
beta: 5.0e-3
batch_size: 1024
buffer_size: 10240
max_steps: 1.0e5

@@ -93,9 +93,7 @@ GoalieLearning:
normalize: false

PyramidsLearning:
use_curiosity: true
summary_freq: 2000
curiosity_strength: 0.01
curiosity_enc_size: 256
time_horizon: 128
batch_size: 128
@@ -105,11 +103,18 @@ PyramidsLearning:
beta: 1.0e-2
max_steps: 5.0e5
num_epoch: 3

sequence_length: 16
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.01
gamma: 0.99
encoding_size: 128

VisualPyramidsLearning:
use_curiosity: true
curiosity_strength: 0.01
curiosity_enc_size: 256
time_horizon: 128
batch_size: 64
buffer_size: 2024
@@ -118,6 +123,14 @@ VisualPyramidsLearning:
beta: 1.0e-2
max_steps: 5.0e5
num_epoch: 3
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.01
gamma: 0.99
encoding_size: 256

3DBallLearning:
normalize: true
@@ -126,9 +139,7 @@ VisualPyramidsLearning:
summary_freq: 1000
time_horizon: 1000
lambd: 0.99
gamma: 0.995
beta: 0.001
use_curiosity: true

3DBallHardLearning:
normalize: true
@@ -137,8 +148,11 @@ VisualPyramidsLearning:
summary_freq: 1000
time_horizon: 1000
max_steps: 5.0e5
gamma: 0.995
beta: 0.001
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

TennisLearning:
normalize: true
@@ -150,19 +164,21 @@ CrawlerStaticLearning:
time_horizon: 1000
batch_size: 2024
buffer_size: 20240
gamma: 0.995
max_steps: 1e6
summary_freq: 3000
num_layers: 3
hidden_units: 512
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

CrawlerDynamicLearning:
normalize: true
num_epoch: 3
time_horizon: 1000
batch_size: 2024
buffer_size: 20240
gamma: 0.995
max_steps: 1e6
summary_freq: 3000
num_layers: 3
@@ -174,11 +190,14 @@ WalkerLearning:
time_horizon: 1000
batch_size: 2048
buffer_size: 20480
gamma: 0.995

Review comment (Contributor):
If gamma is removed from here, does this mean that old versions of the config will no longer be compatible? Is this going to break people's stuff (I am okay with it) or is there a fallback?

Reply (Contributor, Author):
Yeah, the old versions of the config aren't compatible. Having gamma won't break anything, but it will end up using the default gamma from default_config. We could auto-assign the gamma here to the extrinsic gamma, but that would break the abstraction. I guess we'll just have to be careful in the migration guide.

max_steps: 2e6
summary_freq: 3000
num_layers: 3
hidden_units: 512
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

ReacherLearning:
normalize: true
@@ -197,7 +216,6 @@ HallwayLearning:
hidden_units: 128
memory_size: 256
beta: 1.0e-2
gamma: 0.99
num_epoch: 3
buffer_size: 1024
batch_size: 128
45 changes: 14 additions & 31 deletions docs/Training-PPO.md
@@ -7,6 +7,10 @@ observations to the best action an agent can take in a given state. The
ML-Agents PPO algorithm is implemented in TensorFlow and runs in a separate
Python process (communicating with the running Unity application over a socket).

To train an agent, you will need to provide the agent one or more reward signals which
the agent should attempt to maximize. See [Reward Signals](Training-RewardSignals.md)
Review comment (Contributor):
"the agent should attempt to maximize" - I know RL does not work, but try to act like it does.

Reply (Contributor, Author):
Will replace with "will learn to maximize".

for the available reward signals and the corresponding hyperparameters.

See [Training ML-Agents](Training-ML-Agents.md) for instructions on running the
training program, `learn.py`.

@@ -31,15 +35,18 @@ of performance you would like.

## Hyperparameters

### Gamma
### Reward Signals

`gamma` corresponds to the discount factor for future rewards. This can be
thought of as how far into the future the agent should care about possible
rewards. In situations when the agent should be acting in the present in order
to prepare for rewards in the distant future, this value should be large. In
cases when rewards are more immediate, it can be smaller.
In reinforcement learning, the goal is to learn a Policy that maximizes reward.
At a base level, the reward is given by the environment. However, we could imagine
rewarding the agent for various different behaviors. For instance, we could reward
the agent for exploring new states, rather than just when an explicit reward is given.
Furthermore, we could mix reward signals to help the learning process.

Typical Range: `0.8` - `0.995`
`reward_signals` provides a section to define [reward signals](Training-RewardSignals.md).
ML-Agents provides two reward signals by default, the Extrinsic (environment) reward, and the
Curiosity reward, which can be used to encourage exploration in sparse extrinsic reward
environments.

### Lambda

@@ -184,30 +191,6 @@ the agent will need to remember in order to successfully complete the task.

Typical Range: `64` - `512`

## (Optional) Intrinsic Curiosity Module Hyperparameters

The below hyperparameters are only used when `use_curiosity` is set to true.

### Curiosity Encoding Size

`curiosity_enc_size` corresponds to the size of the hidden layer used to encode
the observations within the intrinsic curiosity module. This value should be
small enough to encourage the curiosity module to compress the original
observation, but also not too small to prevent it from learning the dynamics of
the environment.

Typical Range: `64` - `256`

### Curiosity Strength

`curiosity_strength` corresponds to the magnitude of the intrinsic reward
generated by the intrinsic curiosity module. This should be scaled in order to
ensure it is large enough to not be overwhelmed by extrinsic reward signals in
the environment. Likewise it should not be too large to overwhelm the extrinsic
reward signal.

Typical Range: `0.1` - `0.001`

## Training Statistics

To view training statistics, use TensorBoard. For information on launching and
100 changes: 100 additions & 0 deletions docs/Training-RewardSignals.md
@@ -0,0 +1,100 @@
# Reward Signals

In reinforcement learning, the end goal for the Agent is to discover a behavior (a Policy)
that maximizes a reward. Typically, a reward is defined by your environment, and corresponds
to reaching some goal. These are what we refer to as "extrinsic" rewards, as they are defined
externally to the learning algorithm.

Rewards, however, can be defined outside of the environment as well, to encourage the agent to
behave in certain ways, or to aid the learning of the true extrinsic reward. We refer to these
rewards as "intrinsic" reward signals. The total reward that the agent attempts to maximize can
be a mix of extrinsic and intrinsic reward signals.
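
As a rough sketch of what "a mix" means here (each signal in this PR also carries its own `gamma` and value stream, so this is conceptual rather than exact), the per-step reward being maximized is a strength-weighted sum over the enabled signals:

```latex
r_t = \sum_{i \in \text{signals}} s_i \, r_t^{(i)}
```

where \(r_t^{(i)}\) is the raw reward produced by signal \(i\) at step \(t\) and \(s_i\) is its `strength` setting.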

ML-Agents allows reward signals to be defined in a modular way, and we provide three reward
signals that can be mixed and matched to help shape your agent's behavior. The `extrinsic` Reward Signal represents the rewards defined in your environment, and is enabled by default.
The `curiosity` reward signal helps your agent explore when extrinsic rewards are sparse.

## Enabling Reward Signals

Reward signals, like other hyperparameters, are defined in the trainer config `.yaml` file. An
example is provided in `config/trainer_config.yaml`. To enable a reward signal, add it to the
`reward_signals:` section under the brain name. For instance, to enable the extrinsic signal
in addition to a small curiosity reward, you would define your `reward_signals` as follows:

```
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.01
gamma: 0.99
encoding_size: 128
```

Each reward signal should define at least two parameters, `strength` and `gamma`, in addition
to any class-specific hyperparameters. Note that to remove a reward signal, you should delete
its entry entirely from `reward_signals`. At least one reward signal should be left defined
at all times.
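
As a minimal sketch of how a trainer might consume this section (the class and function names below, other than `CuriosityRewardSignal` which appears elsewhere in this diff, are illustrative assumptions rather than the PR's actual API):

```python
from typing import Any, Dict


class RewardSignalStub:
    """Stand-in for the reward-signal base class this PR introduces."""

    def __init__(self, strength: float, gamma: float, **kwargs: Any) -> None:
        self.strength = strength      # scales the raw reward
        self.gamma = gamma            # discount factor for this signal's value stream
        self.extra_params = kwargs    # e.g. encoding_size, learning_rate

    def evaluate(self, raw_rewards):
        """Return the strength-scaled rewards for a batch of steps."""
        return [self.strength * r for r in raw_rewards]


def create_reward_signals(config: Dict[str, Dict[str, Any]]) -> Dict[str, RewardSignalStub]:
    """Build one reward-signal object per entry under `reward_signals`, keyed by name."""
    return {name: RewardSignalStub(**params) for name, params in config.items()}


if __name__ == "__main__":
    reward_signal_config = {
        "extrinsic": {"strength": 1.0, "gamma": 0.99},
        "curiosity": {"strength": 0.01, "gamma": 0.99, "encoding_size": 128},
    }
    signals = create_reward_signals(reward_signal_config)
    print(sorted(signals))                       # ['curiosity', 'extrinsic']
    print(signals["curiosity"].evaluate([1.0]))  # [0.01]
```

Keying the signals by name rather than index matches the direction of the later commits in this PR ("Always reference reward signals name and not index").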

## Reward Signal Types

### The Extrinsic Reward Signal

The `extrinsic` reward signal is simply the reward given by the
[environment](Learning-Environment-Design.md). Remove it to force the agent
to ignore the environment reward.

#### Strength

`strength` is the factor by which to multiply the raw
reward. Typical ranges will vary depending on the reward signal.

Typical Range: `0.01` - `1.0`

#### Gamma

`gamma` corresponds to the discount factor for future rewards. This can be
thought of as how far into the future the agent should care about possible
rewards. In situations when the agent should be acting in the present in order
to prepare for rewards in the distant future, this value should be large. In
cases when rewards are more immediate, it can be smaller.

Typical Range: `0.8` - `0.995`
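
For reference, `gamma` enters through the standard discounted return that each signal's value estimate targets:

```latex
G_t = \sum_{k=0}^{\infty} \gamma^{k} \, r_{t+k}
```

A `gamma` of 0.99 weights a reward 100 steps away by roughly \(0.99^{100} \approx 0.37\), while a `gamma` of 0.8 shrinks the same reward to about \(2 \times 10^{-10}\), which is why long-horizon tasks need values near the top of the range.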

### The Curiosity Reward Signal

@chriselion
Review comment (Contributor):
What is this line for?

Reply (Contributor):
I'm supposed to write it at some point. @ervteng do you want to leave this empty for now and I'll do it in another PR?

Reply (Contributor, Author):
That works - let me add a one-liner for this PR so it isn't completely empty in this PR.

Reply (Contributor):
I would like this removed / addressed before merge.


#### Strength

In this case, `strength` corresponds to the magnitude of the curiosity reward generated
by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough
to not be overwhelmed by extrinsic reward signals in the environment.
Likewise it should not be too large to overwhelm the extrinsic reward signal.

Typical Range: `0.1 - `0.001`
Review comment (Contributor):
The ` symbol is misplaced. Also, the range needs to go from smaller to larger.


#### Gamma

`gamma` corresponds to the discount factor for future rewards.

Typical Range: `0.8` - `0.9`
Review comment (Contributor):
Is the gamma really this small?

Reply (Contributor, Author):
This was from GAIL - I think it should be 0.995 actually.


#### Encoding Size

`encoding_size` corresponds to the size of the encoding used by the intrinsic curiosity model.
This value should be small enough to encourage the ICM to compress the original
observation, but also not too small to prevent it from learning the dynamics of
the environment.

Default Value: 64
Typical Range: `64` - `256`

#### Learning Rate

`learning_rate` is the learning rate used to update the intrinsic curiosity module.
This should typically be decreased if training is unstable and the curiosity loss is also unstable.

Default Value: `3e-4`
Typical Range: `1e-5` - `1e-3`
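
To make the roles of `encoding_size`, `strength`, and the forward-model prediction error concrete, here is a minimal NumPy sketch of the idea behind the curiosity reward (the intrinsic curiosity module of Pathak et al.). It is not the TensorFlow implementation in this PR; the random linear maps stand in for the learned encoder and forward model:

```python
import numpy as np

rng = np.random.default_rng(0)
encoding_size = 128
obs_size, action_size = 32, 4

# Stand-ins for the learned encoder and forward model.
W_enc = rng.normal(scale=0.1, size=(obs_size, encoding_size))
W_fwd = rng.normal(scale=0.1, size=(encoding_size + action_size, encoding_size))


def encode(obs: np.ndarray) -> np.ndarray:
    """phi(s): compress the observation into `encoding_size` features."""
    return np.tanh(obs @ W_enc)


def curiosity_reward(obs, action, next_obs, strength=0.01) -> float:
    """Intrinsic reward = strength-scaled prediction error of the forward model."""
    phi, phi_next = encode(obs), encode(next_obs)
    predicted_next = np.tanh(np.concatenate([phi, action]) @ W_fwd)
    return strength * 0.5 * float(np.sum((predicted_next - phi_next) ** 2))


obs, next_obs = rng.normal(size=obs_size), rng.normal(size=obs_size)
action = np.eye(action_size)[1]  # one-hot discrete action
print(curiosity_reward(obs, action, next_obs))
```

States the agent has not visited produce larger prediction errors, and therefore larger intrinsic rewards, which is what drives exploration when the extrinsic reward is sparse.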
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/bc/policy.py
@@ -57,7 +57,7 @@ def evaluate(self, brain_info):
self.model.sequence_length: 1,
}

feed_dict = self._fill_eval_dict(feed_dict, brain_info)
feed_dict = self.fill_eval_dict(feed_dict, brain_info)
if self.use_recurrent:
if brain_info.memories.shape[1] == 0:
brain_info.memories = self.make_empty_memory(len(brain_info.agents))
Empty file.
@@ -0,0 +1 @@
from .reward_signal import *
@@ -0,0 +1 @@
from .signal import CuriosityRewardSignal