diff --git a/demos/Expert3DBall.demo b/demos/Expert3DBall.demo new file mode 100644 index 0000000000..873e1770a8 Binary files /dev/null and b/demos/Expert3DBall.demo differ diff --git a/demos/Expert3DBallHard.demo b/demos/Expert3DBallHard.demo new file mode 100644 index 0000000000..3130d251ca Binary files /dev/null and b/demos/Expert3DBallHard.demo differ diff --git a/demos/ExpertBanana.demo b/demos/ExpertBanana.demo new file mode 100644 index 0000000000..33c86abd14 Binary files /dev/null and b/demos/ExpertBanana.demo differ diff --git a/demos/ExpertBasic.demo b/demos/ExpertBasic.demo new file mode 100644 index 0000000000..6c1c962f98 Binary files /dev/null and b/demos/ExpertBasic.demo differ diff --git a/demos/ExpertBouncer.demo b/demos/ExpertBouncer.demo new file mode 100644 index 0000000000..2ab16a9666 Binary files /dev/null and b/demos/ExpertBouncer.demo differ diff --git a/demos/ExpertCrawlerDyn.demo b/demos/ExpertCrawlerDyn.demo new file mode 100644 index 0000000000..04736d4312 Binary files /dev/null and b/demos/ExpertCrawlerDyn.demo differ diff --git a/demos/ExpertCrawlerSta.demo b/demos/ExpertCrawlerSta.demo new file mode 100644 index 0000000000..9001d074ef Binary files /dev/null and b/demos/ExpertCrawlerSta.demo differ diff --git a/demos/ExpertGrid.demo b/demos/ExpertGrid.demo new file mode 100644 index 0000000000..65f37610fd Binary files /dev/null and b/demos/ExpertGrid.demo differ diff --git a/demos/ExpertHallway.demo b/demos/ExpertHallway.demo new file mode 100644 index 0000000000..ee6de388cb Binary files /dev/null and b/demos/ExpertHallway.demo differ diff --git a/demos/ExpertPush.demo b/demos/ExpertPush.demo new file mode 100644 index 0000000000..4184685ff4 Binary files /dev/null and b/demos/ExpertPush.demo differ diff --git a/demos/ExpertPyramid.demo b/demos/ExpertPyramid.demo new file mode 100644 index 0000000000..c34c60c2ee Binary files /dev/null and b/demos/ExpertPyramid.demo differ diff --git a/demos/ExpertReacher.demo b/demos/ExpertReacher.demo new file mode 100644 index 0000000000..c32c6c7f02 Binary files /dev/null and b/demos/ExpertReacher.demo differ diff --git a/demos/ExpertSoccerGoal.demo b/demos/ExpertSoccerGoal.demo new file mode 100644 index 0000000000..eaad1ec561 Binary files /dev/null and b/demos/ExpertSoccerGoal.demo differ diff --git a/demos/ExpertSoccerStri.demo b/demos/ExpertSoccerStri.demo new file mode 100644 index 0000000000..ca7a3afeb3 Binary files /dev/null and b/demos/ExpertSoccerStri.demo differ diff --git a/demos/ExpertTennis.demo b/demos/ExpertTennis.demo new file mode 100644 index 0000000000..66658cebf8 Binary files /dev/null and b/demos/ExpertTennis.demo differ diff --git a/demos/ExpertWalker.demo b/demos/ExpertWalker.demo new file mode 100644 index 0000000000..875608f046 Binary files /dev/null and b/demos/ExpertWalker.demo differ diff --git a/docs/Training-BehavioralCloning.md b/docs/Training-BehavioralCloning.md new file mode 100644 index 0000000000..427c8db515 --- /dev/null +++ b/docs/Training-BehavioralCloning.md @@ -0,0 +1,92 @@ +# Training with Behavioral Cloning + +There are a variety of possible imitation learning algorithms which can +be used, the simplest one of them is Behavioral Cloning. It works by collecting +demonstrations from a teacher, and then simply uses them to directly learn a +policy, in the same way the supervised learning for image classification +or other traditional Machine Learning tasks work. 
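+
+Conceptually, Behavioral Cloning reduces to supervised learning on recorded
+(observation, action) pairs. The sketch below is purely illustrative and is not the
+toolkit's implementation; the data files and network are hypothetical placeholders:
+
+```
+import numpy as np
+import tensorflow as tf
+
+# Hypothetical arrays of recorded expert data.
+observations = np.load("expert_observations.npy")   # shape: (N, obs_size)
+expert_actions = np.load("expert_actions.npy")      # shape: (N, act_size)
+
+# A small policy network trained to reproduce the expert's continuous actions.
+policy = tf.keras.Sequential([
+    tf.keras.layers.Dense(128, activation="relu",
+                          input_shape=(observations.shape[1],)),
+    tf.keras.layers.Dense(expert_actions.shape[1]),
+])
+policy.compile(optimizer="adam", loss="mse")
+policy.fit(observations, expert_actions, batch_size=64, epochs=10)
+```
+
+For continuous actions this amounts to a regression onto the expert's actions; for
+discrete actions, a cross-entropy loss over the expert's action choices is used instead.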
+ +## Offline Training + +With offline behavioral cloning, we can use demonstrations (`.demo` files) +generated using the `Demonstration Recorder` as the dataset used to train a behavior. + +1. Choose an agent you would like to learn to imitate some set of demonstrations. +2. Record a set of demonstration using the `Demonstration Recorder` (see [here](Training-Imitation-Learning.md)). + For illustrative purposes we will refer to this file as `AgentRecording.demo`. +3. Build the scene, assigning the agent a Learning Brain, and set the Brain to + Control in the Broadcast Hub. For more information on Brains, see + [here](Learning-Environment-Design-Brains.md). +4. Open the `config/offline_bc_config.yaml` file. +5. Modify the `demo_path` parameter in the file to reference the path to the + demonstration file recorded in step 2. In our case this is: + `./UnitySDK/Assets/Demonstrations/AgentRecording.demo` +6. Launch `mlagent-learn`, providing `./config/offline_bc_config.yaml` + as the config parameter, and include the `--run-id` and `--train` as usual. + Provide your environment as the `--env` parameter if it has been compiled + as standalone, or omit to train in the editor. +7. (Optional) Observe training performance using TensorBoard. + +This will use the demonstration file to train a neural network driven agent +to directly imitate the actions provided in the demonstration. The environment +will launch and be used for evaluating the agent's performance during training. + +## Online Training + +It is also possible to provide demonstrations in realtime during training, +without pre-recording a demonstration file. The steps to do this are as follows: + +1. First create two Brains, one which will be the "Teacher," and the other which + will be the "Student." We will assume that the names of the Brain + Assets are "Teacher" and "Student" respectively. +2. The "Teacher" Brain must be a **Player Brain**. You must properly + configure the inputs to map to the corresponding actions. +3. The "Student" Brain must be a **Learning Brain**. +4. The Brain Parameters of both the "Teacher" and "Student" Brains must be + compatible with the agent. +5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub` + and check the `Control` checkbox on the "Student" Brain. +6. Link the Brains to the desired Agents (one Agent as the teacher and at least + one Agent as a student). +7. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set + the `trainer` parameter of this entry to `online_bc`, and the + `brain_to_imitate` parameter to the name of the teacher Brain: "Teacher". + Additionally, set `batches_per_epoch`, which controls how much training to do + each moment. Increase the `max_steps` option if you'd like to keep training + the Agents for a longer period of time. +8. Launch the training process with `mlagents-learn config/online_bc_config.yaml + --train --slow`, and press the :arrow_forward: button in Unity when the + message _"Start training by pressing the Play button in the Unity Editor"_ is + displayed on the screen +9. From the Unity window, control the Agent with the Teacher Brain by providing + "teacher demonstrations" of the behavior you would like to see. +10. Watch as the Agent(s) with the student Brain attached begin to behave + similarly to the demonstrations. +11. Once the Student Agents are exhibiting the desired behavior, end the training + process with `CTL+C` from the command line. +12. 
Move the resulting `*.nn` file into the `TFModels` subdirectory of the
+    Assets folder (or a subdirectory within Assets of your choosing), and use it
+    with a `Learning` Brain.
+
+**BC Teacher Helper**
+
+We provide a convenience utility, the `BC Teacher Helper` component, which you can
+add to the Teacher Agent.
+
+
+
+<!-- screenshot: BC Teacher Helper component -->
+ +This utility enables you to use keyboard shortcuts to do the following: + +1. To start and stop recording experiences. This is useful in case you'd like to + interact with the game _but not have the agents learn from these + interactions_. The default command to toggle this is to press `R` on the + keyboard. + +2. Reset the training buffer. This enables you to instruct the agents to forget + their buffer of recent experiences. This is useful if you'd like to get them + to quickly learn a new behavior. The default command to reset the buffer is + to press `C` on the keyboard. diff --git a/docs/Training-Imitation-Learning.md b/docs/Training-Imitation-Learning.md index 027564cbb6..0e2221b774 100644 --- a/docs/Training-Imitation-Learning.md +++ b/docs/Training-Imitation-Learning.md @@ -10,6 +10,35 @@ from the game and actions from a game controller to guide the medic's behavior. Imitation Learning uses pairs of observations and actions from from a demonstration to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFYs). +Imitation learning can also be used to help reinforcement learning. Especially in +environments with sparse (i.e., infrequent or rare) rewards, the agent may never see +the reward and thus not learn from it. Curiosity helps the agent explore, but in some cases +it is easier to just show the agent how to achieve the reward. In these cases, +imitation learning can dramatically reduce the time it takes to solve the environment. +For instance, on the [Pyramids environment](Learning-Environment-Examples.md#pyramids), +just 6 episodes of demonstrations can reduce training steps by more than 4 times. + ++ +
+ +ML-Agents provides several ways to learn from demonstrations. For most situations, +[GAIL](Training-RewardSignals.md#the-gail-reward-signal) is the preferred approach. + +* To train using GAIL (Generative Adversarial Imitaiton Learning) you can add the + [GAIL reward signal](Training-RewardSignals.md#the-gail-reward-signal). GAIL can be + used with or without environment rewards, and works well when there are a limited + number of demonstrations. +* To help bootstrap reinforcement learning, you can enable + [pretraining](Training-PPO.md#optional-pretraining-using-demonstrations) + on the PPO trainer, in addition to using a small GAIL reward signal. +* To train an agent to exactly mimic demonstrations, you can use the + [Behavioral Cloning](Training-BehavioralCloning.md) trainer. Behavioral Cloning can be + used offline and online (in-editor), and learns very quickly. However, it usually is ineffective + on more complex environments without a large number of demonstrations. + ## Recording Demonstrations It is possible to record demonstrations of agent behavior from the Unity Editor, @@ -43,98 +72,4 @@ inspector. alt="BC Teacher Helper" width="375" border="10" /> - - -## Training with Behavioral Cloning - -There are a variety of possible imitation learning algorithms which can -be used, the simplest one of them is Behavioral Cloning. It works by collecting -demonstrations from a teacher, and then simply uses them to directly learn a -policy, in the same way the supervised learning for image classification -or other traditional Machine Learning tasks work. - - -### Offline Training - -With offline behavioral cloning, we can use demonstrations (`.demo` files) -generated using the `Demonstration Recorder` as the dataset used to train a behavior. - -1. Choose an agent you would like to learn to imitate some set of demonstrations. -2. Record a set of demonstration using the `Demonstration Recorder` (see above). - For illustrative purposes we will refer to this file as `AgentRecording.demo`. -3. Build the scene, assigning the agent a Learning Brain, and set the Brain to - Control in the Broadcast Hub. For more information on Brains, see - [here](Learning-Environment-Design-Brains.md). -4. Open the `config/offline_bc_config.yaml` file. -5. Modify the `demo_path` parameter in the file to reference the path to the - demonstration file recorded in step 2. In our case this is: - `./UnitySDK/Assets/Demonstrations/AgentRecording.demo` -6. Launch `mlagent-learn`, providing `./config/offline_bc_config.yaml` - as the config parameter, and include the `--run-id` and `--train` as usual. - Provide your environment as the `--env` parameter if it has been compiled - as standalone, or omit to train in the editor. -7. (Optional) Observe training performance using TensorBoard. - -This will use the demonstration file to train a neural network driven agent -to directly imitate the actions provided in the demonstration. The environment -will launch and be used for evaluating the agent's performance during training. - -### Online Training - -It is also possible to provide demonstrations in realtime during training, -without pre-recording a demonstration file. The steps to do this are as follows: - -1. First create two Brains, one which will be the "Teacher," and the other which - will be the "Student." We will assume that the names of the Brain - Assets are "Teacher" and "Student" respectively. -2. The "Teacher" Brain must be a **Player Brain**. 
You must properly - configure the inputs to map to the corresponding actions. -3. The "Student" Brain must be a **Learning Brain**. -4. The Brain Parameters of both the "Teacher" and "Student" Brains must be - compatible with the agent. -5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub` - and check the `Control` checkbox on the "Student" Brain. -6. Link the Brains to the desired Agents (one Agent as the teacher and at least - one Agent as a student). -7. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set - the `trainer` parameter of this entry to `online_bc`, and the - `brain_to_imitate` parameter to the name of the teacher Brain: "Teacher". - Additionally, set `batches_per_epoch`, which controls how much training to do - each moment. Increase the `max_steps` option if you'd like to keep training - the Agents for a longer period of time. -8. Launch the training process with `mlagents-learn config/online_bc_config.yaml - --train --slow`, and press the :arrow_forward: button in Unity when the - message _"Start training by pressing the Play button in the Unity Editor"_ is - displayed on the screen -9. From the Unity window, control the Agent with the Teacher Brain by providing - "teacher demonstrations" of the behavior you would like to see. -10. Watch as the Agent(s) with the student Brain attached begin to behave - similarly to the demonstrations. -11. Once the Student Agents are exhibiting the desired behavior, end the training - process with `CTL+C` from the command line. -12. Move the resulting `*.nn` file into the `TFModels` subdirectory of the - Assets folder (or a subdirectory within Assets of your choosing) , and use - with `Learning` Brain. - -**BC Teacher Helper** - -We provide a convenience utility, `BC Teacher Helper` component that you can add -to the Teacher Agent. - -- -
- -This utility enables you to use keyboard shortcuts to do the following: - -1. To start and stop recording experiences. This is useful in case you'd like to - interact with the game _but not have the agents learn from these - interactions_. The default command to toggle this is to press `R` on the - keyboard. - -2. Reset the training buffer. This enables you to instruct the agents to forget - their buffer of recent experiences. This is useful if you'd like to get them - to quickly learn a new behavior. The default command to reset the buffer is - to press `C` on the keyboard. + \ No newline at end of file diff --git a/docs/Training-PPO.md b/docs/Training-PPO.md index 02fabc40fe..b458d60890 100644 --- a/docs/Training-PPO.md +++ b/docs/Training-PPO.md @@ -22,8 +22,7 @@ If you are using curriculum training to pace the difficulty of the learning task presented to an agent, see [Training with Curriculum Learning](Training-Curriculum-Learning.md). -For information about imitation learning, which uses a different training -algorithm, see +For information about imitation learning from demonstrations, see [Training with Imitation Learning](Training-Imitation-Learning.md). ## Best Practices when training with PPO @@ -191,6 +190,73 @@ the agent will need to remember in order to successfully complete the task. Typical Range: `64` - `512` +## (Optional) Pretraining Using Demonstrations + +In some cases, you might want to bootstrap the agent's policy using behavior recorded +from a player. This can help guide the agent towards the reward. Pretraining adds +training operations that mimic a demonstration rather than attempting to maximize reward. +It is essentially equivalent to running [behavioral cloning](./Training-BehavioralCloning.md) +in-line with PPO. + +To use pretraining, add a `pretraining` section to the trainer_config. For instance: + +``` + pretraining: + demo_path: ./demos/ExpertPyramid.demo + strength: 0.5 + steps: 10000 +``` + +Below are the avaliable hyperparameters for pretraining. + +### Strength + +`strength` corresponds to the learning rate of the imitation relative to the learning +rate of PPO, and roughly corresponds to how strongly we allow the behavioral cloning +to influence the policy. + +Typical Range: `0.1` - `0.5` + +### Demo Path + +`demo_path` is the path to your `.demo` file or directory of `.demo` files. +See the [imitation learning guide](Training-ImitationLearning.md) for more on `.demo` files. + +### Steps + +During pretraining, it is often desirable to stop using demonstrations after the agent has +"seen" rewards, and allow it to optimize past the available demonstrations and/or generalize +outside of the provided demonstrations. `steps` corresponds to the training steps over which +pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set +the steps to 0 for constant imitation over the entire training run. + +### (Optional) Batch Size + +`batch_size` is the number of demonstration experiences used for one iteration of a gradient +descent update. If not specified, it will default to the `batch_size` defined for PPO. + +Typical Range (Continuous): `512` - `5120` + +Typical Range (Discrete): `32` - `512` + +### (Optional) Number of Epochs + +`num_epoch` is the number of passes through the experience buffer during +gradient descent. If not specified, it will default to the number of epochs set for PPO. 
+ +Typical Range: `3` - `10` + +### (Optional) Samples Per Update + +`samples_per_update` is the maximum number of samples +to use during each imitation update. You may want to lower this if your demonstration +dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 +to train over all of the demonstrations at each update step. + +Default Value: `0` (all) + +Typical Range: Approximately equal to PPO's `buffer_size` + ## Training Statistics To view training statistics, use TensorBoard. For information on launching and diff --git a/docs/Training-RewardSignals.md b/docs/Training-RewardSignals.md index b0e99e9930..04e61912ed 100644 --- a/docs/Training-RewardSignals.md +++ b/docs/Training-RewardSignals.md @@ -102,7 +102,8 @@ This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. -Default Value: 64 +Default Value: `64` + Typical Range: `64` - `256` #### Learning Rate @@ -111,4 +112,100 @@ Typical Range: `64` - `256` This should typically be decreased if training is unstable, and the curiosity loss is unstable. Default Value: `3e-4` + +Typical Range: `1e-5` - `1e-3` + +### The GAIL Reward Signal + +GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an +imitation learning algorithm that uses an adversarial approach, in a similar vein to GANs +(Generative Adversarial Networks). In this framework, a second neural network, the +discriminator, is taught to distinguish whether an observation/action is from a demonstration, or +produced by the agent. This discriminator can the examine a new observation/action and provide it a +reward based on how close it believes this new observation/action is to the provided demonstrations. + +At each training step, the agent tries to learn how to maximize this reward. Then, the +discriminator is trained to better distinguish between demonstrations and agent state/actions. +In this way, while the agent gets better and better at mimicing the demonstrations, the +discriminator keeps getting stricter and stricter and the agent must try harder to "fool" it. + +This approach, when compared to [Behavioral Cloning](Training-BehavioralCloning.md), requires +far fewer demonstrations to be provided. After all, we are still learning a policy that happens +to be similar to the demonstration, not directly copying the behavior of the demonstrations. It +is also especially effective when combined with an Extrinsic signal, but can also be used +independently to purely learn from demonstration. + +Using GAIL requires recorded demonstrations from your Unity environment. See the +[imitation learning guide](Training-Imitation-Learning.md) to learn more about recording demonstrations. + +#### Strength + +`strength` is the factor by which to multiply the raw reward. Note that when using GAIL +with an Extrinsic Signal, this value should be set lower if your demonstrations are +suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic +rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. + +Typical Range: `0.01` - `1.0` + +#### Gamma + +`gamma` corresponds to the discount factor for future rewards. + +Typical Range: `0.8` - `0.9` + +#### Demo Path + +`demo_path` is the path to your `.demo` file or directory of `.demo` files. See the [imitation learning guide] +(Training-ImitationLearning.md). 
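+
+For reference, a `reward_signals` entry that combines the required GAIL settings
+described above might look like the following (the values are illustrative only):
+
+```
+    reward_signals:
+        gail:
+            strength: 0.01
+            gamma: 0.9
+            demo_path: ./demos/ExpertPyramid.demo
+```
+
+The optional hyperparameters described below can be added under the same `gail:` block.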
+ +#### Encoding Size + +`encoding_size` corresponds to the size of the hidden layer used by the discriminator. +This value should be small enough to encourage the discriminator to compress the original +observation, but also not too small to prevent it from learning to differentiate between +demonstrated and actual behavior. Dramatically increasing this size will also negatively affect +training times. + +Default Value: `64` + +Typical Range: `64` - `256` + +#### Learning Rate + +`learning_rate` is the learning rate used to update the discriminator. +This should typically be decreased if training is unstable, and the GAIL loss is unstable. + +Default Value: `3e-4` + Typical Range: `1e-5` - `1e-3` + +#### Use Actions + +`use_actions` determines whether the discriminator should discriminate based on both +observations and actions, or just observations. Set to `True` if you want the agent to +mimic the actions from the demonstrations, and `False` if you'd rather have the agent +visit the same states as in the demonstrations but with possibly different actions. +Setting to `False` is more likely to be stable, especially with imperfect demonstrations, +but may learn slower. + +Default Value: `false` + +#### (Optional) Samples Per Update + +`samples_per_update` is the maximum number of samples to use during each discriminator update. You may +want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data. +If set to 0, we will use the minimum of buffer size and the number of demonstration samples. + +Default Value: `0` + +Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md) + +#### (Optional) Variational Discriminator Bottleneck + +`use_vail` enables a [variational bottleneck](https://arxiv.org/abs/1810.00821) within the +GAIL discriminator. This forces the discriminator to learn a more general representation +and reduces its tendency to be "too good" at discriminating, making learning more stable. +However, it does increase training time. Enable this if you notice your imitation learning is +unstable, or unable to learn the task at hand. 
+ +Default Value: `false` diff --git a/docs/images/mlagents-ImitationAndRL.png b/docs/images/mlagents-ImitationAndRL.png new file mode 100644 index 0000000000..ffa61d1b11 Binary files /dev/null and b/docs/images/mlagents-ImitationAndRL.png differ diff --git a/ml-agents/mlagents/trainers/components/bc/__init__.py b/ml-agents/mlagents/trainers/components/bc/__init__.py new file mode 100644 index 0000000000..159875b09f --- /dev/null +++ b/ml-agents/mlagents/trainers/components/bc/__init__.py @@ -0,0 +1 @@ +from .module import BCModule diff --git a/ml-agents/mlagents/trainers/components/bc/model.py b/ml-agents/mlagents/trainers/components/bc/model.py new file mode 100644 index 0000000000..56de70e6d2 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/bc/model.py @@ -0,0 +1,101 @@ +import tensorflow as tf +import numpy as np +from mlagents.trainers.models import LearningModel + + +class BCModel(object): + def __init__( + self, + policy_model: LearningModel, + learning_rate: float = 3e-4, + anneal_steps: int = 0, + ): + """ + Tensorflow operations to perform Behavioral Cloning on a Policy model + :param policy_model: The policy of the learning algorithm + :param lr: The initial learning Rate for behavioral cloning + :param anneal_steps: Number of steps over which to anneal BC training + """ + self.policy_model = policy_model + self.expert_visual_in = self.policy_model.visual_in + self.obs_in_expert = self.policy_model.vector_in + self.make_inputs() + self.create_loss(learning_rate, anneal_steps) + + def make_inputs(self) -> None: + """ + Creates the input layers for the discriminator + """ + self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32) + self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32) + + if self.policy_model.brain.vector_action_space_type == "continuous": + action_length = self.policy_model.act_size[0] + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.float32 + ) + self.expert_action = tf.identity(self.action_in_expert) + else: + action_length = len(self.policy_model.act_size) + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.int32 + ) + self.expert_action = tf.concat( + [ + tf.one_hot( + self.action_in_expert[:, i], self.policy_model.act_size[i] + ) + for i in range(len(self.policy_model.act_size)) + ], + axis=1, + ) + + def create_loss(self, learning_rate: float, anneal_steps: int) -> None: + """ + Creates the loss and update nodes for the BC module + :param learning_rate: The learning rate for the optimizer + :param anneal_steps: Number of steps over which to anneal the learning_rate + """ + selected_action = self.policy_model.output + action_size = self.policy_model.act_size + if self.policy_model.brain.vector_action_space_type == "continuous": + self.loss = tf.reduce_mean( + tf.squared_difference(selected_action, self.expert_action) + ) + else: + log_probs = self.policy_model.all_log_probs + action_idx = [0] + list(np.cumsum(action_size)) + entropy = tf.reduce_sum( + ( + tf.stack( + [ + tf.nn.softmax_cross_entropy_with_logits_v2( + labels=tf.nn.softmax( + log_probs[:, action_idx[i] : action_idx[i + 1]] + ), + logits=log_probs[:, action_idx[i] : action_idx[i + 1]], + ) + for i in range(len(action_size)) + ], + axis=1, + ) + ), + axis=1, + ) + self.loss = tf.reduce_mean( + -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action + ) + + if anneal_steps > 0: + self.annealed_learning_rate = tf.train.polynomial_decay( + learning_rate, + self.policy_model.global_step, + 
anneal_steps, + 0.0, + power=1.0, + ) + else: + self.annealed_learning_rate = learning_rate + + optimizer = tf.train.AdamOptimizer(learning_rate=self.annealed_learning_rate) + self.update_batch = optimizer.minimize(self.loss) diff --git a/ml-agents/mlagents/trainers/components/bc/module.py b/ml-agents/mlagents/trainers/components/bc/module.py new file mode 100644 index 0000000000..a3b7f0418c --- /dev/null +++ b/ml-agents/mlagents/trainers/components/bc/module.py @@ -0,0 +1,172 @@ +from typing import Dict, Any +import numpy as np + +from mlagents.trainers.tf_policy import TFPolicy +from .model import BCModel +from mlagents.trainers.demo_loader import demo_to_buffer +from mlagents.trainers.trainer import UnityTrainerException + + +class BCModule: + def __init__( + self, + policy: TFPolicy, + policy_learning_rate: float, + default_batch_size: int, + default_num_epoch: int, + strength: float, + demo_path: str, + steps: int, + batch_size: int = None, + num_epoch: int = None, + samples_per_update: int = 0, + ): + """ + A BC trainer that can be used inline with RL, especially for pretraining. + :param policy: The policy of the learning model + :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate for the pretrainer. + :param default_batch_size: The default batch size to use if batch_size isn't provided. + :param default_num_epoch: The default num_epoch to use if num_epoch isn't provided. + :param strength: The proportion of learning rate used to update through BC. + :param steps: The number of steps to anneal BC training over. 0 for continuous training. + :param demo_path: The path to the demonstration file. + :param batch_size: The batch size to use during BC training. + :param num_epoch: Number of epochs to train for during each update. + :param samples_per_update: Maximum number of samples to train on during each pretraining update. + """ + self.policy = policy + self.current_lr = policy_learning_rate * strength + self.model = BCModel(policy.model, self.current_lr, steps) + _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length) + + self.batch_size = batch_size if batch_size else default_batch_size + self.num_epoch = num_epoch if num_epoch else default_num_epoch + self.n_sequences = max( + min( + self.batch_size, len(self.demonstration_buffer.update_buffer["actions"]) + ) + // policy.sequence_length, + 1, + ) + + self.has_updated = False + self.use_recurrent = self.policy.use_recurrent + self.samples_per_update = samples_per_update + self.out_dict = { + "loss": self.model.loss, + "update": self.model.update_batch, + "learning_rate": self.model.annealed_learning_rate, + } + + @staticmethod + def check_config(config_dict: Dict[str, Any]) -> None: + """ + Check the pretraining config for the required keys. + :param config_dict: Pretraining section of trainer_config + """ + param_keys = ["strength", "demo_path", "steps"] + for k in param_keys: + if k not in config_dict: + raise UnityTrainerException( + "The required pre-training hyper-parameter {0} was not defined. Please check your \ + trainer YAML file.".format( + k + ) + ) + + def update(self) -> Dict[str, Any]: + """ + Updates model using buffer. + :param max_batches: The maximum number of batches to use per update. + :return: The loss of the update. + """ + # Don't continue training if the learning rate has reached 0, to reduce training time. 
+ if self.current_lr <= 0: + return {"Losses/Pretraining Loss": 0} + + batch_losses = [] + possible_demo_batches = ( + len(self.demonstration_buffer.update_buffer["actions"]) // self.n_sequences + ) + possible_batches = possible_demo_batches + + max_batches = self.samples_per_update // self.n_sequences + + n_epoch = self.num_epoch + for _ in range(n_epoch): + self.demonstration_buffer.update_buffer.shuffle() + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches): + demo_update_buffer = self.demonstration_buffer.update_buffer + start = i * self.n_sequences + end = (i + 1) * self.n_sequences + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, self.n_sequences) + loss = run_out["loss"] + self.current_lr = run_out["learning_rate"] + batch_losses.append(loss) + self.has_updated = True + update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)} + return update_stats + + def _update_batch( + self, mini_batch_demo: Dict[str, Any], n_sequences: int + ) -> Dict[str, Any]: + """ + Helper function for update_batch. + """ + feed_dict = { + self.policy.model.batch_size: n_sequences, + self.policy.model.sequence_length: self.policy.sequence_length, + } + if self.policy.model.brain.vector_action_space_type == "continuous": + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, self.policy.model.brain.vector_action_space_size[0]] + ) + feed_dict[self.policy.model.epsilon] = np.random.normal( + size=(1, self.policy.model.act_size[0]) + ) + else: + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, len(self.policy.model.brain.vector_action_space_size)] + ) + feed_dict[self.policy.model.action_masks] = np.ones( + ( + self.n_sequences, + sum(self.policy.model.brain.vector_action_space_size), + ) + ) + if self.policy.model.brain.vector_observation_space_size > 0: + apparent_obs_size = ( + self.policy.model.brain.vector_observation_space_size + * self.policy.model.brain.num_stacked_vector_observations + ) + feed_dict[self.policy.model.vector_in] = mini_batch_demo[ + "vector_obs" + ].reshape([-1, apparent_obs_size]) + for i, _ in enumerate(self.policy.model.visual_in): + visual_obs = mini_batch_demo["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = visual_obs.shape + feed_dict[self.policy.model.visual_in[i]] = visual_obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.policy.model.visual_in[i]] = visual_obs + if self.use_recurrent: + feed_dict[self.policy.model.memory_in] = np.zeros( + [self.n_sequences, self.policy.m_size] + ) + if not self.policy.model.brain.vector_action_space_type == "continuous": + feed_dict[self.policy.model.prev_action] = mini_batch_demo[ + "prev_action" + ].reshape([-1, len(self.policy.model.act_size)]) + + network_out = self.policy.sess.run( + list(self.out_dict.values()), feed_dict=feed_dict + ) + run_out = dict(zip(list(self.out_dict.keys()), network_out)) + return run_out diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py index fa283f3338..68ffce311d 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py @@ -17,9 +17,12 @@ def __init__( """ Creates the Curiosity 
reward generator :param policy: The Learning Policy - :param encoding_size: The size of the Curiosity encoding - :param signal_strength: The scaling parameter for the reward. The scaled reward will be the unscaled + :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled reward multiplied by the strength parameter + :param gamma: The time discounting factor used for this reward. + :param encoding_size: The size of the hidden encoding layer for the ICM + :param learning_rate: The learning rate for the ICM. + :param num_epoch: The number of epochs to train over the training buffer for the ICM. """ super().__init__(policy, strength, gamma) self.model = CuriosityModel( diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/__init__.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/__init__.py new file mode 100644 index 0000000000..77c38345ea --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/__init__.py @@ -0,0 +1 @@ +from .signal import GAILRewardSignal diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py new file mode 100644 index 0000000000..bf040361d8 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py @@ -0,0 +1,265 @@ +from typing import Tuple, List + +import tensorflow as tf +from mlagents.trainers.models import LearningModel + + +class GAILModel(object): + def __init__( + self, + policy_model: LearningModel, + h_size: int = 128, + learning_rate: float = 3e-4, + encoding_size: int = 64, + use_actions: bool = False, + use_vail: bool = False, + ): + """ + The initializer for the GAIL reward generator. + https://arxiv.org/abs/1606.03476 + :param policy_model: The policy of the learning algorithm + :param h_size: Size of the hidden layer for the discriminator + :param learning_rate: The learning Rate for the discriminator + :param encoding_size: The encoding size for the encoder + :param use_actions: Whether or not to use actions to discriminate + :param use_vail: Whether or not to use a variational bottleneck for the + discriminator. See https://arxiv.org/abs/1810.00821. 
+ """ + self.h_size = h_size + self.z_size = 128 + self.alpha = 0.0005 + self.mutual_information = 0.5 + self.policy_model = policy_model + self.encoding_size = encoding_size + self.use_vail = use_vail + self.use_actions = use_actions # True # Not using actions + self.make_beta() + self.make_inputs() + self.create_network() + self.create_loss(learning_rate) + + def make_beta(self) -> None: + """ + Creates the beta parameter and its updater for GAIL + """ + self.beta = tf.get_variable( + "gail_beta", + [], + trainable=False, + dtype=tf.float32, + initializer=tf.ones_initializer(), + ) + self.kl_div_input = tf.placeholder(shape=[], dtype=tf.float32) + new_beta = tf.maximum( + self.beta + self.alpha * (self.kl_div_input - self.mutual_information), 1e-7 + ) + self.update_beta = tf.assign(self.beta, new_beta) + + def make_inputs(self) -> None: + """ + Creates the input layers for the discriminator + """ + self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32) + self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32) + + if self.policy_model.brain.vector_action_space_type == "continuous": + action_length = self.policy_model.act_size[0] + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.float32 + ) + self.expert_action = tf.identity(self.action_in_expert) + else: + action_length = len(self.policy_model.act_size) + self.action_in_expert = tf.placeholder( + shape=[None, action_length], dtype=tf.int32 + ) + self.expert_action = tf.concat( + [ + tf.one_hot( + self.action_in_expert[:, i], self.policy_model.act_size[i] + ) + for i in range(len(self.policy_model.act_size)) + ], + axis=1, + ) + + encoded_policy_list = [] + encoded_expert_list = [] + + if self.policy_model.vec_obs_size > 0: + self.obs_in_expert = tf.placeholder( + shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32 + ) + if self.policy_model.normalize: + encoded_expert_list.append( + self.policy_model.normalize_vector_obs(self.obs_in_expert) + ) + encoded_policy_list.append( + self.policy_model.normalize_vector_obs(self.policy_model.vector_in) + ) + else: + encoded_expert_list.append(self.obs_in_expert) + encoded_policy_list.append(self.policy_model.vector_in) + + if self.policy_model.vis_obs_size > 0: + self.expert_visual_in: List[tf.Tensor] = [] + visual_policy_encoders = [] + visual_expert_encoders = [] + for i in range(self.policy_model.vis_obs_size): + # Create input ops for next (t+1) visual observations. 
+ visual_input = self.policy_model.create_visual_input( + self.policy_model.brain.camera_resolutions[i], + name="visual_observation_" + str(i), + ) + self.expert_visual_in.append(visual_input) + + encoded_policy_visual = self.policy_model.create_visual_observation_encoder( + self.policy_model.visual_in[i], + self.encoding_size, + LearningModel.swish, + 1, + "stream_{}_visual_obs_encoder".format(i), + False, + ) + + encoded_expert_visual = self.policy_model.create_visual_observation_encoder( + self.expert_visual_in[i], + self.encoding_size, + LearningModel.swish, + 1, + "stream_{}_visual_obs_encoder".format(i), + True, + ) + visual_policy_encoders.append(encoded_policy_visual) + visual_expert_encoders.append(encoded_expert_visual) + hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1) + hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1) + encoded_policy_list.append(hidden_policy_visual) + encoded_expert_list.append(hidden_expert_visual) + + self.encoded_expert = tf.concat(encoded_expert_list, axis=1) + self.encoded_policy = tf.concat(encoded_policy_list, axis=1) + + def create_encoder( + self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool + ) -> Tuple[tf.Tensor, tf.Tensor]: + """ + Creates the encoder for the discriminator + :param state_in: The encoded observation input + :param action_in: The action input + :param done_in: The done flags input + :param reuse: If true, the weights will be shared with the previous encoder created + """ + with tf.variable_scope("GAIL_model"): + if self.use_actions: + concat_input = tf.concat([state_in, action_in, done_in], axis=1) + else: + concat_input = state_in + + hidden_1 = tf.layers.dense( + concat_input, + self.h_size, + activation=LearningModel.swish, + name="d_hidden_1", + reuse=reuse, + ) + + hidden_2 = tf.layers.dense( + hidden_1, + self.h_size, + activation=LearningModel.swish, + name="d_hidden_2", + reuse=reuse, + ) + + z_mean = None + if self.use_vail: + # Latent representation + z_mean = tf.layers.dense( + hidden_2, + self.z_size, + reuse=reuse, + name="z_mean", + kernel_initializer=LearningModel.scaled_init(0.01), + ) + + self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32) + + # Sampled latent code + self.z = z_mean + self.z_sigma * self.noise * self.use_noise + estimate_input = self.z + else: + estimate_input = hidden_2 + + estimate = tf.layers.dense( + estimate_input, + 1, + activation=tf.nn.sigmoid, + name="d_estimate", + reuse=reuse, + ) + return estimate, z_mean + + def create_network(self) -> None: + """ + Helper for creating the intrinsic reward nodes + """ + if self.use_vail: + self.z_sigma = tf.get_variable( + "sigma_vail", + self.z_size, + dtype=tf.float32, + initializer=tf.ones_initializer(), + ) + self.z_sigma_sq = self.z_sigma * self.z_sigma + self.z_log_sigma_sq = tf.log(self.z_sigma_sq + 1e-7) + self.use_noise = tf.placeholder( + shape=[1], dtype=tf.float32, name="NoiseLevel" + ) + self.expert_estimate, self.z_mean_expert = self.create_encoder( + self.encoded_expert, self.expert_action, self.done_expert, reuse=False + ) + self.policy_estimate, self.z_mean_policy = self.create_encoder( + self.encoded_policy, + self.policy_model.selected_actions, + self.done_policy, + reuse=True, + ) + self.discriminator_score = tf.reshape( + self.policy_estimate, [-1], name="GAIL_reward" + ) + self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + 1e-7) + + def create_loss(self, learning_rate: float) -> None: + """ + Creates the loss and update nodes for the GAIL 
reward generator + :param learning_rate: The learning rate for the optimizer + """ + self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate) + self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate) + + self.discriminator_loss = -tf.reduce_mean( + tf.log(self.expert_estimate + 1e-7) + + tf.log(1.0 - self.policy_estimate + 1e-7) + ) + + if self.use_vail: + # KL divergence loss (encourage latent representation to be normal) + self.kl_loss = tf.reduce_mean( + -tf.reduce_sum( + 1 + + self.z_log_sigma_sq + - 0.5 * tf.square(self.z_mean_expert) + - 0.5 * tf.square(self.z_mean_policy) + - tf.exp(self.z_log_sigma_sq), + 1, + ) + ) + self.loss = ( + self.beta * (self.kl_loss - self.mutual_information) + + self.discriminator_loss + ) + else: + self.loss = self.discriminator_loss + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + self.update_batch = optimizer.minimize(self.loss) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py new file mode 100644 index 0000000000..b6e62ae913 --- /dev/null +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py @@ -0,0 +1,270 @@ +from typing import Any, Dict, List +import logging +import numpy as np +import tensorflow as tf + +from mlagents.envs.brain import BrainInfo +from mlagents.trainers.buffer import Buffer +from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult +from mlagents.trainers.tf_policy import TFPolicy +from .model import GAILModel +from mlagents.trainers.demo_loader import demo_to_buffer + +LOGGER = logging.getLogger("mlagents.trainers") + + +class GAILRewardSignal(RewardSignal): + def __init__( + self, + policy: TFPolicy, + strength: float, + gamma: float, + demo_path: str, + num_epoch: int = 3, + encoding_size: int = 64, + learning_rate: float = 3e-4, + samples_per_update: int = 0, + use_actions: bool = False, + use_vail: bool = False, + ): + """ + The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476 + :param policy: The policy of the learning model + :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled + reward multiplied by the strength parameter + :param gamma: The time discounting factor used for this reward. + :param demo_path: The path to the demonstration file + :param encoding_size: The size of the the hidden layers of the discriminator + :param learning_rate: The Learning Rate used during GAIL updates. + :param samples_per_update: The maximum number of samples to update during GAIL updates. + :param use_actions: Whether or not to use the actions for the discriminator. + :param use_vail: Whether or not to use a variational bottleneck for the discriminator. + See https://arxiv.org/abs/1810.00821. 
+ """ + super().__init__(policy, strength, gamma) + self.num_epoch = num_epoch + self.samples_per_update = samples_per_update + + self.model = GAILModel( + policy.model, 128, learning_rate, encoding_size, use_actions, use_vail + ) + _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length) + self.has_updated = False + + def evaluate( + self, current_info: BrainInfo, next_info: BrainInfo + ) -> RewardSignalResult: + if len(current_info.agents) == 0: + return [] + + feed_dict: Dict[tf.Tensor, Any] = { + self.policy.model.batch_size: len(next_info.vector_observations), + self.policy.model.sequence_length: 1, + } + if self.model.use_vail: + feed_dict[self.model.use_noise] = [0] + + feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info) + feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1]) + if self.policy.use_continuous_act: + feed_dict[ + self.policy.model.selected_actions + ] = next_info.previous_vector_actions + else: + feed_dict[ + self.policy.model.action_holder + ] = next_info.previous_vector_actions + if self.policy.use_recurrent: + if current_info.memories.shape[1] == 0: + current_info.memories = self.policy.make_empty_memory( + len(current_info.agents) + ) + feed_dict[self.policy.model.memory_in] = current_info.memories + unscaled_reward = self.policy.sess.run( + self.model.intrinsic_reward, feed_dict=feed_dict + ) + scaled_reward = unscaled_reward * float(self.has_updated) * self.strength + return RewardSignalResult(scaled_reward, unscaled_reward) + + @classmethod + def check_config( + cls, config_dict: Dict[str, Any], param_keys: List[str] = None + ) -> None: + """ + Checks the config and throw an exception if a hyperparameter is missing. GAIL requires strength and gamma + at minimum. + """ + param_keys = ["strength", "gamma", "demo_path"] + super().check_config(config_dict, param_keys) + + def update(self, update_buffer: Buffer, n_sequences: int) -> Dict[str, float]: + """ + Updates model using buffer. + :param update_buffer: The policy buffer containing the trajectories for the current policy. + :param n_sequences: The number of sequences from demo and policy used in each mini batch. + :return: The loss of the update. 
+ """ + batch_losses = [] + # Divide by 2 since we have two buffers, so we have roughly the same batch size + n_sequences = max(n_sequences // 2, 1) + possible_demo_batches = ( + len(self.demonstration_buffer.update_buffer["actions"]) // n_sequences + ) + possible_policy_batches = len(update_buffer["actions"]) // n_sequences + possible_batches = min(possible_policy_batches, possible_demo_batches) + + max_batches = self.samples_per_update // n_sequences + + kl_loss = [] + policy_estimate = [] + expert_estimate = [] + z_log_sigma_sq = [] + z_mean_expert = [] + z_mean_policy = [] + + n_epoch = self.num_epoch + for _epoch in range(n_epoch): + self.demonstration_buffer.update_buffer.shuffle() + update_buffer.shuffle() + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches): + demo_update_buffer = self.demonstration_buffer.update_buffer + policy_update_buffer = update_buffer + start = i * n_sequences + end = (i + 1) * n_sequences + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + mini_batch_policy = policy_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, mini_batch_policy) + loss = run_out["gail_loss"] + + policy_estimate.append(run_out["policy_estimate"]) + expert_estimate.append(run_out["expert_estimate"]) + if self.model.use_vail: + kl_loss.append(run_out["kl_loss"]) + z_log_sigma_sq.append(run_out["z_log_sigma_sq"]) + z_mean_policy.append(run_out["z_mean_policy"]) + z_mean_expert.append(run_out["z_mean_expert"]) + + batch_losses.append(loss) + self.has_updated = True + + print_list = ["n_epoch", "beta", "policy_estimate", "expert_estimate"] + print_vals = [ + n_epoch, + self.policy.sess.run(self.model.beta), + np.mean(policy_estimate), + np.mean(expert_estimate), + ] + if self.model.use_vail: + print_list += [ + "kl_loss", + "z_mean_expert", + "z_mean_policy", + "z_log_sigma_sq", + ] + print_vals += [ + np.mean(kl_loss), + np.mean(z_mean_expert), + np.mean(z_mean_policy), + np.mean(z_log_sigma_sq), + ] + LOGGER.debug( + "GAIL Debug:\n\t\t" + + "\n\t\t".join( + "{0}: {1}".format(_name, _val) + for _name, _val in zip(print_list, print_vals) + ) + ) + update_stats = {"Losses/GAIL Loss": np.mean(batch_losses)} + return update_stats + + def _update_batch( + self, + mini_batch_demo: Dict[str, np.ndarray], + mini_batch_policy: Dict[str, np.ndarray], + ) -> Dict[str, float]: + """ + Helper method for update. + :param mini_batch_demo: A mini batch of expert trajectories + :param mini_batch_policy: A mini batch of trajectories sampled from the current policy + :return: Output from update process. 
+ """ + feed_dict: Dict[tf.Tensor, Any] = { + self.model.done_expert: mini_batch_demo["done"].reshape([-1, 1]), + self.model.done_policy: mini_batch_policy["done"].reshape([-1, 1]), + } + + if self.model.use_vail: + feed_dict[self.model.use_noise] = [1] + + if self.policy.use_continuous_act: + feed_dict[self.policy.model.selected_actions] = mini_batch_policy[ + "actions" + ].reshape([-1, self.policy.model.act_size[0]]) + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, self.policy.model.act_size[0]] + ) + else: + feed_dict[self.policy.model.action_holder] = mini_batch_policy[ + "actions" + ].reshape([-1, len(self.policy.model.act_size)]) + feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape( + [-1, len(self.policy.model.act_size)] + ) + + if self.policy.use_vis_obs > 0: + for i in range(len(self.policy.model.visual_in)): + policy_obs = mini_batch_policy["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = policy_obs.shape + feed_dict[self.policy.model.visual_in[i]] = policy_obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.policy.model.visual_in[i]] = policy_obs + + demo_obs = mini_batch_demo["visual_obs%d" % i] + if self.policy.sequence_length > 1 and self.policy.use_recurrent: + (_batch, _seq, _w, _h, _c) = demo_obs.shape + feed_dict[self.model.expert_visual_in[i]] = demo_obs.reshape( + [-1, _w, _h, _c] + ) + else: + feed_dict[self.model.expert_visual_in[i]] = demo_obs + if self.policy.use_vec_obs: + feed_dict[self.policy.model.vector_in] = mini_batch_policy[ + "vector_obs" + ].reshape([-1, self.policy.vec_obs_size]) + feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"].reshape( + [-1, self.policy.vec_obs_size] + ) + + out_dict = { + "gail_loss": self.model.loss, + "update_batch": self.model.update_batch, + "policy_estimate": self.model.policy_estimate, + "expert_estimate": self.model.expert_estimate, + } + if self.model.use_vail: + out_dict["kl_loss"] = self.model.kl_loss + out_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq + out_dict["z_mean_expert"] = self.model.z_mean_expert + out_dict["z_mean_policy"] = self.model.z_mean_policy + + run_out = self.policy.sess.run(out_dict, feed_dict=feed_dict) + if self.model.use_vail: + self.update_beta(run_out["kl_loss"]) + return run_out + + def update_beta(self, kl_div: float) -> None: + """ + Updates the Beta parameter with the latest kl_divergence value. + The larger Beta, the stronger the importance of the kl divergence in the loss function. 
+ :param kl_div: The KL divergence + """ + self.policy.sess.run( + self.model.update_beta, feed_dict={self.model.kl_div_input: kl_div} + ) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py index 332b33498c..0866147e21 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py @@ -6,6 +6,7 @@ from mlagents.trainers.components.reward_signals.extrinsic.signal import ( ExtrinsicRewardSignal, ) +from mlagents.trainers.components.reward_signals.gail.signal import GAILRewardSignal from mlagents.trainers.components.reward_signals.curiosity.signal import ( CuriosityRewardSignal, ) @@ -17,6 +18,7 @@ NAME_TO_CLASS: Dict[str, Type[RewardSignal]] = { "extrinsic": ExtrinsicRewardSignal, "curiosity": CuriosityRewardSignal, + "gail": GAILRewardSignal, } diff --git a/ml-agents/mlagents/trainers/demo_loader.py b/ml-agents/mlagents/trainers/demo_loader.py index f9e5e3b39d..2043ea4d85 100644 --- a/ml-agents/mlagents/trainers/demo_loader.py +++ b/ml-agents/mlagents/trainers/demo_loader.py @@ -1,16 +1,23 @@ import pathlib import logging import os +from typing import List, Tuple from mlagents.trainers.buffer import Buffer from mlagents.envs.brain import BrainParameters, BrainInfo -from mlagents.envs.communicator_objects import * +from mlagents.envs.communicator_objects import ( + AgentInfoProto, + BrainParametersProto, + DemonstrationMetaProto, +) from google.protobuf.internal.decoder import _DecodeVarint32 # type: ignore logger = logging.getLogger("mlagents.trainers") -def make_demo_buffer(brain_infos, brain_params, sequence_length): +def make_demo_buffer( + brain_infos: List[BrainInfo], brain_params: BrainParameters, sequence_length: int +) -> Buffer: # Create and populate buffer using experiences demo_buffer = Buffer() for idx, experience in enumerate(brain_infos): @@ -44,7 +51,9 @@ def make_demo_buffer(brain_infos, brain_params, sequence_length): return demo_buffer -def demo_to_buffer(file_path, sequence_length): +def demo_to_buffer( + file_path: str, sequence_length: int +) -> Tuple[BrainParameters, Buffer]: """ Loads demonstration file and uses it to fill training buffer. :param file_path: Location of demonstration file (.demo). @@ -56,7 +65,7 @@ def demo_to_buffer(file_path, sequence_length): return brain_params, demo_buffer -def load_demonstration(file_path): +def load_demonstration(file_path: str) -> Tuple[BrainParameters, List[BrainInfo], int]: """ Loads and parses a demonstration file. :param file_path: Location of demonstration file (.demo). @@ -70,32 +79,34 @@ def load_demonstration(file_path): all_files = os.listdir(file_path) for _file in all_files: if _file.endswith(".demo"): - file_paths.append(_file) + file_paths.append(os.path.join(file_path, _file)) + if not all_files: + raise ValueError("There are no '.demo' files in the provided directory.") elif os.path.isfile(file_path): file_paths.append(file_path) + file_extension = pathlib.Path(file_path).suffix + if file_extension != ".demo": + raise ValueError( + "The file is not a '.demo' file. Please provide a file with the " + "correct extension." + ) else: raise FileNotFoundError( "The demonstration file or directory {} does not exist.".format(file_path) ) - file_extension = pathlib.Path(file_path).suffix - if file_extension != ".demo": - raise ValueError( - "The file is not a '.demo' file. 
Please provide a file with the " - "correct extension." - ) brain_params = None brain_infos = [] + total_expected = 0 for _file_path in file_paths: data = open(_file_path, "rb").read() next_pos, pos, obs_decoded = 0, 0, 0 - total_expected = 0 while pos < len(data): next_pos, pos = _DecodeVarint32(data, pos) if obs_decoded == 0: meta_data_proto = DemonstrationMetaProto() meta_data_proto.ParseFromString(data[pos : pos + next_pos]) - total_expected = meta_data_proto.number_steps + total_expected += meta_data_proto.number_steps pos = INITIAL_POS if obs_decoded == 1: brain_param_proto = BrainParametersProto() diff --git a/ml-agents/mlagents/trainers/ppo/policy.py b/ml-agents/mlagents/trainers/ppo/policy.py index 0c010a0c97..9a8c3dc417 100644 --- a/ml-agents/mlagents/trainers/ppo/policy.py +++ b/ml-agents/mlagents/trainers/ppo/policy.py @@ -7,6 +7,7 @@ from mlagents.trainers.components.reward_signals.reward_signal_factory import ( create_reward_signal, ) +from mlagents.trainers.components.bc.module import BCModule logger = logging.getLogger("mlagents.trainers") @@ -49,6 +50,19 @@ def __init__(self, seed, brain, trainer_params, is_training, load): self, reward_signal, config ) + # Create pretrainer if needed + if "pretraining" in trainer_params: + BCModule.check_config(trainer_params["pretraining"]) + self.bc_module = BCModule( + self, + policy_learning_rate=trainer_params["learning_rate"], + default_batch_size=trainer_params["batch_size"], + default_num_epoch=trainer_params["num_epoch"], + **trainer_params["pretraining"], + ) + else: + self.bc_module = None + if load: self._load_graph() else: diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index a50cebb4d3..3d1249e290 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -473,6 +473,10 @@ def update_policy(self): ) for stat, val in update_stats.items(): self.stats[stat].append(val) + if self.policy.bc_module: + update_stats = self.policy.bc_module.update() + for stat, val in update_stats.items(): + self.stats[stat].append(val) self.training_buffer.reset_update_buffer() self.trainer_metrics.end_policy_update() diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py index 96c0e3fe0c..1fcfbbc710 100644 --- a/ml-agents/mlagents/trainers/tests/mock_brain.py +++ b/ml-agents/mlagents/trainers/tests/mock_brain.py @@ -2,6 +2,8 @@ import pytest import numpy as np +from mlagents.trainers.buffer import Buffer + def create_mock_brainparams( number_visual_observations=0, @@ -90,3 +92,50 @@ def setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo): mock_env.return_value.brain_names = ["MockBrain"] mock_env.return_value.reset.return_value = {"MockBrain": mock_braininfo} mock_env.return_value.step.return_value = {"MockBrain": mock_braininfo} + + +def simulate_rollout(env, policy, buffer_init_samples): + brain_info_list = [] + for i in range(buffer_init_samples): + brain_info_list.append(env.step()[env.brain_names[0]]) + buffer = create_buffer(brain_info_list, policy.brain, policy.sequence_length) + return buffer + + +def create_buffer(brain_infos, brain_params, sequence_length): + buffer = Buffer() + # Make a buffer + for idx, experience in enumerate(brain_infos): + if idx > len(brain_infos) - 2: + break + current_brain_info = brain_infos[idx] + next_brain_info = brain_infos[idx + 1] + buffer[0].last_brain_info = current_brain_info + buffer[0]["done"].append(next_brain_info.local_done[0]) + 
buffer[0]["rewards"].append(next_brain_info.rewards[0]) + for i in range(brain_params.number_visual_observations): + buffer[0]["visual_obs%d" % i].append( + current_brain_info.visual_observations[i][0] + ) + buffer[0]["next_visual_obs%d" % i].append( + current_brain_info.visual_observations[i][0] + ) + if brain_params.vector_observation_space_size > 0: + buffer[0]["vector_obs"].append(current_brain_info.vector_observations[0]) + buffer[0]["next_vector_in"].append( + current_brain_info.vector_observations[0] + ) + buffer[0]["actions"].append(next_brain_info.previous_vector_actions[0]) + buffer[0]["prev_action"].append(current_brain_info.previous_vector_actions[0]) + buffer[0]["masks"].append(1.0) + buffer[0]["advantages"].append(1.0) + buffer[0]["action_probs"].append(np.ones(buffer[0]["actions"][0].shape)) + buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape)) + buffer[0]["random_normal_epsilon"].append( + np.ones(buffer[0]["actions"][0].shape) + ) + buffer[0]["action_mask"].append(np.ones(buffer[0]["actions"][0].shape)) + buffer[0]["memory"].append(np.ones(8)) + + buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length) + return buffer diff --git a/ml-agents/mlagents/trainers/tests/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/test_bcmodule.py new file mode 100644 index 0000000000..0eee0f4d2e --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py @@ -0,0 +1,158 @@ +import unittest.mock as mock +import pytest +import mlagents.trainers.tests.mock_brain as mb + +import numpy as np +import yaml +import os + +from mlagents.trainers.ppo.policy import PPOPolicy + + +@pytest.fixture +def dummy_config(): + return yaml.safe_load( + """ + trainer: ppo + batch_size: 32 + beta: 5.0e-3 + buffer_size: 512 + epsilon: 0.2 + hidden_units: 128 + lambd: 0.95 + learning_rate: 3.0e-4 + max_steps: 5.0e4 + normalize: true + num_epoch: 5 + num_layers: 2 + time_horizon: 64 + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + memory_size: 8 + pretraining: + demo_path: ./demos/ExpertPyramid.demo + strength: 1.0 + steps: 10000000 + reward_signals: + extrinsic: + strength: 1.0 + gamma: 0.99 + """ + ) + + +def create_mock_3dball_brain(): + mock_brain = mb.create_mock_brainparams( + vector_action_space_type="continuous", + vector_action_space_size=[2], + vector_observation_space_size=8, + ) + return mock_brain + + +def create_mock_banana_brain(): + mock_brain = mb.create_mock_brainparams( + number_visual_observations=1, + vector_action_space_type="discrete", + vector_action_space_size=[3, 3, 3, 2], + vector_observation_space_size=0, + ) + return mock_brain + + +def create_ppo_policy_with_bc_mock( + mock_env, mock_brain, dummy_config, use_rnn, demo_file +): + mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8) + mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo) + env = mock_env() + + trainer_parameters = dummy_config + model_path = env.brain_names[0] + trainer_parameters["model_path"] = model_path + trainer_parameters["keep_checkpoints"] = 3 + trainer_parameters["use_recurrent"] = use_rnn + trainer_parameters["pretraining"]["demo_path"] = ( + os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file + ) + policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False) + return env, policy + + +# Test default values +@mock.patch("mlagents.envs.UnityEnvironment") +def test_bcmodule_defaults(mock_env, dummy_config): + # See if default values match + mock_brain = 
+    mock_brain = create_mock_3dball_brain()
+    env, policy = create_ppo_policy_with_bc_mock(
+        mock_env, mock_brain, dummy_config, False, "test.demo"
+    )
+    assert policy.bc_module.num_epoch == dummy_config["num_epoch"]
+    assert policy.bc_module.batch_size == dummy_config["batch_size"]
+    env.close()
+    # Assign strange values and see if it overrides properly
+    dummy_config["pretraining"]["num_epoch"] = 100
+    dummy_config["pretraining"]["batch_size"] = 10000
+    env, policy = create_ppo_policy_with_bc_mock(
+        mock_env, mock_brain, dummy_config, False, "test.demo"
+    )
+    assert policy.bc_module.num_epoch == 100
+    assert policy.bc_module.batch_size == 10000
+    env.close()
+
+
+# Test with continuous control env and vector actions
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_bcmodule_update(mock_env, dummy_config):
+    mock_brain = create_mock_3dball_brain()
+    env, policy = create_ppo_policy_with_bc_mock(
+        mock_env, mock_brain, dummy_config, False, "test.demo"
+    )
+    stats = policy.bc_module.update()
+    for _, item in stats.items():
+        assert isinstance(item, np.float32)
+    env.close()
+
+
+# Test with RNN
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_bcmodule_rnn_update(mock_env, dummy_config):
+    mock_brain = create_mock_3dball_brain()
+    env, policy = create_ppo_policy_with_bc_mock(
+        mock_env, mock_brain, dummy_config, True, "test.demo"
+    )
+    stats = policy.bc_module.update()
+    for _, item in stats.items():
+        assert isinstance(item, np.float32)
+    env.close()
+
+
+# Test with discrete control and visual observations
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_bcmodule_dc_visual_update(mock_env, dummy_config):
+    mock_brain = create_mock_banana_brain()
+    env, policy = create_ppo_policy_with_bc_mock(
+        mock_env, mock_brain, dummy_config, False, "testdcvis.demo"
+    )
+    stats = policy.bc_module.update()
+    for _, item in stats.items():
+        assert isinstance(item, np.float32)
+    env.close()
+
+
+# Test with discrete control, visual observations and RNN
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_bcmodule_rnn_dc_update(mock_env, dummy_config):
+    mock_brain = create_mock_banana_brain()
+    env, policy = create_ppo_policy_with_bc_mock(
+        mock_env, mock_brain, dummy_config, True, "testdcvis.demo"
+    )
+    stats = policy.bc_module.update()
+    for _, item in stats.items():
+        assert isinstance(item, np.float32)
+    env.close()
+
+
+if __name__ == "__main__":
+    pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/test_demo_dir/test.demo b/ml-agents/mlagents/trainers/tests/test_demo_dir/test.demo
new file mode 100644
index 0000000000..3148108ca0
Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/test_demo_dir/test.demo differ
diff --git a/ml-agents/mlagents/trainers/tests/test_demo_dir/test2.demo b/ml-agents/mlagents/trainers/tests/test_demo_dir/test2.demo
new file mode 100644
index 0000000000..3148108ca0
Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/test_demo_dir/test2.demo differ
diff --git a/ml-agents/mlagents/trainers/tests/test_demo_dir/test3.demo b/ml-agents/mlagents/trainers/tests/test_demo_dir/test3.demo
new file mode 100644
index 0000000000..3148108ca0
Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/test_demo_dir/test3.demo differ
diff --git a/ml-agents/mlagents/trainers/tests/test_demo_loader.py b/ml-agents/mlagents/trainers/tests/test_demo_loader.py
index b6c029780e..765c2df18e 100644
--- a/ml-agents/mlagents/trainers/tests/test_demo_loader.py
+++ b/ml-agents/mlagents/trainers/tests/test_demo_loader.py
@@ -16,3 +16,16 @@ def test_load_demo():
     demo_buffer = make_demo_buffer(brain_infos, brain_parameters, 1)
     assert len(demo_buffer.update_buffer["actions"]) == total_expected - 1
+
+
+def test_load_demo_dir():
+    path_prefix = os.path.dirname(os.path.abspath(__file__))
+    brain_parameters, brain_infos, total_expected = load_demonstration(
+        path_prefix + "/test_demo_dir"
+    )
+    assert brain_parameters.brain_name == "Ball3DBrain"
+    assert brain_parameters.vector_observation_space_size == 8
+    assert len(brain_infos) == total_expected
+
+    demo_buffer = make_demo_buffer(brain_infos, brain_parameters, 1)
+    assert len(demo_buffer.update_buffer["actions"]) == total_expected - 1
diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
index b499df0eb1..1b695788c3 100644
--- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py
+++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
@@ -10,6 +10,7 @@
 from mlagents.trainers.ppo.models import PPOModel
 from mlagents.trainers.ppo.trainer import discount_rewards
 from mlagents.trainers.ppo.policy import PPOPolicy
+from mlagents.trainers.demo_loader import make_demo_buffer
 from mlagents.envs import UnityEnvironment
 from mlagents.envs.mock_communicator import MockCommunicator
 
@@ -45,11 +46,30 @@ def dummy_config():
     )
 
 
+@pytest.fixture
+def gail_dummy_config():
+    return {
+        "gail": {
+            "strength": 0.1,
+            "gamma": 0.9,
+            "encoding_size": 128,
+            "demo_path": os.path.dirname(os.path.abspath(__file__)) + "/test.demo",
+        }
+    }
+
+
 @pytest.fixture
 def curiosity_dummy_config():
     return {"curiosity": {"strength": 0.1, "gamma": 0.9, "encoding_size": 128}}
 
 
+VECTOR_ACTION_SPACE = [2]
+VECTOR_OBS_SPACE = 8
+DISCRETE_ACTION_SPACE = [2]
+BUFFER_INIT_SAMPLES = 20
+NUM_AGENTS = 12
+
+
 def create_ppo_policy_mock(
     mock_env, dummy_config, reward_signal_config, use_rnn, use_discrete, use_visual
 ):
@@ -57,26 +77,34 @@ def create_ppo_policy_mock(
     if not use_visual:
         mock_brain = mb.create_mock_brainparams(
             vector_action_space_type="discrete" if use_discrete else "continuous",
-            vector_action_space_size=[2],
-            vector_observation_space_size=8,
+            vector_action_space_size=DISCRETE_ACTION_SPACE
+            if use_discrete
+            else VECTOR_ACTION_SPACE,
+            vector_observation_space_size=VECTOR_OBS_SPACE,
         )
         mock_braininfo = mb.create_mock_braininfo(
-            num_agents=12,
-            num_vector_observations=8,
-            num_vector_acts=2,
+            num_agents=NUM_AGENTS,
+            num_vector_observations=VECTOR_OBS_SPACE,
+            num_vector_acts=sum(
+                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
+            ),
             discrete=use_discrete,
         )
     else:
         mock_brain = mb.create_mock_brainparams(
             vector_action_space_type="discrete" if use_discrete else "continuous",
-            vector_action_space_size=[2],
+            vector_action_space_size=DISCRETE_ACTION_SPACE
+            if use_discrete
+            else VECTOR_ACTION_SPACE,
             vector_observation_space_size=0,
             number_visual_observations=1,
        )
         mock_braininfo = mb.create_mock_braininfo(
-            num_agents=12,
+            num_agents=NUM_AGENTS,
             num_vis_observations=1,
-            num_vector_acts=2,
+            num_vector_acts=sum(
+                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
+            ),
             discrete=use_discrete,
         )
     mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
@@ -92,64 +120,106 @@ def create_ppo_policy_mock(
     return env, policy
 
 
-@mock.patch("mlagents.envs.UnityEnvironment")
-def test_curiosity_cc_evaluate(mock_env, dummy_config, curiosity_dummy_config):
-    env, policy = create_ppo_policy_mock(
-        mock_env, dummy_config, curiosity_dummy_config, False, False, False
-    )
+def reward_signal_eval(env, policy, reward_signal_name):
     brain_infos = env.reset()
     brain_info = brain_infos[env.brain_names[0]]
     next_brain_info = env.step()[env.brain_names[0]]
-    scaled_reward, unscaled_reward = policy.reward_signals["curiosity"].evaluate(
+    # Test evaluate
+    rsig_result = policy.reward_signals[reward_signal_name].evaluate(
         brain_info, next_brain_info
     )
-    assert scaled_reward.shape == (12,)
-    assert unscaled_reward.shape == (12,)
+    assert rsig_result.scaled_reward.shape == (NUM_AGENTS,)
+    assert rsig_result.unscaled_reward.shape == (NUM_AGENTS,)
+
+
+def reward_signal_update(env, policy, reward_signal_name):
+    buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
+    out = policy.reward_signals[reward_signal_name].update(buffer.update_buffer, 2)
+    assert type(out) is dict
 
 
 @mock.patch("mlagents.envs.UnityEnvironment")
-def test_curiosity_dc_evaluate(mock_env, dummy_config, curiosity_dummy_config):
+def test_gail_cc(mock_env, dummy_config, gail_dummy_config):
     env, policy = create_ppo_policy_mock(
-        mock_env, dummy_config, curiosity_dummy_config, False, True, False
+        mock_env, dummy_config, gail_dummy_config, False, False, False
     )
-    brain_infos = env.reset()
-    brain_info = brain_infos[env.brain_names[0]]
-    next_brain_info = env.step()[env.brain_names[0]]
-    scaled_reward, unscaled_reward = policy.reward_signals["curiosity"].evaluate(
-        brain_info, next_brain_info
+    reward_signal_eval(env, policy, "gail")
+    reward_signal_update(env, policy, "gail")
+
+
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_gail_dc(mock_env, dummy_config, gail_dummy_config):
+    env, policy = create_ppo_policy_mock(
+        mock_env, dummy_config, gail_dummy_config, False, True, False
     )
-    assert scaled_reward.shape == (12,)
-    assert unscaled_reward.shape == (12,)
+    reward_signal_eval(env, policy, "gail")
+    reward_signal_update(env, policy, "gail")
 
 
 @mock.patch("mlagents.envs.UnityEnvironment")
-def test_curiosity_visual_evaluate(mock_env, dummy_config, curiosity_dummy_config):
+def test_gail_visual(mock_env, dummy_config, gail_dummy_config):
+    gail_dummy_config["gail"]["demo_path"] = (
+        os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"
+    )
     env, policy = create_ppo_policy_mock(
-        mock_env, dummy_config, curiosity_dummy_config, False, False, True
+        mock_env, dummy_config, gail_dummy_config, False, True, True
     )
-    brain_infos = env.reset()
-    brain_info = brain_infos[env.brain_names[0]]
-    next_brain_info = env.step()[env.brain_names[0]]
-    scaled_reward, unscaled_reward = policy.reward_signals["curiosity"].evaluate(
-        brain_info, next_brain_info
+    reward_signal_eval(env, policy, "gail")
+    reward_signal_update(env, policy, "gail")
+
+
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_gail_rnn(mock_env, dummy_config, gail_dummy_config):
+    env, policy = create_ppo_policy_mock(
+        mock_env, dummy_config, gail_dummy_config, True, False, False
+    )
+    reward_signal_eval(env, policy, "gail")
+    reward_signal_update(env, policy, "gail")
+
+
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_curiosity_cc(mock_env, dummy_config, curiosity_dummy_config):
+    env, policy = create_ppo_policy_mock(
+        mock_env, dummy_config, curiosity_dummy_config, False, False, False
+    )
+    reward_signal_eval(env, policy, "curiosity")
+    reward_signal_update(env, policy, "curiosity")
+
+
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_curiosity_dc(mock_env, dummy_config, curiosity_dummy_config):
+    env, policy = create_ppo_policy_mock(
+        mock_env, dummy_config, curiosity_dummy_config, False, True, False
+    )
policy, "curiosity") + reward_signal_update(env, policy, "curiosity") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_curiosity_visual(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, False, False, True ) - assert scaled_reward.shape == (12,) - assert unscaled_reward.shape == (12,) + reward_signal_eval(env, policy, "curiosity") + reward_signal_update(env, policy, "curiosity") @mock.patch("mlagents.envs.UnityEnvironment") -def test_curiosity_rnn_evaluate(mock_env, dummy_config, curiosity_dummy_config): +def test_curiosity_rnn(mock_env, dummy_config, curiosity_dummy_config): env, policy = create_ppo_policy_mock( mock_env, dummy_config, curiosity_dummy_config, True, False, False ) - brain_infos = env.reset() - brain_info = brain_infos[env.brain_names[0]] - next_brain_info = env.step()[env.brain_names[0]] - scaled_reward, unscaled_reward = policy.reward_signals["curiosity"].evaluate( - brain_info, next_brain_info + reward_signal_eval(env, policy, "curiosity") + reward_signal_update(env, policy, "curiosity") + + +@mock.patch("mlagents.envs.UnityEnvironment") +def test_extrinsic(mock_env, dummy_config, curiosity_dummy_config): + env, policy = create_ppo_policy_mock( + mock_env, dummy_config, curiosity_dummy_config, False, False, False ) - assert scaled_reward.shape == (12,) - assert unscaled_reward.shape == (12,) + reward_signal_eval(env, policy, "extrinsic") + reward_signal_update(env, policy, "extrinsic") if __name__ == "__main__": diff --git a/ml-agents/mlagents/trainers/tests/testdcvis.demo b/ml-agents/mlagents/trainers/tests/testdcvis.demo new file mode 100644 index 0000000000..b46b1c664b Binary files /dev/null and b/ml-agents/mlagents/trainers/tests/testdcvis.demo differ