From c6f51111657f547724a3d54eab5b0e56b58732a6 Mon Sep 17 00:00:00 2001
From: Ervin T
Date: Fri, 2 Aug 2019 17:10:46 -0700
Subject: [PATCH 1/8] Fix BCTrainer increment_steps (#2384)

---
 ml-agents/mlagents/trainers/bc/trainer.py    |  7 +++--
 .../mlagents/trainers/tests/mock_brain.py    | 20 +++++++++++++
 ml-agents/mlagents/trainers/tests/test_bc.py | 30 +++++++++++++++++++
 .../mlagents/trainers/tests/test_bcmodule.py | 29 ++++---------------
 4 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/ml-agents/mlagents/trainers/bc/trainer.py b/ml-agents/mlagents/trainers/bc/trainer.py
index 9b2c1553cc..e67d951b9b 100644
--- a/ml-agents/mlagents/trainers/bc/trainer.py
+++ b/ml-agents/mlagents/trainers/bc/trainer.py
@@ -67,12 +67,13 @@ def get_step(self):
         """
         return self.policy.get_current_step()

-    def increment_step(self):
+    def increment_step(self, n_steps: int) -> None:
         """
         Increment the step count of the trainer
+
+        :param n_steps: number of steps to increment the step count by
         """
-        self.policy.increment_step()
-        return
+        self.step = self.policy.increment_step(n_steps)

     def add_experiences(
         self,
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index 1fcfbbc710..6303e3a6c0 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -139,3 +139,23 @@ def create_buffer(brain_infos, brain_params, sequence_length):
     buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)

     return buffer
+
+
+def create_mock_3dball_brain():
+    mock_brain = create_mock_brainparams(
+        vector_action_space_type="continuous",
+        vector_action_space_size=[2],
+        vector_observation_space_size=8,
+    )
+    mock_brain.brain_name = "Ball3DBrain"
+    return mock_brain
+
+
+def create_mock_banana_brain():
+    mock_brain = create_mock_brainparams(
+        number_visual_observations=1,
+        vector_action_space_type="discrete",
+        vector_action_space_size=[3, 3, 3, 2],
+        vector_observation_space_size=0,
+    )
+    return mock_brain
diff --git a/ml-agents/mlagents/trainers/tests/test_bc.py b/ml-agents/mlagents/trainers/tests/test_bc.py
index 800ab4a23f..3d162dbcee 100644
--- a/ml-agents/mlagents/trainers/tests/test_bc.py
+++ b/ml-agents/mlagents/trainers/tests/test_bc.py
@@ -1,12 +1,15 @@
 import unittest.mock as mock
 import pytest
+import os

 import numpy as np
 import tensorflow as tf
 import yaml

 from mlagents.trainers.bc.models import BehavioralCloningModel
+import mlagents.trainers.tests.mock_brain as mb
 from mlagents.trainers.bc.policy import BCPolicy
+from mlagents.trainers.bc.offline_trainer import BCTrainer
 from mlagents.envs import UnityEnvironment
 from mlagents.envs.mock_communicator import MockCommunicator

@@ -21,10 +24,37 @@ def dummy_config():
         use_recurrent: false
         sequence_length: 32
         memory_size: 32
+        batches_per_epoch: 1
+        batch_size: 32
+        summary_freq: 2000
+        max_steps: 4000
         """
     )


+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_bc_trainer(mock_env, dummy_config):
+    mock_brain = mb.create_mock_3dball_brain()
+    mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
+    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
+    env = mock_env()
+
+    trainer_parameters = dummy_config
+    trainer_parameters["summary_path"] = "tmp"
+    trainer_parameters["model_path"] = "tmp"
+    trainer_parameters["demo_path"] = (
+        os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
+    )
+    trainer = BCTrainer(
+        mock_brain, trainer_parameters, training=True, load=False, seed=0, run_id=0
+    )
+    trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy, 100)
+    trainer.update_policy()
+    assert len(trainer.stats["Losses/Cloning Loss"]) > 0
+    trainer.increment_step(1)
+    assert trainer.step == 1
+
+
 @mock.patch("mlagents.envs.UnityEnvironment.executable_launcher")
 @mock.patch("mlagents.envs.UnityEnvironment.get_communicator")
 def test_bc_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
diff --git a/ml-agents/mlagents/trainers/tests/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/test_bcmodule.py
index 0eee0f4d2e..d57741448b 100644
--- a/ml-agents/mlagents/trainers/tests/test_bcmodule.py
+++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py
@@ -42,25 +42,6 @@ def dummy_config():
     )


-def create_mock_3dball_brain():
-    mock_brain = mb.create_mock_brainparams(
-        vector_action_space_type="continuous",
-        vector_action_space_size=[2],
-        vector_observation_space_size=8,
-    )
-    return mock_brain
-
-
-def create_mock_banana_brain():
-    mock_brain = mb.create_mock_brainparams(
-        number_visual_observations=1,
-        vector_action_space_type="discrete",
-        vector_action_space_size=[3, 3, 3, 2],
-        vector_observation_space_size=0,
-    )
-    return mock_brain
-
-
 def create_ppo_policy_with_bc_mock(
     mock_env, mock_brain, dummy_config, use_rnn, demo_file
 ):
@@ -84,7 +65,7 @@ def create_ppo_policy_with_bc_mock(
 @mock.patch("mlagents.envs.UnityEnvironment")
 def test_bcmodule_defaults(mock_env, dummy_config):
     # See if default values match
-    mock_brain = create_mock_3dball_brain()
+    mock_brain = mb.create_mock_3dball_brain()
     env, policy = create_ppo_policy_with_bc_mock(
         mock_env, mock_brain, dummy_config, False, "test.demo"
     )
@@ -105,7 +86,7 @@ def test_bcmodule_defaults(mock_env, dummy_config):
 # Test with continuous control env and vector actions
 @mock.patch("mlagents.envs.UnityEnvironment")
 def test_bcmodule_update(mock_env, dummy_config):
-    mock_brain = create_mock_3dball_brain()
+    mock_brain = mb.create_mock_3dball_brain()
     env, policy = create_ppo_policy_with_bc_mock(
         mock_env, mock_brain, dummy_config, False, "test.demo"
     )
@@ -118,7 +99,7 @@ def test_bcmodule_update(mock_env, dummy_config):
 # Test with RNN
 @mock.patch("mlagents.envs.UnityEnvironment")
 def test_bcmodule_rnn_update(mock_env, dummy_config):
-    mock_brain = create_mock_3dball_brain()
+    mock_brain = mb.create_mock_3dball_brain()
     env, policy = create_ppo_policy_with_bc_mock(
         mock_env, mock_brain, dummy_config, True, "test.demo"
     )
@@ -131,7 +112,7 @@ def test_bcmodule_rnn_update(mock_env, dummy_config):
 # Test with discrete control and visual observations
 @mock.patch("mlagents.envs.UnityEnvironment")
 def test_bcmodule_dc_visual_update(mock_env, dummy_config):
-    mock_brain = create_mock_banana_brain()
+    mock_brain = mb.create_mock_banana_brain()
     env, policy = create_ppo_policy_with_bc_mock(
         mock_env, mock_brain, dummy_config, False, "testdcvis.demo"
     )
@@ -144,7 +125,7 @@ def test_bcmodule_dc_visual_update(mock_env, dummy_config):
 # Test with discrete control, visual observations and RNN
 @mock.patch("mlagents.envs.UnityEnvironment")
 def test_bcmodule_rnn_dc_update(mock_env, dummy_config):
-    mock_brain = create_mock_banana_brain()
+    mock_brain = mb.create_mock_banana_brain()
     env, policy = create_ppo_policy_with_bc_mock(
         mock_env, mock_brain, dummy_config, True, "testdcvis.demo"
     )

From a84f2267153966ab0e56e1c708f059200d47a0df Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Mon, 5 Aug 2019 17:01:16 -0700
Subject: [PATCH 2/8] Increment package ver. Only trainers were changed.
--- ml-agents/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/setup.py b/ml-agents/setup.py index c7486140ef..b30f8b4f02 100644 --- a/ml-agents/setup.py +++ b/ml-agents/setup.py @@ -10,7 +10,7 @@ setup( name="mlagents", - version="0.9.0", + version="0.9.0a", description="Unity Machine Learning Agents", long_description=long_description, long_description_content_type="text/markdown", From b243314213534eda124607a11c4fe008760a9fa1 Mon Sep 17 00:00:00 2001 From: Ervin T Date: Tue, 6 Aug 2019 17:08:24 -0700 Subject: [PATCH 3/8] Fix issue with visual obs destroyed too early (#2400) --- UnitySDK/Assets/ML-Agents/Scripts/Agent.cs | 83 +++++++++++--------- UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs | 19 +++-- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs index 85f74747ea..dec43fb406 100755 --- a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs @@ -8,7 +8,7 @@ namespace MLAgents { /// - /// Struct that contains all the information for an Agent, including its + /// Struct that contains all the information for an Agent, including its /// observations, actions and current status, that is sent to the Brain. /// public struct AgentInfo @@ -120,15 +120,26 @@ public CommunicatorObjects.AgentInfoProto ToProto() agentInfoProto.VisualObservations.Add( ByteString.CopyFrom(obs.EncodeToPNG()) ); + } + return agentInfoProto; + } + + /// + /// Remove the visual observations from memory. Call at each timestep + /// to avoid memory leaks. + /// + public void ClearVisualObs() + { + foreach (Texture2D obs in visualObservations) + { Object.Destroy(obs); } visualObservations.Clear(); - return agentInfoProto; } } /// - /// Struct that contains the action information sent from the Brain to the + /// Struct that contains the action information sent from the Brain to the /// Agent. /// public struct AgentAction @@ -141,7 +152,7 @@ public struct AgentAction } /// - /// Struct that contains all the Agent-specific parameters provided in the + /// Struct that contains all the Agent-specific parameters provided in the /// Editor. This excludes the Brain linked to the Agent since it can be /// modified programmatically. /// @@ -153,7 +164,7 @@ public class AgentParameters /// observations. /// public List agentCameras = new List(); - + /// /// The list of the RenderTextures the agent uses for visual /// observations. @@ -162,7 +173,7 @@ public class AgentParameters /// - /// The maximum number of steps the agent takes before being done. + /// The maximum number of steps the agent takes before being done. /// /// /// If set to 0, the agent can only be set to done programmatically (or @@ -184,7 +195,7 @@ public class AgentParameters public bool resetOnDone = true; /// - /// Whether to enable On Demand Decisions or make a decision at + /// Whether to enable On Demand Decisions or make a decision at /// every step. /// public bool onDemandDecision; @@ -199,8 +210,8 @@ public class AgentParameters /// /// Agent Monobehavior class that is attached to a Unity GameObject, making it - /// an Agent. An agent produces observations and takes actions in the - /// environment. Observations are determined by the cameras attached + /// an Agent. An agent produces observations and takes actions in the + /// environment. Observations are determined by the cameras attached /// to the agent in addition to the vector observations implemented by the /// user in . 
On the other hand, actions /// are determined by decisions produced by a linked Brain. Currently, this @@ -213,23 +224,23 @@ public class AgentParameters /// however, an agent need not send its observation at every step since very /// little may have changed between sucessive steps. Currently, how often an /// agent updates its brain with a fresh observation is determined by the - /// Academy. - /// - /// At any step, an agent may be considered . + /// Academy. + /// + /// At any step, an agent may be considered . /// This could occur due to a variety of reasons: /// - The agent reached an end state within its environment. /// - The agent reached the maximum # of steps (i.e. timed out). /// - The academy reached the maximum # of steps (forced agent to be done). - /// + /// /// Here, an agent reaches an end state if it completes its task successfully /// or somehow fails along the way. In the case where an agent is done before /// the academy, it either resets and restarts, or just lingers until the /// academy is done. - /// + /// /// An important note regarding steps and episodes is due. Here, an agent step /// corresponds to an academy step, which also corresponds to Unity /// environment step (i.e. each FixedUpdate call). This is not the case for - /// episodes. The academy controls the global episode count and each agent + /// episodes. The academy controls the global episode count and each agent /// controls its own local episode count and can reset and start a new local /// episode independently (based on its own experience). Thus an academy /// (global) episode can be viewed as the upper-bound on an agents episode @@ -237,10 +248,10 @@ public class AgentParameters /// multiple local episodes. Consequently, if an agent max step is /// set to a value larger than the academy max steps value, then the academy /// value takes precedence (since the agent max step will never be reached). - /// + /// /// Lastly, note that at any step the brain linked to the agent is allowed to /// change programmatically with . - /// + /// /// Implementation-wise, it is required that this class is extended and the /// virtual methods overridden. For sample implementations of agent behavior, /// see the Examples/ directory within this Unity project. @@ -252,7 +263,7 @@ public abstract class Agent : MonoBehaviour { /// /// The Brain attached to this agent. A brain can be attached either - /// directly from the Editor through AgentEditor or + /// directly from the Editor through AgentEditor or /// programmatically through . It is OK for an agent /// to not have a brain, as long as no decision is requested. /// @@ -523,7 +534,7 @@ void ResetData() actionMasker = new ActionMasker(param); // If we haven't initialized vectorActions, initialize to 0. This should only // happen during the creation of the Agent. In subsequent episodes, vectorAction - // should stay the previous action before the Done(), so that it is properly recorded. + // should stay the previous action before the Done(), so that it is properly recorded. 
if (action.vectorActions == null) { if (param.vectorActionSpaceType == SpaceType.continuous) @@ -598,9 +609,9 @@ void SendInfoToBrain() brain.brainParameters.vectorObservationSize, info.vectorObservation.Count)); } - + Utilities.ShiftLeft(info.stackedVectorObservation, param.vectorObservationSize); - Utilities.ReplaceRange(info.stackedVectorObservation, info.vectorObservation, + Utilities.ReplaceRange(info.stackedVectorObservation, info.vectorObservation, info.stackedVectorObservation.Count - info.vectorObservation.Count); info.visualObservations.Clear(); @@ -624,7 +635,7 @@ void SendInfoToBrain() param.cameraResolutions[i].height); info.visualObservations.Add(obsTexture); } - + //Then add all renderTextures var camCount = agentParameters.agentCameras.Count; for (int i = 0; i < agentParameters.agentRenderTextures.Count; i++) @@ -653,13 +664,13 @@ void SendInfoToBrain() /// /// Collects the (vector, visual, text) observations of the agent. - /// The agent observation describes the current environment from the + /// The agent observation describes the current environment from the /// perspective of the agent. /// /// /// Simply, an agents observation is any environment information that helps /// the Agent acheive its goal. For example, for a fighting Agent, its - /// observation could include distances to friends or enemies, or the + /// observation could include distances to friends or enemies, or the /// current level of ammunition at its disposal. /// Recall that an Agent may attach vector, visual or textual observations. /// Vector observations are added by calling the provided helper methods: @@ -678,7 +689,7 @@ void SendInfoToBrain() /// needs to match the vectorObservationSize attribute of the linked Brain. /// Visual observations are implicitly added from the cameras attached to /// the Agent. - /// Lastly, textual observations are added using + /// Lastly, textual observations are added using /// . /// public virtual void CollectObservations() @@ -861,7 +872,7 @@ public virtual void AgentAction(float[] vectorAction, string textAction, Communi } /// - /// Specifies the agent behavior when done and + /// Specifies the agent behavior when done and /// is false. This method can be /// used to remove the agent from the scene. /// @@ -906,12 +917,12 @@ public void UpdateMemoriesAction(List memories) { action.memories = memories; } - + public void AppendMemoriesAction(List memories) { action.memories.AddRange(memories); } - + public List GetMemoriesAction() { return action.memories; @@ -966,9 +977,9 @@ protected float ScaleAction(float rawAction, float min, float max) /// /// Sets the status of the agent. /// - /// If set to true + /// If set to true /// The agent must set maxStepReached. - /// If set to true + /// If set to true /// The agent must set done. /// Number of current steps in episode void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter) @@ -984,7 +995,7 @@ void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter) maxStepReached = true; } - // If the Academy needs to reset, the agent should reset + // If the Academy needs to reset, the agent should reset // even if it reseted recently. if (academyDone) { @@ -996,7 +1007,7 @@ void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter) /// Signals the agent that it must reset if its done flag is set to true. 
void ResetIfDone() { - // If an agent is done, then it will also + // If an agent is done, then it will also // request for a decision and an action if (IsDone()) { @@ -1126,14 +1137,14 @@ public static Texture2D ObservationToTexture(Camera obsCamera, int width, int he obsCamera.Render(); texture2D.ReadPixels(new Rect(0, 0, texture2D.width, texture2D.height), 0, 0); - + obsCamera.targetTexture = prevCameraRT; obsCamera.rect = oldRec; RenderTexture.active = prevActiveRT; RenderTexture.ReleaseTemporary(tempRT); return texture2D; } - + /// /// Converts a RenderTexture and correspinding resolution to a 2D texture. /// @@ -1150,7 +1161,7 @@ public static Texture2D ObservationToTexture(RenderTexture obsTexture, int width { texture2D.Resize(width, height); } - + if(width != obsTexture.width || height != obsTexture.height) { throw new UnityAgentsException(string.Format( @@ -1175,5 +1186,5 @@ public void SetCustomObservation(CustomObservation customObservation) { info.customObservation = customObservation; } - } + } } diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs b/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs index a574b7f8f2..30f6495584 100644 --- a/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs +++ b/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs @@ -6,15 +6,15 @@ namespace MLAgents { /// - /// The batcher is an RL specific class that makes sure that the information each object in - /// Unity (Academy and Brains) wants to send to External is appropriately batched together + /// The batcher is an RL specific class that makes sure that the information each object in + /// Unity (Academy and Brains) wants to send to External is appropriately batched together /// and sent only when necessary. - /// + /// /// The Batcher will only send a Message to the Communicator when either : /// 1 - The academy is done /// 2 - At least one brain has data to send - /// - /// At each step, the batcher will keep track of the brains that queried the batcher for that + /// + /// At each step, the batcher will keep track of the brains that queried the batcher for that /// step. The batcher can only send the batched data when all the Brains have queried the /// Batcher. /// @@ -67,7 +67,7 @@ public Batcher(Communicator communicator) } /// - /// Sends the academy parameters through the Communicator. + /// Sends the academy parameters through the Communicator. /// Is used by the academy to send the AcademyParameters to the communicator. /// /// The External Initialization Parameters received. @@ -104,7 +104,7 @@ public CommunicatorObjects.UnityRLInitializationInput SendAcademyParameters( /// Registers the done flag of the academy to the next output to be sent /// to the communicator. /// - /// If set to true + /// If set to true /// The academy done state will be sent to External at the next Exchange. public void RegisterAcademyDoneFlag(bool done) { @@ -164,7 +164,7 @@ public void SubscribeBrain(string brainKey) /// /// Sends the brain info. If at least one brain has an agent in need of - /// a decision or if the academy is done, the data is sent via + /// a decision or if the academy is done, the data is sent via /// Communicator. Else, a new step is realized. The data can only be /// sent once all the brains that subscribed to the batcher have tried /// to send information. @@ -198,6 +198,9 @@ public void SendBrainInfo( { CommunicatorObjects.AgentInfoProto agentInfoProto = agentInfo[agent].ToProto(); m_currentUnityRLOutput.AgentInfos[brainKey].Value.Add(agentInfoProto); + // Avoid visual obs memory leak. 
This should be called AFTER we are done with the visual obs. + // e.g. after recording them to demo and using them for inference. + agentInfo[agent].ClearVisualObs(); } m_hasData[brainKey] = true; From 2a76490f92120e91d5aeb176de7c6a47666973bb Mon Sep 17 00:00:00 2001 From: sankalp04 <30798796+sankalp04@users.noreply.github.com> Date: Thu, 8 Aug 2019 14:08:31 -0700 Subject: [PATCH 4/8] =?UTF-8?q?Change=20samplers=20to=20use=20random=20sta?= =?UTF-8?q?te=20to=20allow=20consistency=20in=20reset=20par=E2=80=A6=20(#2?= =?UTF-8?q?398)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Change samplers to use random state to allow consistency in reset parameter draws for a specified seed --- ml-agents-envs/mlagents/envs/sampler_class.py | 86 ++++++++++++++++--- ml-agents/mlagents/trainers/learn.py | 6 +- 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/ml-agents-envs/mlagents/envs/sampler_class.py b/ml-agents-envs/mlagents/envs/sampler_class.py index cfff08253d..2fbd83d6fe 100644 --- a/ml-agents-envs/mlagents/envs/sampler_class.py +++ b/ml-agents-envs/mlagents/envs/sampler_class.py @@ -19,13 +19,27 @@ class UniformSampler(Sampler): """ def __init__( - self, min_value: Union[int, float], max_value: Union[int, float], **kwargs + self, + min_value: Union[int, float], + max_value: Union[int, float], + seed: Optional[int] = None, + **kwargs ) -> None: + """ + :param min_value: minimum value of the range to be sampled uniformly from + :param max_value: maximum value of the range to be sampled uniformly from + :param seed: Random seed used for making draws from the uniform sampler + """ self.min_value = min_value self.max_value = max_value + # Draw from random state to allow for consistent reset parameter draw for a seed + self.random_state = np.random.RandomState(seed) def sample_parameter(self) -> float: - return np.random.uniform(self.min_value, self.max_value) + """ + Draws and returns a sample from the specified interval + """ + return self.random_state.uniform(self.min_value, self.max_value) class MultiRangeUniformSampler(Sampler): @@ -36,19 +50,33 @@ class MultiRangeUniformSampler(Sampler): it proceeds to pick a value uniformly in that range. 
""" - def __init__(self, intervals: List[List[Union[int, float]]], **kwargs) -> None: + def __init__( + self, + intervals: List[List[Union[int, float]]], + seed: Optional[int] = None, + **kwargs + ) -> None: + """ + :param intervals: List of intervals to draw uniform samples from + :param seed: Random seed used for making uniform draws from the specified intervals + """ self.intervals = intervals # Measure the length of the intervals interval_lengths = [abs(x[1] - x[0]) for x in self.intervals] cum_interval_length = sum(interval_lengths) # Assign weights to an interval proportionate to the interval size self.interval_weights = [x / cum_interval_length for x in interval_lengths] + # Draw from random state to allow for consistent reset parameter draw for a seed + self.random_state = np.random.RandomState(seed) def sample_parameter(self) -> float: + """ + Selects an interval to pick and then draws a uniform sample from the picked interval + """ cur_min, cur_max = self.intervals[ - np.random.choice(len(self.intervals), p=self.interval_weights) + self.random_state.choice(len(self.intervals), p=self.interval_weights) ] - return np.random.uniform(cur_min, cur_max) + return self.random_state.uniform(cur_min, cur_max) class GaussianSampler(Sampler): @@ -58,13 +86,27 @@ class GaussianSampler(Sampler): """ def __init__( - self, mean: Union[float, int], st_dev: Union[float, int], **kwargs + self, + mean: Union[float, int], + st_dev: Union[float, int], + seed: Optional[int] = None, + **kwargs ) -> None: + """ + :param mean: Specifies the mean of the gaussian distribution to draw from + :param st_dev: Specifies the standard devation of the gaussian distribution to draw from + :param seed: Random seed used for making gaussian draws from the sample + """ self.mean = mean self.st_dev = st_dev + # Draw from random state to allow for consistent reset parameter draw for a seed + self.random_state = np.random.RandomState(seed) def sample_parameter(self) -> float: - return np.random.normal(self.mean, self.st_dev) + """ + Returns a draw from the specified Gaussian distribution + """ + return self.random_state.normal(self.mean, self.st_dev) class SamplerFactory: @@ -81,10 +123,23 @@ class SamplerFactory: @staticmethod def register_sampler(name: str, sampler_cls: Type[Sampler]) -> None: + """ + Registers the sampe in the Sampler Factory to be used later + :param name: String name to set as key for the sampler_cls in the factory + :param sampler_cls: Sampler object to associate to the name in the factory + """ SamplerFactory.NAME_TO_CLASS[name] = sampler_cls @staticmethod - def init_sampler_class(name: str, params: Dict[str, Any]): + def init_sampler_class( + name: str, params: Dict[str, Any], seed: Optional[int] = None + ) -> Sampler: + """ + Initializes the sampler class associated with the name with the params + :param name: Name of the sampler in the factory to initialize + :param params: Parameters associated to the sampler attached to the name + :param seed: Random seed to be used to set deterministic random draws for the sampler + """ if name not in SamplerFactory.NAME_TO_CLASS: raise SamplerException( name + " sampler is not registered in the SamplerFactory." @@ -92,6 +147,7 @@ def init_sampler_class(name: str, params: Dict[str, Any]): " associated to your sampler in the SamplerFactory." 
) sampler_cls = SamplerFactory.NAME_TO_CLASS[name] + params["seed"] = seed try: return sampler_cls(**params) except TypeError: @@ -103,7 +159,13 @@ def init_sampler_class(name: str, params: Dict[str, Any]): class SamplerManager: - def __init__(self, reset_param_dict: Dict[str, Any]) -> None: + def __init__( + self, reset_param_dict: Dict[str, Any], seed: Optional[int] = None + ) -> None: + """ + :param reset_param_dict: Arguments needed for initializing the samplers + :param seed: Random seed to be used for drawing samples from the samplers + """ self.reset_param_dict = reset_param_dict if reset_param_dict else {} assert isinstance(self.reset_param_dict, dict) self.samplers: Dict[str, Sampler] = {} @@ -116,7 +178,7 @@ def __init__(self, reset_param_dict: Dict[str, Any]) -> None: ) sampler_name = cur_param_dict.pop("sampler-type") param_sampler = SamplerFactory.init_sampler_class( - sampler_name, cur_param_dict + sampler_name, cur_param_dict, seed ) self.samplers[param_name] = param_sampler @@ -128,6 +190,10 @@ def is_empty(self) -> bool: return not bool(self.samplers) def sample_all(self) -> Dict[str, float]: + """ + Loop over all samplers and draw a sample from each one for generating + next set of reset parameter values. + """ res = {} for param_name, param_sampler in list(self.samplers.items()): res[param_name] = param_sampler.sample_parameter() diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index c1ca879107..03398c7788 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -91,7 +91,7 @@ def run_training( env = SubprocessEnvManager(env_factory, num_envs) maybe_meta_curriculum = try_create_meta_curriculum(curriculum_folder, env) sampler_manager, resampling_interval = create_sampler_manager( - sampler_file_path, env.reset_parameters + sampler_file_path, env.reset_parameters, run_seed ) # Create controller and begin training. @@ -118,7 +118,7 @@ def run_training( tc.start_learning(env, trainer_config) -def create_sampler_manager(sampler_file_path, env_reset_params): +def create_sampler_manager(sampler_file_path, env_reset_params, run_seed=None): sampler_config = None resample_interval = None if sampler_file_path is not None: @@ -136,7 +136,7 @@ def create_sampler_manager(sampler_file_path, env_reset_params): "Resampling interval was not specified in the sampler file." " Please specify it with the 'resampling-interval' key in the sampler config file." ) - sampler_manager = SamplerManager(sampler_config) + sampler_manager = SamplerManager(sampler_config, run_seed) return sampler_manager, resample_interval From 8f6d0f824551dae57cd7d05c72862f658ef2a96c Mon Sep 17 00:00:00 2001 From: Ervin T Date: Fri, 9 Aug 2019 15:03:28 -0700 Subject: [PATCH 5/8] Fix naming conflict between Curiosity and GAIL (#2406) --- .../reward_signals/curiosity/model.py | 12 +++++------ .../components/reward_signals/gail/model.py | 20 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py index fd60b6306f..009e2663df 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py @@ -42,7 +42,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: # Create input ops for next (t+1) visual observations. 
next_visual_input = LearningModel.create_visual_input( self.policy_model.brain.camera_resolutions[i], - name="next_visual_observation_" + str(i), + name="curiosity_next_visual_observation_" + str(i), ) self.next_visual_in.append(next_visual_input) @@ -53,7 +53,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: self.encoding_size, LearningModel.swish, 1, - "stream_{}_visual_obs_encoder".format(i), + "curiosity_stream_{}_visual_obs_encoder".format(i), False, ) @@ -62,7 +62,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: self.encoding_size, LearningModel.swish, 1, - "stream_{}_visual_obs_encoder".format(i), + "curiosity_stream_{}_visual_obs_encoder".format(i), True, ) visual_encoders.append(encoded_visual) @@ -80,7 +80,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: self.next_vector_in = tf.placeholder( shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32, - name="next_vector_observation", + name="curiosity_next_vector_observation", ) encoded_vector_obs = self.policy_model.create_vector_observation_encoder( @@ -88,7 +88,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: self.encoding_size, LearningModel.swish, 2, - "vector_obs_encoder", + "curiosity_vector_obs_encoder", False, ) encoded_next_vector_obs = self.policy_model.create_vector_observation_encoder( @@ -96,7 +96,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: self.encoding_size, LearningModel.swish, 2, - "vector_obs_encoder", + "curiosity_vector_obs_encoder", True, ) encoded_state_list.append(encoded_vector_obs) diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py index e26d969da3..c652d3aeac 100644 --- a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py +++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py @@ -112,7 +112,7 @@ def make_inputs(self) -> None: # Create input ops for next (t+1) visual observations. 
visual_input = self.policy_model.create_visual_input( self.policy_model.brain.camera_resolutions[i], - name="visual_observation_" + str(i), + name="gail_visual_observation_" + str(i), ) self.expert_visual_in.append(visual_input) @@ -121,7 +121,7 @@ def make_inputs(self) -> None: self.encoding_size, LearningModel.swish, 1, - "stream_{}_visual_obs_encoder".format(i), + "gail_stream_{}_visual_obs_encoder".format(i), False, ) @@ -130,7 +130,7 @@ def make_inputs(self) -> None: self.encoding_size, LearningModel.swish, 1, - "stream_{}_visual_obs_encoder".format(i), + "gail_stream_{}_visual_obs_encoder".format(i), True, ) visual_policy_encoders.append(encoded_policy_visual) @@ -163,7 +163,7 @@ def create_encoder( concat_input, self.h_size, activation=LearningModel.swish, - name="d_hidden_1", + name="gail_d_hidden_1", reuse=reuse, ) @@ -171,7 +171,7 @@ def create_encoder( hidden_1, self.h_size, activation=LearningModel.swish, - name="d_hidden_2", + name="gail_d_hidden_2", reuse=reuse, ) @@ -182,7 +182,7 @@ def create_encoder( hidden_2, self.z_size, reuse=reuse, - name="z_mean", + name="gail_z_mean", kernel_initializer=LearningModel.scaled_init(0.01), ) @@ -198,7 +198,7 @@ def create_encoder( estimate_input, 1, activation=tf.nn.sigmoid, - name="d_estimate", + name="gail_d_estimate", reuse=reuse, ) return estimate, z_mean, concat_input @@ -209,7 +209,7 @@ def create_network(self) -> None: """ if self.use_vail: self.z_sigma = tf.get_variable( - "sigma_vail", + "gail_sigma_vail", self.z_size, dtype=tf.float32, initializer=tf.ones_initializer(), @@ -217,7 +217,7 @@ def create_network(self) -> None: self.z_sigma_sq = self.z_sigma * self.z_sigma self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON) self.use_noise = tf.placeholder( - shape=[1], dtype=tf.float32, name="NoiseLevel" + shape=[1], dtype=tf.float32, name="gail_NoiseLevel" ) self.expert_estimate, self.z_mean_expert, _ = self.create_encoder( self.encoded_expert, self.expert_action, self.done_expert, reuse=False @@ -229,7 +229,7 @@ def create_network(self) -> None: reuse=True, ) self.discriminator_score = tf.reshape( - self.policy_estimate, [-1], name="GAIL_reward" + self.policy_estimate, [-1], name="gail_reward" ) self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON) From de728e57c785a056823102e425c80f4bbfd6fc51 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 9 Aug 2019 17:03:40 -0700 Subject: [PATCH 6/8] Tick version to 0.9.0a --- gym-unity/setup.py | 4 ++-- ml-agents-envs/setup.py | 2 +- ml-agents/setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gym-unity/setup.py b/gym-unity/setup.py index f5ccf499c3..5f64c6ea76 100755 --- a/gym-unity/setup.py +++ b/gym-unity/setup.py @@ -4,12 +4,12 @@ setup( name="gym_unity", - version="0.4.3", + version="0.4.3a", description="Unity Machine Learning Agents Gym Interface", license="Apache License 2.0", author="Unity Technologies", author_email="ML-Agents@unity3d.com", url="https://github.com/Unity-Technologies/ml-agents", packages=find_packages(), - install_requires=["gym", "mlagents_envs==0.9.0"], + install_requires=["gym", "mlagents_envs==0.9.0a"], ) diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py index 6edacf1721..9c440ebbcb 100644 --- a/ml-agents-envs/setup.py +++ b/ml-agents-envs/setup.py @@ -5,7 +5,7 @@ setup( name="mlagents_envs", - version="0.9.0", + version="0.9.0a", description="Unity Machine Learning Agents Interface", url="https://github.com/Unity-Technologies/ml-agents", author="Unity Technologies", diff --git 
a/ml-agents/setup.py b/ml-agents/setup.py index b30f8b4f02..910c52c630 100644 --- a/ml-agents/setup.py +++ b/ml-agents/setup.py @@ -29,7 +29,7 @@ ), zip_safe=False, install_requires=[ - "mlagents_envs==0.9.0", + "mlagents_envs==0.9.0a", "tensorflow>=1.7,<1.8", "Pillow>=4.2.1", "matplotlib", From 518c5eba7eeb35a52774162b94fd19fb7ba4fdb5 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 9 Aug 2019 17:38:04 -0700 Subject: [PATCH 7/8] Re-tick version for pypi --- gym-unity/setup.py | 4 ++-- ml-agents-envs/setup.py | 2 +- ml-agents/setup.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gym-unity/setup.py b/gym-unity/setup.py index 5f64c6ea76..64cb242bd6 100755 --- a/gym-unity/setup.py +++ b/gym-unity/setup.py @@ -4,12 +4,12 @@ setup( name="gym_unity", - version="0.4.3a", + version="0.4.4", description="Unity Machine Learning Agents Gym Interface", license="Apache License 2.0", author="Unity Technologies", author_email="ML-Agents@unity3d.com", url="https://github.com/Unity-Technologies/ml-agents", packages=find_packages(), - install_requires=["gym", "mlagents_envs==0.9.0a"], + install_requires=["gym", "mlagents_envs==0.9.1"], ) diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py index 9c440ebbcb..06d8131546 100644 --- a/ml-agents-envs/setup.py +++ b/ml-agents-envs/setup.py @@ -5,7 +5,7 @@ setup( name="mlagents_envs", - version="0.9.0a", + version="0.9.1", description="Unity Machine Learning Agents Interface", url="https://github.com/Unity-Technologies/ml-agents", author="Unity Technologies", diff --git a/ml-agents/setup.py b/ml-agents/setup.py index 910c52c630..17174ebe38 100644 --- a/ml-agents/setup.py +++ b/ml-agents/setup.py @@ -10,7 +10,7 @@ setup( name="mlagents", - version="0.9.0a", + version="0.9.1", description="Unity Machine Learning Agents", long_description=long_description, long_description_content_type="text/markdown", @@ -29,7 +29,7 @@ ), zip_safe=False, install_requires=[ - "mlagents_envs==0.9.0a", + "mlagents_envs==0.9.1", "tensorflow>=1.7,<1.8", "Pillow>=4.2.1", "matplotlib", From 807e26033068da8729a55e30c81b3439e107193a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 9 Aug 2019 18:01:12 -0700 Subject: [PATCH 8/8] Make sure all tests pass on BC --- ml-agents/mlagents/trainers/bc/policy.py | 16 +++------------- ml-agents/mlagents/trainers/bc/trainer.py | 2 +- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/ml-agents/mlagents/trainers/bc/policy.py b/ml-agents/mlagents/trainers/bc/policy.py index 10faf7fab9..2c31a19dec 100644 --- a/ml-agents/mlagents/trainers/bc/policy.py +++ b/ml-agents/mlagents/trainers/bc/policy.py @@ -79,24 +79,14 @@ def update(self, mini_batch, num_sequences): self.model.sequence_length: self.sequence_length, } if self.use_continuous_act: - feed_dict[self.model.true_action] = mini_batch["actions"].reshape( - [-1, self.brain.vector_action_space_size[0]] - ) + feed_dict[self.model.true_action] = mini_batch["actions"] else: - feed_dict[self.model.true_action] = mini_batch["actions"].reshape( - [-1, len(self.brain.vector_action_space_size)] - ) + feed_dict[self.model.true_action] = mini_batch["actions"] feed_dict[self.model.action_masks] = np.ones( (num_sequences, sum(self.brain.vector_action_space_size)) ) if self.use_vec_obs: - apparent_obs_size = ( - self.brain.vector_observation_space_size - * self.brain.num_stacked_vector_observations - ) - feed_dict[self.model.vector_in] = mini_batch["vector_obs"].reshape( - [-1, apparent_obs_size] - ) + feed_dict[self.model.vector_in] = mini_batch["vector_obs"] for 
i, _ in enumerate(self.model.visual_in):
             visual_obs = mini_batch["visual_obs%d" % i]
             feed_dict[self.model.visual_in[i]] = visual_obs
diff --git a/ml-agents/mlagents/trainers/bc/trainer.py b/ml-agents/mlagents/trainers/bc/trainer.py
index 59da9fa34f..7f53bcacf4 100644
--- a/ml-agents/mlagents/trainers/bc/trainer.py
+++ b/ml-agents/mlagents/trainers/bc/trainer.py
@@ -124,7 +124,7 @@ def update_policy(self):
         """
         Updates the policy.
         """
-        self.demonstration_buffer.update_buffer.shuffle()
+        self.demonstration_buffer.update_buffer.shuffle(self.policy.sequence_length)
         batch_losses = []
         num_batches = min(
             len(self.demonstration_buffer.update_buffer["actions"]) // self.n_sequences,