diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
index 85f74747ea..dec43fb406 100755
--- a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
@@ -8,7 +8,7 @@
namespace MLAgents
{
///
- /// Struct that contains all the information for an Agent, including its
+ /// Struct that contains all the information for an Agent, including its
/// observations, actions and current status, that is sent to the Brain.
///
public struct AgentInfo
@@ -120,15 +120,26 @@ public CommunicatorObjects.AgentInfoProto ToProto()
agentInfoProto.VisualObservations.Add(
ByteString.CopyFrom(obs.EncodeToPNG())
);
+ }
+ return agentInfoProto;
+ }
+
+ ///
+ /// Remove the visual observations from memory. Call at each timestep
+ /// to avoid memory leaks.
+ ///
+ public void ClearVisualObs()
+ {
+ foreach (Texture2D obs in visualObservations)
+ {
Object.Destroy(obs);
}
visualObservations.Clear();
- return agentInfoProto;
}
}
///
- /// Struct that contains the action information sent from the Brain to the
+ /// Struct that contains the action information sent from the Brain to the
/// Agent.
///
public struct AgentAction
@@ -141,7 +152,7 @@ public struct AgentAction
}
///
- /// Struct that contains all the Agent-specific parameters provided in the
+ /// Struct that contains all the Agent-specific parameters provided in the
/// Editor. This excludes the Brain linked to the Agent since it can be
/// modified programmatically.
///
@@ -153,7 +164,7 @@ public class AgentParameters
/// observations.
///
public List<Camera> agentCameras = new List<Camera>();
-
+
///
/// The list of the RenderTextures the agent uses for visual
/// observations.
@@ -162,7 +173,7 @@ public class AgentParameters
///
- /// The maximum number of steps the agent takes before being done.
+ /// The maximum number of steps the agent takes before being done.
///
///
/// If set to 0, the agent can only be set to done programmatically (or
@@ -184,7 +195,7 @@ public class AgentParameters
public bool resetOnDone = true;
///
- /// Whether to enable On Demand Decisions or make a decision at
+ /// Whether to enable On Demand Decisions or make a decision at
/// every step.
///
public bool onDemandDecision;
@@ -199,8 +210,8 @@ public class AgentParameters
///
/// Agent MonoBehaviour class that is attached to a Unity GameObject, making it
- /// an Agent. An agent produces observations and takes actions in the
- /// environment. Observations are determined by the cameras attached
+ /// an Agent. An agent produces observations and takes actions in the
+ /// environment. Observations are determined by the cameras attached
/// to the agent in addition to the vector observations implemented by the
/// user in . On the other hand, actions
/// are determined by decisions produced by a linked Brain. Currently, this
@@ -213,23 +224,23 @@ public class AgentParameters
/// however, an agent need not send its observation at every step since very
/// little may have changed between successive steps. Currently, how often an
/// agent updates its brain with a fresh observation is determined by the
- /// Academy.
- ///
- /// At any step, an agent may be considered .
+ /// Academy.
+ ///
+ /// At any step, an agent may be considered .
/// This could occur due to a variety of reasons:
/// - The agent reached an end state within its environment.
/// - The agent reached the maximum # of steps (i.e. timed out).
/// - The academy reached the maximum # of steps (forced agent to be done).
- ///
+ ///
/// Here, an agent reaches an end state if it completes its task successfully
/// or somehow fails along the way. In the case where an agent is done before
/// the academy, it either resets and restarts, or just lingers until the
/// academy is done.
- ///
+ ///
/// An important note regarding steps and episodes is due. Here, an agent step
/// corresponds to an academy step, which also corresponds to a Unity
/// environment step (i.e. each FixedUpdate call). This is not the case for
- /// episodes. The academy controls the global episode count and each agent
+ /// episodes. The academy controls the global episode count and each agent
/// controls its own local episode count and can reset and start a new local
/// episode independently (based on its own experience). Thus an academy
/// (global) episode can be viewed as the upper bound on an agent's episode
@@ -237,10 +248,10 @@ public class AgentParameters
/// multiple local episodes. Consequently, if an agent max step is
/// set to a value larger than the academy max steps value, then the academy
/// value takes precedence (since the agent max step will never be reached).
- ///
+ ///
/// Lastly, note that at any step the brain linked to the agent is allowed to
/// change programmatically with .
- ///
+ ///
/// Implementation-wise, it is required that this class is extended and the
/// virtual methods overridden. For sample implementations of agent behavior,
/// see the Examples/ directory within this Unity project.
@@ -252,7 +263,7 @@ public abstract class Agent : MonoBehaviour
{
///
/// The Brain attached to this agent. A brain can be attached either
- /// directly from the Editor through AgentEditor or
+ /// directly from the Editor through AgentEditor or
/// programmatically through . It is OK for an agent
/// to not have a brain, as long as no decision is requested.
///
@@ -523,7 +534,7 @@ void ResetData()
actionMasker = new ActionMasker(param);
// If we haven't initialized vectorActions, initialize to 0. This should only
// happen during the creation of the Agent. In subsequent episodes, vectorAction
- // should stay the previous action before the Done(), so that it is properly recorded.
+ // should stay the previous action before the Done(), so that it is properly recorded.
if (action.vectorActions == null)
{
if (param.vectorActionSpaceType == SpaceType.continuous)
@@ -598,9 +609,9 @@ void SendInfoToBrain()
brain.brainParameters.vectorObservationSize,
info.vectorObservation.Count));
}
-
+
Utilities.ShiftLeft(info.stackedVectorObservation, param.vectorObservationSize);
- Utilities.ReplaceRange(info.stackedVectorObservation, info.vectorObservation,
+ Utilities.ReplaceRange(info.stackedVectorObservation, info.vectorObservation,
info.stackedVectorObservation.Count - info.vectorObservation.Count);
info.visualObservations.Clear();
@@ -624,7 +635,7 @@ void SendInfoToBrain()
param.cameraResolutions[i].height);
info.visualObservations.Add(obsTexture);
}
-
+
//Then add all renderTextures
var camCount = agentParameters.agentCameras.Count;
for (int i = 0; i < agentParameters.agentRenderTextures.Count; i++)
@@ -653,13 +664,13 @@ void SendInfoToBrain()
///
/// Collects the (vector, visual, text) observations of the agent.
- /// The agent observation describes the current environment from the
+ /// The agent observation describes the current environment from the
/// perspective of the agent.
///
///
/// Simply, an agent's observation is any environment information that helps
/// the Agent achieve its goal. For example, for a fighting Agent, its
- /// observation could include distances to friends or enemies, or the
+ /// observation could include distances to friends or enemies, or the
/// current level of ammunition at its disposal.
/// Recall that an Agent may attach vector, visual or textual observations.
/// Vector observations are added by calling the provided helper methods:
@@ -678,7 +689,7 @@ void SendInfoToBrain()
/// needs to match the vectorObservationSize attribute of the linked Brain.
/// Visual observations are implicitly added from the cameras attached to
/// the Agent.
- /// Lastly, textual observations are added using
+ /// Lastly, textual observations are added using
/// .
///
public virtual void CollectObservations()
@@ -861,7 +872,7 @@ public virtual void AgentAction(float[] vectorAction, string textAction, Communi
}
///
- /// Specifies the agent behavior when done and
+ /// Specifies the agent behavior when done and
/// is false. This method can be
/// used to remove the agent from the scene.
///
@@ -906,12 +917,12 @@ public void UpdateMemoriesAction(List memories)
{
action.memories = memories;
}
-
+
public void AppendMemoriesAction(List<float> memories)
{
action.memories.AddRange(memories);
}
-
+
public List<float> GetMemoriesAction()
{
return action.memories;
@@ -966,9 +977,9 @@ protected float ScaleAction(float rawAction, float min, float max)
///
/// Sets the status of the agent.
///
- /// If set to true
+ /// If set to true
/// The agent must set maxStepReached.
- /// If set to true
+ /// If set to true
/// The agent must set done.
/// Number of current steps in episode
void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter)
@@ -984,7 +995,7 @@ void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter)
maxStepReached = true;
}
- // If the Academy needs to reset, the agent should reset
+ // If the Academy needs to reset, the agent should reset
// even if it reset recently.
if (academyDone)
{
@@ -996,7 +1007,7 @@ void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter)
/// Signals the agent that it must reset if its done flag is set to true.
void ResetIfDone()
{
- // If an agent is done, then it will also
+ // If an agent is done, then it will also
// request a decision and an action
if (IsDone())
{
@@ -1126,14 +1137,14 @@ public static Texture2D ObservationToTexture(Camera obsCamera, int width, int he
obsCamera.Render();
texture2D.ReadPixels(new Rect(0, 0, texture2D.width, texture2D.height), 0, 0);
-
+
obsCamera.targetTexture = prevCameraRT;
obsCamera.rect = oldRec;
RenderTexture.active = prevActiveRT;
RenderTexture.ReleaseTemporary(tempRT);
return texture2D;
}
-
+
///
/// Converts a RenderTexture and corresponding resolution to a 2D texture.
///
@@ -1150,7 +1161,7 @@ public static Texture2D ObservationToTexture(RenderTexture obsTexture, int width
{
texture2D.Resize(width, height);
}
-
+
if(width != obsTexture.width || height != obsTexture.height)
{
throw new UnityAgentsException(string.Format(
@@ -1175,5 +1186,5 @@ public void SetCustomObservation(CustomObservation customObservation)
{
info.customObservation = customObservation;
}
- }
+ }
}
diff --git a/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs b/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs
index a574b7f8f2..30f6495584 100644
--- a/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs
@@ -6,15 +6,15 @@
namespace MLAgents
{
///
- /// The batcher is an RL specific class that makes sure that the information each object in
- /// Unity (Academy and Brains) wants to send to External is appropriately batched together
+ /// The batcher is an RL specific class that makes sure that the information each object in
+ /// Unity (Academy and Brains) wants to send to External is appropriately batched together
/// and sent only when necessary.
- ///
+ ///
/// The Batcher will only send a Message to the Communicator when either :
/// 1 - The academy is done
/// 2 - At least one brain has data to send
- ///
- /// At each step, the batcher will keep track of the brains that queried the batcher for that
+ ///
+ /// At each step, the batcher will keep track of the brains that queried the batcher for that
/// step. The batcher can only send the batched data when all the Brains have queried the
/// Batcher.
///
@@ -67,7 +67,7 @@ public Batcher(Communicator communicator)
}
///
- /// Sends the academy parameters through the Communicator.
+ /// Sends the academy parameters through the Communicator.
/// Is used by the academy to send the AcademyParameters to the communicator.
///
/// The External Initialization Parameters received.
@@ -104,7 +104,7 @@ public CommunicatorObjects.UnityRLInitializationInput SendAcademyParameters(
/// Registers the done flag of the academy to the next output to be sent
/// to the communicator.
///
- /// If set to true
+ /// If set to true
/// The academy done state will be sent to External at the next Exchange.
public void RegisterAcademyDoneFlag(bool done)
{
@@ -164,7 +164,7 @@ public void SubscribeBrain(string brainKey)
///
/// Sends the brain info. If at least one brain has an agent in need of
- /// a decision or if the academy is done, the data is sent via
+ /// a decision or if the academy is done, the data is sent via
/// Communicator. Else, a new step is realized. The data can only be
/// sent once all the brains that subscribed to the batcher have tried
/// to send information.
@@ -198,6 +198,9 @@ public void SendBrainInfo(
{
CommunicatorObjects.AgentInfoProto agentInfoProto = agentInfo[agent].ToProto();
m_currentUnityRLOutput.AgentInfos[brainKey].Value.Add(agentInfoProto);
+ // Avoid visual obs memory leak. This should be called AFTER we are done with the visual obs.
+ // e.g. after recording them to demo and using them for inference.
+ agentInfo[agent].ClearVisualObs();
}
m_hasData[brainKey] = true;
diff --git a/gym-unity/setup.py b/gym-unity/setup.py
index f5ccf499c3..64cb242bd6 100755
--- a/gym-unity/setup.py
+++ b/gym-unity/setup.py
@@ -4,12 +4,12 @@
setup(
name="gym_unity",
- version="0.4.3",
+ version="0.4.4",
description="Unity Machine Learning Agents Gym Interface",
license="Apache License 2.0",
author="Unity Technologies",
author_email="ML-Agents@unity3d.com",
url="https://github.com/Unity-Technologies/ml-agents",
packages=find_packages(),
- install_requires=["gym", "mlagents_envs==0.9.0"],
+ install_requires=["gym", "mlagents_envs==0.9.1"],
)
diff --git a/ml-agents-envs/mlagents/envs/sampler_class.py b/ml-agents-envs/mlagents/envs/sampler_class.py
index cfff08253d..2fbd83d6fe 100644
--- a/ml-agents-envs/mlagents/envs/sampler_class.py
+++ b/ml-agents-envs/mlagents/envs/sampler_class.py
@@ -19,13 +19,27 @@ class UniformSampler(Sampler):
"""
def __init__(
- self, min_value: Union[int, float], max_value: Union[int, float], **kwargs
+ self,
+ min_value: Union[int, float],
+ max_value: Union[int, float],
+ seed: Optional[int] = None,
+ **kwargs
) -> None:
+ """
+ :param min_value: minimum value of the range to be sampled uniformly from
+ :param max_value: maximum value of the range to be sampled uniformly from
+ :param seed: Random seed used for making draws from the uniform sampler
+ """
self.min_value = min_value
self.max_value = max_value
+ # Draw from random state to allow for consistent reset parameter draw for a seed
+ self.random_state = np.random.RandomState(seed)
def sample_parameter(self) -> float:
- return np.random.uniform(self.min_value, self.max_value)
+ """
+ Draws and returns a sample from the specified interval
+ """
+ return self.random_state.uniform(self.min_value, self.max_value)
class MultiRangeUniformSampler(Sampler):
@@ -36,19 +50,33 @@ class MultiRangeUniformSampler(Sampler):
it proceeds to pick a value uniformly in that range.
"""
- def __init__(self, intervals: List[List[Union[int, float]]], **kwargs) -> None:
+ def __init__(
+ self,
+ intervals: List[List[Union[int, float]]],
+ seed: Optional[int] = None,
+ **kwargs
+ ) -> None:
+ """
+ :param intervals: List of intervals to draw uniform samples from
+ :param seed: Random seed used for making uniform draws from the specified intervals
+ """
self.intervals = intervals
# Measure the length of the intervals
interval_lengths = [abs(x[1] - x[0]) for x in self.intervals]
cum_interval_length = sum(interval_lengths)
# Assign weights to an interval proportionate to the interval size
self.interval_weights = [x / cum_interval_length for x in interval_lengths]
+ # Draw from random state to allow for consistent reset parameter draw for a seed
+ self.random_state = np.random.RandomState(seed)
def sample_parameter(self) -> float:
+ """
+ Selects an interval and then draws a uniform sample from it
+ """
cur_min, cur_max = self.intervals[
- np.random.choice(len(self.intervals), p=self.interval_weights)
+ self.random_state.choice(len(self.intervals), p=self.interval_weights)
]
- return np.random.uniform(cur_min, cur_max)
+ return self.random_state.uniform(cur_min, cur_max)
class GaussianSampler(Sampler):
@@ -58,13 +86,27 @@ class GaussianSampler(Sampler):
"""
def __init__(
- self, mean: Union[float, int], st_dev: Union[float, int], **kwargs
+ self,
+ mean: Union[float, int],
+ st_dev: Union[float, int],
+ seed: Optional[int] = None,
+ **kwargs
) -> None:
+ """
+ :param mean: Specifies the mean of the Gaussian distribution to draw from
+ :param st_dev: Specifies the standard deviation of the Gaussian distribution to draw from
+ :param seed: Random seed used for making draws from the Gaussian distribution
+ """
self.mean = mean
self.st_dev = st_dev
+ # Draw from random state to allow for consistent reset parameter draw for a seed
+ self.random_state = np.random.RandomState(seed)
def sample_parameter(self) -> float:
- return np.random.normal(self.mean, self.st_dev)
+ """
+ Returns a draw from the specified Gaussian distribution
+ """
+ return self.random_state.normal(self.mean, self.st_dev)
class SamplerFactory:
@@ -81,10 +123,23 @@ class SamplerFactory:
@staticmethod
def register_sampler(name: str, sampler_cls: Type[Sampler]) -> None:
+ """
+ Registers the sampler class in the SamplerFactory for later use
+ :param name: String name to set as key for the sampler_cls in the factory
+ :param sampler_cls: Sampler class to associate with the name in the factory
+ """
SamplerFactory.NAME_TO_CLASS[name] = sampler_cls
@staticmethod
- def init_sampler_class(name: str, params: Dict[str, Any]):
+ def init_sampler_class(
+ name: str, params: Dict[str, Any], seed: Optional[int] = None
+ ) -> Sampler:
+ """
+ Initializes the sampler class registered under the given name with the provided params
+ :param name: Name of the sampler in the factory to initialize
+ :param params: Parameters to pass to the sampler associated with the name
+ :param seed: Random seed to be used to set deterministic random draws for the sampler
+ """
if name not in SamplerFactory.NAME_TO_CLASS:
raise SamplerException(
name + " sampler is not registered in the SamplerFactory."
@@ -92,6 +147,7 @@ def init_sampler_class(name: str, params: Dict[str, Any]):
" associated to your sampler in the SamplerFactory."
)
sampler_cls = SamplerFactory.NAME_TO_CLASS[name]
+ params["seed"] = seed
try:
return sampler_cls(**params)
except TypeError:
@@ -103,7 +159,13 @@ def init_sampler_class(name: str, params: Dict[str, Any]):
class SamplerManager:
- def __init__(self, reset_param_dict: Dict[str, Any]) -> None:
+ def __init__(
+ self, reset_param_dict: Dict[str, Any], seed: Optional[int] = None
+ ) -> None:
+ """
+ :param reset_param_dict: Arguments needed for initializing the samplers
+ :param seed: Random seed to be used for drawing samples from the samplers
+ """
self.reset_param_dict = reset_param_dict if reset_param_dict else {}
assert isinstance(self.reset_param_dict, dict)
self.samplers: Dict[str, Sampler] = {}
@@ -116,7 +178,7 @@ def __init__(self, reset_param_dict: Dict[str, Any]) -> None:
)
sampler_name = cur_param_dict.pop("sampler-type")
param_sampler = SamplerFactory.init_sampler_class(
- sampler_name, cur_param_dict
+ sampler_name, cur_param_dict, seed
)
self.samplers[param_name] = param_sampler
@@ -128,6 +190,10 @@ def is_empty(self) -> bool:
return not bool(self.samplers)
def sample_all(self) -> Dict[str, float]:
+ """
+ Loops over all samplers and draws a sample from each one to generate the
+ next set of reset parameter values.
+ """
res = {}
for param_name, param_sampler in list(self.samplers.items()):
res[param_name] = param_sampler.sample_parameter()
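The samplers above now draw from a per-instance np.random.RandomState(seed) instead of the global NumPy state, so a fixed seed reproduces the same reset-parameter schedule. A minimal usage sketch under stated assumptions: the "mass" key and its bounds are invented for illustration, and "uniform" is assumed to be the name UniformSampler is registered under in SamplerFactory (the registry contents are not shown in this patch).

```python
import copy
from mlagents.envs.sampler_class import SamplerManager

# Illustrative reset-parameter config (hypothetical parameter name and bounds).
config = {"mass": {"sampler-type": "uniform", "min_value": 0.5, "max_value": 10.0}}

# SamplerManager pops "sampler-type" from each entry, so give each manager
# its own copy of the config.
manager_a = SamplerManager(copy.deepcopy(config), seed=42)
manager_b = SamplerManager(copy.deepcopy(config), seed=42)

# Same config + same seed => identical reset-parameter draws.
assert manager_a.sample_all() == manager_b.sample_all()
```

Passing seed=None keeps the previous behaviour, since RandomState(None) seeds itself from OS entropy.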
diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py
index 6edacf1721..06d8131546 100644
--- a/ml-agents-envs/setup.py
+++ b/ml-agents-envs/setup.py
@@ -5,7 +5,7 @@
setup(
name="mlagents_envs",
- version="0.9.0",
+ version="0.9.1",
description="Unity Machine Learning Agents Interface",
url="https://github.com/Unity-Technologies/ml-agents",
author="Unity Technologies",
diff --git a/ml-agents/mlagents/trainers/bc/policy.py b/ml-agents/mlagents/trainers/bc/policy.py
index 10faf7fab9..2c31a19dec 100644
--- a/ml-agents/mlagents/trainers/bc/policy.py
+++ b/ml-agents/mlagents/trainers/bc/policy.py
@@ -79,24 +79,14 @@ def update(self, mini_batch, num_sequences):
self.model.sequence_length: self.sequence_length,
}
if self.use_continuous_act:
- feed_dict[self.model.true_action] = mini_batch["actions"].reshape(
- [-1, self.brain.vector_action_space_size[0]]
- )
+ feed_dict[self.model.true_action] = mini_batch["actions"]
else:
- feed_dict[self.model.true_action] = mini_batch["actions"].reshape(
- [-1, len(self.brain.vector_action_space_size)]
- )
+ feed_dict[self.model.true_action] = mini_batch["actions"]
feed_dict[self.model.action_masks] = np.ones(
(num_sequences, sum(self.brain.vector_action_space_size))
)
if self.use_vec_obs:
- apparent_obs_size = (
- self.brain.vector_observation_space_size
- * self.brain.num_stacked_vector_observations
- )
- feed_dict[self.model.vector_in] = mini_batch["vector_obs"].reshape(
- [-1, apparent_obs_size]
- )
+ feed_dict[self.model.vector_in] = mini_batch["vector_obs"]
for i, _ in enumerate(self.model.visual_in):
visual_obs = mini_batch["visual_obs%d" % i]
feed_dict[self.model.visual_in[i]] = visual_obs
diff --git a/ml-agents/mlagents/trainers/bc/trainer.py b/ml-agents/mlagents/trainers/bc/trainer.py
index 59da9fa34f..7f53bcacf4 100644
--- a/ml-agents/mlagents/trainers/bc/trainer.py
+++ b/ml-agents/mlagents/trainers/bc/trainer.py
@@ -124,7 +124,7 @@ def update_policy(self):
"""
Updates the policy.
"""
- self.demonstration_buffer.update_buffer.shuffle()
+ self.demonstration_buffer.update_buffer.shuffle(self.policy.sequence_length)
batch_losses = []
num_batches = min(
len(self.demonstration_buffer.update_buffer["actions"]) // self.n_sequences,
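Passing self.policy.sequence_length into shuffle suggests the demonstration buffer is now shuffled in whole sequences rather than individual timesteps, so recurrent policies still see contiguous trajectories after shuffling. A standalone sketch of that idea (illustration only, not the ML-Agents Buffer implementation):

```python
import numpy as np

def shuffle_in_sequences(data, sequence_length, seed=0):
    """Shuffle rows in whole blocks of `sequence_length` so timesteps that
    belong to the same recurrent sequence stay together and in order."""
    num_sequences = len(data) // sequence_length
    blocks = data[: num_sequences * sequence_length].reshape(
        num_sequences, sequence_length, *data.shape[1:]
    )
    order = np.random.RandomState(seed).permutation(num_sequences)
    return blocks[order].reshape(-1, *data.shape[1:])

# Eight timesteps with sequence_length=4: the two sequences are reordered,
# but each keeps its internal time order, e.g. [4 5 6 7 0 1 2 3].
steps = np.arange(8).reshape(8, 1)
print(shuffle_in_sequences(steps, sequence_length=4).ravel())
```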
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
index fd60b6306f..009e2663df 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
@@ -42,7 +42,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
# Create input ops for next (t+1) visual observations.
next_visual_input = LearningModel.create_visual_input(
self.policy_model.brain.camera_resolutions[i],
- name="next_visual_observation_" + str(i),
+ name="curiosity_next_visual_observation_" + str(i),
)
self.next_visual_in.append(next_visual_input)
@@ -53,7 +53,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
self.encoding_size,
LearningModel.swish,
1,
- "stream_{}_visual_obs_encoder".format(i),
+ "curiosity_stream_{}_visual_obs_encoder".format(i),
False,
)
@@ -62,7 +62,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
self.encoding_size,
LearningModel.swish,
1,
- "stream_{}_visual_obs_encoder".format(i),
+ "curiosity_stream_{}_visual_obs_encoder".format(i),
True,
)
visual_encoders.append(encoded_visual)
@@ -80,7 +80,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
self.next_vector_in = tf.placeholder(
shape=[None, self.policy_model.vec_obs_size],
dtype=tf.float32,
- name="next_vector_observation",
+ name="curiosity_next_vector_observation",
)
encoded_vector_obs = self.policy_model.create_vector_observation_encoder(
@@ -88,7 +88,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
self.encoding_size,
LearningModel.swish,
2,
- "vector_obs_encoder",
+ "curiosity_vector_obs_encoder",
False,
)
encoded_next_vector_obs = self.policy_model.create_vector_observation_encoder(
@@ -96,7 +96,7 @@ def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
self.encoding_size,
LearningModel.swish,
2,
- "vector_obs_encoder",
+ "curiosity_vector_obs_encoder",
True,
)
encoded_state_list.append(encoded_vector_obs)
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
index bb8d15cabd..6f33590bdf 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
@@ -114,7 +114,7 @@ def make_inputs(self) -> None:
# Create input ops for next (t+1) visual observations.
visual_input = self.policy_model.create_visual_input(
self.policy_model.brain.camera_resolutions[i],
- name="visual_observation_" + str(i),
+ name="gail_visual_observation_" + str(i),
)
self.expert_visual_in.append(visual_input)
@@ -123,7 +123,7 @@ def make_inputs(self) -> None:
self.encoding_size,
LearningModel.swish,
1,
- "stream_{}_visual_obs_encoder".format(i),
+ "gail_stream_{}_visual_obs_encoder".format(i),
False,
)
@@ -132,7 +132,7 @@ def make_inputs(self) -> None:
self.encoding_size,
LearningModel.swish,
1,
- "stream_{}_visual_obs_encoder".format(i),
+ "gail_stream_{}_visual_obs_encoder".format(i),
True,
)
visual_policy_encoders.append(encoded_policy_visual)
@@ -165,7 +165,7 @@ def create_encoder(
concat_input,
self.h_size,
activation=LearningModel.swish,
- name="d_hidden_1",
+ name="gail_d_hidden_1",
reuse=reuse,
)
@@ -173,7 +173,7 @@ def create_encoder(
hidden_1,
self.h_size,
activation=LearningModel.swish,
- name="d_hidden_2",
+ name="gail_d_hidden_2",
reuse=reuse,
)
@@ -184,7 +184,7 @@ def create_encoder(
hidden_2,
self.z_size,
reuse=reuse,
- name="z_mean",
+ name="gail_z_mean",
kernel_initializer=LearningModel.scaled_init(0.01),
)
@@ -200,7 +200,7 @@ def create_encoder(
estimate_input,
1,
activation=tf.nn.sigmoid,
- name="d_estimate",
+ name="gail_d_estimate",
reuse=reuse,
)
return estimate, z_mean, concat_input
@@ -211,7 +211,7 @@ def create_network(self) -> None:
"""
if self.use_vail:
self.z_sigma = tf.get_variable(
- "sigma_vail",
+ "gail_sigma_vail",
self.z_size,
dtype=tf.float32,
initializer=tf.ones_initializer(),
@@ -219,7 +219,7 @@ def create_network(self) -> None:
self.z_sigma_sq = self.z_sigma * self.z_sigma
self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
self.use_noise = tf.placeholder(
- shape=[1], dtype=tf.float32, name="NoiseLevel"
+ shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
)
self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
self.encoded_expert, self.expert_action, self.done_expert, reuse=False
@@ -231,7 +231,7 @@ def create_network(self) -> None:
reuse=True,
)
self.discriminator_score = tf.reshape(
- self.policy_estimate, [-1], name="GAIL_reward"
+ self.policy_estimate, [-1], name="gail_reward"
)
self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
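The "curiosity_" and "gail_" prefixes above give each reward-signal model its own namespace in the shared TensorFlow graph. Without them, two modules requesting the same variable scope would either raise a ValueError (both with reuse=False) or silently share weights (reuse=True against the other module's variables), and identically named placeholders would get auto-uniquified, making them awkward to look up. A minimal sketch of the collision, using assumed scope names rather than the actual ML-Agents encoders:

```python
import tensorflow as tf  # the patch pins tensorflow>=1.7,<1.8

with tf.Graph().as_default():
    # First module builds an encoder variable under a generic scope name.
    with tf.variable_scope("vector_obs_encoder"):
        tf.get_variable("w", shape=[8, 8])

    # A second module requesting the same scope without reuse fails...
    try:
        with tf.variable_scope("vector_obs_encoder"):
            tf.get_variable("w", shape=[8, 8])
    except ValueError as err:
        print("collision:", err)

    # ...while a prefixed scope, as in the renames above, stays independent.
    with tf.variable_scope("gail_vector_obs_encoder"):
        tf.get_variable("w", shape=[8, 8])
```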
diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py
index 19f7457980..4f463cfd6e 100644
--- a/ml-agents/mlagents/trainers/learn.py
+++ b/ml-agents/mlagents/trainers/learn.py
@@ -93,7 +93,7 @@ def run_training(
env = SubprocessEnvManager(env_factory, num_envs)
maybe_meta_curriculum = try_create_meta_curriculum(curriculum_folder, env, lesson)
sampler_manager, resampling_interval = create_sampler_manager(
- sampler_file_path, env.reset_parameters
+ sampler_file_path, env.reset_parameters, run_seed
)
trainers = initialize_trainers(
@@ -132,7 +132,7 @@ def run_training(
tc.start_learning(env)
-def create_sampler_manager(sampler_file_path, env_reset_params):
+def create_sampler_manager(sampler_file_path, env_reset_params, run_seed=None):
sampler_config = None
resample_interval = None
if sampler_file_path is not None:
@@ -150,7 +150,7 @@ def create_sampler_manager(sampler_file_path, env_reset_params):
"Resampling interval was not specified in the sampler file."
" Please specify it with the 'resampling-interval' key in the sampler config file."
)
- sampler_manager = SamplerManager(sampler_config)
+ sampler_manager = SamplerManager(sampler_config, run_seed)
return sampler_manager, resample_interval
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index fbf9443b14..5afe7fa6d2 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -149,3 +149,23 @@ def create_buffer(brain_infos, brain_params, sequence_length):
buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)
return buffer
+
+
+def create_mock_3dball_brain():
+ mock_brain = create_mock_brainparams(
+ vector_action_space_type="continuous",
+ vector_action_space_size=[2],
+ vector_observation_space_size=8,
+ )
+ mock_brain.brain_name = "Ball3DBrain"
+ return mock_brain
+
+
+def create_mock_banana_brain():
+ mock_brain = create_mock_brainparams(
+ number_visual_observations=1,
+ vector_action_space_type="discrete",
+ vector_action_space_size=[3, 3, 3, 2],
+ vector_observation_space_size=0,
+ )
+ return mock_brain
diff --git a/ml-agents/mlagents/trainers/tests/test_bc.py b/ml-agents/mlagents/trainers/tests/test_bc.py
index 800ab4a23f..3d162dbcee 100644
--- a/ml-agents/mlagents/trainers/tests/test_bc.py
+++ b/ml-agents/mlagents/trainers/tests/test_bc.py
@@ -1,12 +1,15 @@
import unittest.mock as mock
import pytest
+import os
import numpy as np
import tensorflow as tf
import yaml
from mlagents.trainers.bc.models import BehavioralCloningModel
+import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.bc.policy import BCPolicy
+from mlagents.trainers.bc.offline_trainer import BCTrainer
from mlagents.envs import UnityEnvironment
from mlagents.envs.mock_communicator import MockCommunicator
@@ -21,10 +24,37 @@ def dummy_config():
use_recurrent: false
sequence_length: 32
memory_size: 32
+ batches_per_epoch: 1
+ batch_size: 32
+ summary_freq: 2000
+ max_steps: 4000
"""
)
+@mock.patch("mlagents.envs.UnityEnvironment")
+def test_bc_trainer(mock_env, dummy_config):
+ mock_brain = mb.create_mock_3dball_brain()
+ mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
+ mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
+ env = mock_env()
+
+ trainer_parameters = dummy_config
+ trainer_parameters["summary_path"] = "tmp"
+ trainer_parameters["model_path"] = "tmp"
+ trainer_parameters["demo_path"] = (
+ os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
+ )
+ trainer = BCTrainer(
+ mock_brain, trainer_parameters, training=True, load=False, seed=0, run_id=0
+ )
+ trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy, 100)
+ trainer.update_policy()
+ assert len(trainer.stats["Losses/Cloning Loss"]) > 0
+ trainer.increment_step(1)
+ assert trainer.step == 1
+
+
@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher")
@mock.patch("mlagents.envs.UnityEnvironment.get_communicator")
def test_bc_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
diff --git a/ml-agents/mlagents/trainers/tests/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/test_bcmodule.py
index 0eee0f4d2e..d57741448b 100644
--- a/ml-agents/mlagents/trainers/tests/test_bcmodule.py
+++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py
@@ -42,25 +42,6 @@ def dummy_config():
)
-def create_mock_3dball_brain():
- mock_brain = mb.create_mock_brainparams(
- vector_action_space_type="continuous",
- vector_action_space_size=[2],
- vector_observation_space_size=8,
- )
- return mock_brain
-
-
-def create_mock_banana_brain():
- mock_brain = mb.create_mock_brainparams(
- number_visual_observations=1,
- vector_action_space_type="discrete",
- vector_action_space_size=[3, 3, 3, 2],
- vector_observation_space_size=0,
- )
- return mock_brain
-
-
def create_ppo_policy_with_bc_mock(
mock_env, mock_brain, dummy_config, use_rnn, demo_file
):
@@ -84,7 +65,7 @@ def create_ppo_policy_with_bc_mock(
@mock.patch("mlagents.envs.UnityEnvironment")
def test_bcmodule_defaults(mock_env, dummy_config):
# See if default values match
- mock_brain = create_mock_3dball_brain()
+ mock_brain = mb.create_mock_3dball_brain()
env, policy = create_ppo_policy_with_bc_mock(
mock_env, mock_brain, dummy_config, False, "test.demo"
)
@@ -105,7 +86,7 @@ def test_bcmodule_defaults(mock_env, dummy_config):
# Test with continuous control env and vector actions
@mock.patch("mlagents.envs.UnityEnvironment")
def test_bcmodule_update(mock_env, dummy_config):
- mock_brain = create_mock_3dball_brain()
+ mock_brain = mb.create_mock_3dball_brain()
env, policy = create_ppo_policy_with_bc_mock(
mock_env, mock_brain, dummy_config, False, "test.demo"
)
@@ -118,7 +99,7 @@ def test_bcmodule_update(mock_env, dummy_config):
# Test with RNN
@mock.patch("mlagents.envs.UnityEnvironment")
def test_bcmodule_rnn_update(mock_env, dummy_config):
- mock_brain = create_mock_3dball_brain()
+ mock_brain = mb.create_mock_3dball_brain()
env, policy = create_ppo_policy_with_bc_mock(
mock_env, mock_brain, dummy_config, True, "test.demo"
)
@@ -131,7 +112,7 @@ def test_bcmodule_rnn_update(mock_env, dummy_config):
# Test with discrete control and visual observations
@mock.patch("mlagents.envs.UnityEnvironment")
def test_bcmodule_dc_visual_update(mock_env, dummy_config):
- mock_brain = create_mock_banana_brain()
+ mock_brain = mb.create_mock_banana_brain()
env, policy = create_ppo_policy_with_bc_mock(
mock_env, mock_brain, dummy_config, False, "testdcvis.demo"
)
@@ -144,7 +125,7 @@ def test_bcmodule_dc_visual_update(mock_env, dummy_config):
# Test with discrete control, visual observations and RNN
@mock.patch("mlagents.envs.UnityEnvironment")
def test_bcmodule_rnn_dc_update(mock_env, dummy_config):
- mock_brain = create_mock_banana_brain()
+ mock_brain = mb.create_mock_banana_brain()
env, policy = create_ppo_policy_with_bc_mock(
mock_env, mock_brain, dummy_config, True, "testdcvis.demo"
)
diff --git a/ml-agents/setup.py b/ml-agents/setup.py
index c201ed5f27..0df6cf33d5 100644
--- a/ml-agents/setup.py
+++ b/ml-agents/setup.py
@@ -10,7 +10,7 @@
setup(
name="mlagents",
- version="0.9.0",
+ version="0.9.1",
description="Unity Machine Learning Agents",
long_description=long_description,
long_description_content_type="text/markdown",
@@ -29,7 +29,7 @@
),
zip_safe=False,
install_requires=[
- "mlagents_envs==0.9.0",
+ "mlagents_envs==0.9.1",
"tensorflow>=1.7,<1.8",
"Pillow>=4.2.1",
"matplotlib",