From d523c8cc0ac0214963eee5323f717c67c647e540 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 7 Aug 2020 16:14:21 -0700 Subject: [PATCH 01/17] Running LSTM for SAC --- .../mlagents/trainers/sac/optimizer_torch.py | 100 +++++++++++++++--- 1 file changed, 85 insertions(+), 15 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 9c3ced80a7..84df073fc0 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -1,7 +1,8 @@ import numpy as np -from typing import Dict, List, Mapping, cast, Tuple +from typing import Dict, List, Mapping, cast, Tuple, Optional import torch from torch import nn +import attr from mlagents_envs.logging_util import get_logger from mlagents_envs.base_env import ActionType @@ -56,10 +57,24 @@ def forward( self, vec_inputs: List[torch.Tensor], vis_inputs: List[torch.Tensor], - actions: torch.Tensor = None, + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions=actions) - q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions=actions) + q1_out, _ = self.q1_network( + vec_inputs, + vis_inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + q2_out, _ = self.q2_network( + vec_inputs, + vis_inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) return q1_out, q2_out def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): @@ -87,17 +102,28 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): for name in self.stream_names } + # Critics should have 1/2 of the memory of the policy + critic_memory = policy_network_settings.memory + if critic_memory is not None: + critic_memory = attr.evolve( + critic_memory, memory_size=critic_memory.memory_size // 2 + ) + value_network_settings = attr.evolve( + policy_network_settings, memory=critic_memory + ) + self.value_network = TorchSACOptimizer.PolicyValueNetwork( self.stream_names, self.policy.behavior_spec.observation_shapes, - policy_network_settings, + value_network_settings, self.policy.behavior_spec.action_type, self.act_size, ) + self.target_network = ValueNetwork( self.stream_names, self.policy.behavior_spec.observation_shapes, - policy_network_settings, + value_network_settings, ) self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0) @@ -232,7 +258,6 @@ def sac_value_loss( v_backup = min_policy_qs[name] - torch.sum( _ent_coef * log_probs, dim=1 ) - # print(log_probs, v_backup, _ent_coef, loss_masks) value_loss = 0.5 * torch.mean( loss_masks * torch.nn.functional.mse_loss(values[name], v_backup) ) @@ -369,12 +394,30 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: else: actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) - memories = [ + memories_list = [ ModelUtils.list_to_tensor(batch["memory"][i]) for i in range(0, len(batch["memory"]), self.policy.sequence_length) ] - if len(memories) > 0: - memories = torch.stack(memories).unsqueeze(0) + # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true. 
+ offset = 1 if self.policy.sequence_length > 1 else 0 + next_memories_list = [ + ModelUtils.list_to_tensor( + batch["memory"][i][: self.policy.m_size // 2] + ) # only pass value part of memory to target network + for i in range(offset, len(batch["memory"]), self.policy.sequence_length) + ] + + if len(memories_list) > 0: + memories = torch.stack(memories_list).unsqueeze(0) + next_memories = torch.stack(next_memories_list).unsqueeze(0) + else: + memories = None + next_memories = None + # Q network memories are 0'ed out, since we don't have them during inference. + q_memories = torch.zeros( + (memories.shape[0], memories.shape[1], memories.shape[2] // 2) + ) + vis_obs: List[torch.Tensor] = [] next_vis_obs: List[torch.Tensor] = [] if self.policy.use_vis_obs: @@ -415,18 +458,45 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: ) if self.policy.use_continuous_act: squeezed_actions = actions.squeeze(-1) - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions) - q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) + q1p_out, q2p_out = self.value_network( + vec_obs, + vis_obs, + sampled_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_out, q2_out = self.value_network( + vec_obs, + vis_obs, + squeezed_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) q1_stream, q2_stream = q1_out, q2_out else: with torch.no_grad(): - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) - q1_out, q2_out = self.value_network(vec_obs, vis_obs) + q1p_out, q2p_out = self.value_network( + vec_obs, + vis_obs, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_out, q2_out = self.value_network( + vec_obs, + vis_obs, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) q1_stream = self._condense_q_streams(q1_out, actions) q2_stream = self._condense_q_streams(q2_out, actions) with torch.no_grad(): - target_values, _ = self.target_network(next_vec_obs, next_vis_obs) + target_values, _ = self.target_network( + next_vec_obs, + next_vis_obs, + memories=next_memories, + sequence_length=self.policy.sequence_length, + ) masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) use_discrete = not self.policy.use_continuous_act dones = ModelUtils.list_to_tensor(batch["done"]) From f2873b296fe60fefb86545405c6076e26d10af5e Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 10 Aug 2020 10:38:19 -0700 Subject: [PATCH 02/17] Use correct half of memories --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 84df073fc0..3ba92e6c1b 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -402,7 +402,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: offset = 1 if self.policy.sequence_length > 1 else 0 next_memories_list = [ ModelUtils.list_to_tensor( - batch["memory"][i][: self.policy.m_size // 2] + batch["memory"][i][self.policy.m_size // 2 :] ) # only pass value part of memory to target network for i in range(offset, len(batch["memory"]), self.policy.sequence_length) ] From b97b1e535af6006bd864249693c78460bc721877 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 10 Aug 2020 17:40:47 -0700 Subject: [PATCH 03/17] Fix policy memory storinig --- 
ml-agents/mlagents/trainers/policy/torch_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index d6dd822646..9d935d4676 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -186,7 +186,7 @@ def evaluate( run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0) run_out["learning_rate"] = 0.0 if self.use_recurrent: - run_out["memories"] = memories.detach().cpu().numpy() + run_out["memory_out"] = memories.detach().cpu().numpy().squeeze(0) return run_out def get_action( From cd509ddbbdc3e4025040105f9d521034a37fdf3a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 10 Aug 2020 18:12:27 -0700 Subject: [PATCH 04/17] Fix SeparateActorCritic and add test --- ml-agents/mlagents/trainers/tests/torch/test_networks.py | 6 +++++- ml-agents/mlagents/trainers/torch/networks.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_networks.py b/ml-agents/mlagents/trainers/tests/torch/test_networks.py index ff5209b676..19030aeafc 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_networks.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_networks.py @@ -203,7 +203,11 @@ def test_actor_critic(ac_type, lstm): assert value_out[stream].shape == (1,) # Test get_dist_and_value - dists, value_out, _ = actor.get_dist_and_value([sample_obs], [], memories=memories) + dists, value_out, mem_out = actor.get_dist_and_value( + [sample_obs], [], memories=memories + ) + if mem_out is not None: + assert mem_out.shape == memories.shape for dist in dists: assert isinstance(dist, GaussianDistInstance) for stream in stream_names: diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 15f0d92e2d..aff61ce0f6 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -463,7 +463,7 @@ def get_dist_and_value( vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) if self.use_lstm: - mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=1) + mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1) else: mem_out = None return dists, value_outputs, mem_out From 07bb4c0818887dab74e75236f51f6ca1533bde80 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 19:15:30 -0700 Subject: [PATCH 05/17] Use loss masks in PPO. --- .../mlagents/trainers/ppo/optimizer_torch.py | 44 +++++++++++++------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 9bbb7b51c8..13fdc12e01 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -61,12 +61,15 @@ def ppo_value_loss( old_values: Dict[str, torch.Tensor], returns: Dict[str, torch.Tensor], epsilon: float, + loss_masks: torch.Tensor, ) -> torch.Tensor: """ - Creates training-specific Tensorflow ops for PPO models. - :param returns: - :param old_values: - :param values: + Evaluates value loss for PPO. + :param values: Value output of the current network. + :param old_values: Value stored with experiences in buffer. + :param returns: Computed returns. + :param epsilon: Clipping value for value estimate. + :param loss_mask: Mask for losses. Used with LSTM to ignore 0'ed out experiences. 
""" value_losses = [] for name, head in values.items(): @@ -77,18 +80,25 @@ def ppo_value_loss( ) v_opt_a = (returns_tensor - head) ** 2 v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 - value_loss = torch.mean(torch.max(v_opt_a, v_opt_b)) + masked_loss = torch.max(v_opt_a, v_opt_b) * loss_masks + value_loss = torch.mean(masked_loss) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) return value_loss - def ppo_policy_loss(self, advantages, log_probs, old_log_probs, masks): + def ppo_policy_loss( + self, + advantages: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + loss_masks: torch.Tensor, + ) -> torch.Tensor: """ - Creates training-specific Tensorflow ops for PPO models. - :param masks: - :param advantages: + Evaluate PPO policy loss. + :param advantages: Computed advantages. :param log_probs: Current policy probabilities :param old_log_probs: Past policy probabilities + :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences. """ advantage = advantages.unsqueeze(-1) @@ -99,7 +109,8 @@ def ppo_policy_loss(self, advantages, log_probs, old_log_probs, masks): p_opt_b = ( torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage ) - policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b)) + masked_loss = torch.min(p_opt_a, p_opt_b) * loss_masks + policy_loss = -torch.mean(masked_loss) return policy_loss @timed @@ -153,14 +164,21 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=memories, seq_len=self.policy.sequence_length, ) - value_loss = self.ppo_value_loss(values, old_values, returns, decay_eps) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) + value_loss = self.ppo_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) policy_loss = self.ppo_policy_loss( ModelUtils.list_to_tensor(batch["advantages"]), log_probs, ModelUtils.list_to_tensor(batch["action_probs"]), - ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32), + loss_masks, + ) + loss = ( + policy_loss + + 0.5 * value_loss + - decay_bet * torch.mean(entropy * loss_masks) ) - loss = policy_loss + 0.5 * value_loss - decay_bet * torch.mean(entropy) # Set optimizer learning rate ModelUtils.update_learning_rate(self.optimizer, decay_lr) From 0a3c795cee4197003d5c7f5e96b03925e14f9896 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 19:46:34 -0700 Subject: [PATCH 06/17] Proper shape of masks --- ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 13fdc12e01..1232429f36 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -109,7 +109,7 @@ def ppo_policy_loss( p_opt_b = ( torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage ) - masked_loss = torch.min(p_opt_a, p_opt_b) * loss_masks + masked_loss = torch.min(p_opt_a, p_opt_b).flatten() * loss_masks policy_loss = -torch.mean(masked_loss) return policy_loss @@ -164,7 +164,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=memories, seq_len=self.policy.sequence_length, ) - loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.float32) value_loss = self.ppo_value_loss( values, old_values, 
returns, decay_eps, loss_masks ) @@ -177,7 +177,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: loss = ( policy_loss + 0.5 * value_loss - - decay_bet * torch.mean(entropy * loss_masks) + - decay_bet * torch.mean(entropy.flatten() * loss_masks) ) # Set optimizer learning rate From 2337d15c09e2cdd91ec5d0c08eb63a2ba5d5e297 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 20:04:48 -0700 Subject: [PATCH 07/17] Proper mask mean for PPO --- ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 14 +++++++------- ml-agents/mlagents/trainers/torch/utils.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 1232429f36..b330bce0fb 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -80,8 +80,7 @@ def ppo_value_loss( ) v_opt_a = (returns_tensor - head) ** 2 v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 - masked_loss = torch.max(v_opt_a, v_opt_b) * loss_masks - value_loss = torch.mean(masked_loss) + value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) return value_loss @@ -109,8 +108,9 @@ def ppo_policy_loss( p_opt_b = ( torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage ) - masked_loss = torch.min(p_opt_a, p_opt_b).flatten() * loss_masks - policy_loss = -torch.mean(masked_loss) + policy_loss = -1 * ModelUtils.masked_mean( + torch.min(p_opt_a, p_opt_b).flatten(), loss_masks + ) return policy_loss @timed @@ -138,7 +138,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: if self.policy.use_continuous_act: actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) else: - actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) + actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.bool) memories = [ ModelUtils.list_to_tensor(batch["memory"][i]) @@ -164,7 +164,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=memories, seq_len=self.policy.sequence_length, ) - loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.float32) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) value_loss = self.ppo_value_loss( values, old_values, returns, decay_eps, loss_masks ) @@ -177,7 +177,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: loss = ( policy_loss + 0.5 * value_loss - - decay_bet * torch.mean(entropy.flatten() * loss_masks) + - decay_bet * ModelUtils.masked_mean(entropy.flatten(), loss_masks) ) # Set optimizer learning rate diff --git a/ml-agents/mlagents/trainers/torch/utils.py b/ml-agents/mlagents/trainers/torch/utils.py index baa99c98ef..ba6b7d57a0 100644 --- a/ml-agents/mlagents/trainers/torch/utils.py +++ b/ml-agents/mlagents/trainers/torch/utils.py @@ -284,3 +284,13 @@ def get_probs_and_entropy( else: all_probs = torch.cat(all_probs_list, dim=-1) return log_probs, entropies, all_probs + + @staticmethod + def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: + """ + Returns the mean of the tensor but ignoring the values specified by masks. + Used for masking out loss functions. + :param tensor: Tensor which needs mean computation. + :param masks: Boolean tensor of masks with same dimension as tensor. 
+ """ + return (tensor * masks).sum() / masks.float().sum() From 1f69102f45dd5bfd7a0c8e63fdee701af7a25632 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 20:17:47 -0700 Subject: [PATCH 08/17] Fix dtype for actions --- ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index b330bce0fb..e162166481 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -138,7 +138,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: if self.policy.use_continuous_act: actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) else: - actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.bool) + actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) memories = [ ModelUtils.list_to_tensor(batch["memory"][i]) From c0a77f76b6f9b1c6a3187b9c56e49a7228605e7b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 14:13:17 -0700 Subject: [PATCH 09/17] Proper initialization and SAC masking --- .../mlagents/trainers/sac/optimizer_torch.py | 27 +++++++++---------- ml-agents/mlagents/trainers/torch/layers.py | 24 +++++++++++++++++ ml-agents/mlagents/trainers/torch/networks.py | 13 +++++---- 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 3ba92e6c1b..6d110f99f5 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -194,11 +194,11 @@ def sac_q_loss( * self.gammas[i] * target_values[name] ) - _q1_loss = 0.5 * torch.mean( - loss_masks * torch.nn.functional.mse_loss(q_backup, q1_stream) + _q1_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q1_stream), loss_masks ) - _q2_loss = 0.5 * torch.mean( - loss_masks * torch.nn.functional.mse_loss(q_backup, q2_stream) + _q2_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q2_stream), loss_masks ) q1_losses.append(_q1_loss) @@ -258,8 +258,8 @@ def sac_value_loss( v_backup = min_policy_qs[name] - torch.sum( _ent_coef * log_probs, dim=1 ) - value_loss = 0.5 * torch.mean( - loss_masks * torch.nn.functional.mse_loss(values[name], v_backup) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup), loss_masks ) value_losses.append(value_loss) else: @@ -278,9 +278,9 @@ def sac_value_loss( v_backup = min_policy_qs[name] - torch.mean( branched_ent_bonus, axis=0 ) - value_loss = 0.5 * torch.mean( - loss_masks - * torch.nn.functional.mse_loss(values[name], v_backup.squeeze()) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), + loss_masks, ) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) @@ -300,7 +300,7 @@ def sac_policy_loss( if not discrete: mean_q1 = mean_q1.unsqueeze(1) batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) - policy_loss = torch.mean(loss_masks * batch_policy_loss) + policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks) else: action_probs = log_probs.exp() branched_per_action_ent = ModelUtils.break_into_branches( @@ -347,9 +347,8 @@ def sac_entropy_loss( target_current_diff = torch.squeeze( target_current_diff_branched, axis=2 ) - 
entropy_loss = -torch.mean( - loss_masks - * torch.mean(self._log_ent_coef * target_current_diff, axis=1) + entropy_loss = -1 * ModelUtils.masked_mean( + torch.mean(self._log_ent_coef * target_current_diff, axis=1), loss_masks ) return entropy_loss @@ -497,7 +496,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=next_memories, sequence_length=self.policy.sequence_length, ) - masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) + masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) use_discrete = not self.policy.use_continuous_act dones = ModelUtils.list_to_tensor(batch["done"]) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index 8dbb1cbcb4..d1c68887df 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -46,3 +46,27 @@ def linear_layer( layer.weight.data *= kernel_gain _init_methods[bias_init](layer.bias.data) return layer + + +def lstm_layer( + input_size: int, + hidden_size: int, + num_layers: int = 1, + batch_first: bool = True, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, +) -> torch.nn.Module: + """ + Creates a torch.nn.LSTM and initializes its weights and biases. Provides a + forget_bias offset like is done in TensorFlow. + """ + lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first) + # Add forget_bias to forget gate bias + for name, param in lstm.named_parameters(): + if "weight" in name: + _init_methods[kernel_init](param.data) + elif "bias" in name: + _init_methods[bias_init](param.data) + param.data[hidden_size : 2 * hidden_size].add_(forget_bias) + return lstm diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index aff61ce0f6..b60cfbe543 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -14,6 +14,7 @@ from mlagents.trainers.settings import NetworkSettings from mlagents.trainers.torch.utils import ModelUtils from mlagents.trainers.torch.decoders import ValueHeads +from mlagents.trainers.torch.layers import lstm_layer ActivationFunction = Callable[[torch.Tensor], torch.Tensor] EncoderFunction = Callable[ @@ -50,7 +51,7 @@ def __init__( ) if self.use_lstm: - self.lstm = nn.LSTM(self.h_size, self.m_size // 2, 1) + self.lstm = lstm_layer(self.h_size, self.m_size // 2, batch_first=True) else: self.lstm = None @@ -101,13 +102,11 @@ def forward( raise Exception("No valid inputs to network.") if self.use_lstm: - encoding = encoding.view([sequence_length, -1, self.h_size]) + # Resize to (batch, sequence length, encoding size) + encoding = encoding.reshape([-1, sequence_length, self.h_size]) memories = torch.split(memories, self.m_size // 2, dim=-1) - encoding, memories = self.lstm( - encoding.contiguous(), - (memories[0].contiguous(), memories[1].contiguous()), - ) - encoding = encoding.view([-1, self.m_size // 2]) + encoding, memories = self.lstm(encoding, (memories[0], memories[1])) + encoding = encoding.reshape([-1, self.m_size // 2]) memories = torch.cat(memories, dim=-1) return encoding, memories From f404834ea551eed1ec649af72591ce59ba1cf8d8 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 15:53:13 -0700 Subject: [PATCH 10/17] Experimental amrl layer --- ml-agents/mlagents/trainers/torch/layers.py | 39 +++++++++++++++++++ 
ml-agents/mlagents/trainers/torch/networks.py | 6 +-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index d1c68887df..4a8ed07374 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -70,3 +70,42 @@ def lstm_layer( _init_methods[bias_init](param.data) param.data[hidden_size : 2 * hidden_size].add_(forget_bias) return lstm + + +class AMRLMax(torch.nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + batch_first: bool = True, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, + ): + super().__init__() + self.lstm = lstm_layer( + input_size, + hidden_size, + num_layers, + batch_first, + forget_bias, + kernel_init, + bias_init, + ) + self.hidden_size = hidden_size + + def forward(self, input_tensor, h0_c0): + hidden = h0_c0 + all_out = [] + m = None + for t in range(input_tensor.shape[1]): + out, hidden = self.lstm(input_tensor[:, t : t + 1, :], hidden) + h_half, other_half = torch.split(out, self.hidden_size // 2, dim=-1) + if m is None: + m = h_half + else: + m = torch.max(m, h_half) + out = torch.cat([m, other_half]) + all_out.append(out) + return torch.cat(all_out, dim=1), hidden diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index b60cfbe543..264cef52a7 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -14,7 +14,7 @@ from mlagents.trainers.settings import NetworkSettings from mlagents.trainers.torch.utils import ModelUtils from mlagents.trainers.torch.decoders import ValueHeads -from mlagents.trainers.torch.layers import lstm_layer +from mlagents.trainers.torch.layers import AMRLMax ActivationFunction = Callable[[torch.Tensor], torch.Tensor] EncoderFunction = Callable[ @@ -51,9 +51,9 @@ def __init__( ) if self.use_lstm: - self.lstm = lstm_layer(self.h_size, self.m_size // 2, batch_first=True) + self.lstm = AMRLMax(self.h_size, self.m_size // 2, batch_first=True) else: - self.lstm = None + self.lstm = None # type: ignore def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None: for vec_input, vec_enc in zip(vec_inputs, self.vector_encoders): From beab310ec471d2aa06b534a38e6f04965b76e3a4 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 16:13:59 -0700 Subject: [PATCH 11/17] Add extra FF layer --- ml-agents/mlagents/trainers/torch/layers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index 4a8ed07374..bb94a28edd 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -82,6 +82,7 @@ def __init__( forget_bias: float = 1.0, kernel_init: Initialization = Initialization.XavierGlorotUniform, bias_init: Initialization = Initialization.Zero, + num_post_layers: int = 1, ): super().__init__() self.lstm = lstm_layer( @@ -94,6 +95,18 @@ def __init__( bias_init, ) self.hidden_size = hidden_size + self.layers = [] + for _ in range(num_post_layers): + self.layers.append( + linear_layer( + input_size, + hidden_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.0, + ) + ) + self.layers.append(Swish()) + self.seq_layers = torch.nn.Sequential(*self.layers) def 
forward(self, input_tensor, h0_c0): hidden = h0_c0 @@ -108,4 +121,5 @@ def forward(self, input_tensor, h0_c0): m = torch.max(m, h_half) out = torch.cat([m, other_half]) all_out.append(out) - return torch.cat(all_out, dim=1), hidden + full_out = self.seq_layers(torch.cat(all_out, dim=1)) + return full_out, hidden From 6fece65403d59d23a4f220966a61d4c9b3cdd14c Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 16:58:30 -0700 Subject: [PATCH 12/17] Faster implementation --- ml-agents/mlagents/trainers/torch/layers.py | 25 ++++++++++++--------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index bb94a28edd..ce56347c29 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -99,7 +99,7 @@ def __init__( for _ in range(num_post_layers): self.layers.append( linear_layer( - input_size, + hidden_size, hidden_size, kernel_init=Initialization.KaimingHeNormal, kernel_gain=1.0, @@ -110,16 +110,19 @@ def __init__( def forward(self, input_tensor, h0_c0): hidden = h0_c0 - all_out = [] + all_c = [] m = None - for t in range(input_tensor.shape[1]): - out, hidden = self.lstm(input_tensor[:, t : t + 1, :], hidden) - h_half, other_half = torch.split(out, self.hidden_size // 2, dim=-1) + lstm_out, hidden = self.lstm(input_tensor, hidden) + h_half, other_half = torch.split(lstm_out, self.hidden_size // 2, dim=-1) + for t in range(h_half.shape[1]): + h_half_subt = h_half[:, t : t + 1, :] if m is None: - m = h_half + m = h_half_subt else: - m = torch.max(m, h_half) - out = torch.cat([m, other_half]) - all_out.append(out) - full_out = self.seq_layers(torch.cat(all_out, dim=1)) - return full_out, hidden + m = torch.max(m, h_half_subt) + all_c.append(m) + concat_c = torch.cat(all_c, dim=1) + concat_out = torch.cat([concat_c, other_half], dim=-1) + full_out = self.seq_layers(concat_out.reshape([-1, self.hidden_size])) + full_out = full_out.reshape([-1, input_tensor.shape[1], self.hidden_size]) + return concat_out, hidden From eac1dc96dbebb443e5dba1e6846eb8a24568eee8 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 17:11:56 -0700 Subject: [PATCH 13/17] Add comment --- ml-agents/mlagents/trainers/torch/layers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index ce56347c29..58c549f0d5 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -73,6 +73,11 @@ def lstm_layer( class AMRLMax(torch.nn.Module): + """ + Implements Aggregation for LSTM as described here: + https://www.microsoft.com/en-us/research/publication/amrl-aggregated-memory-for-reinforcement-learning/ + """ + def __init__( self, input_size: int, From d2e31aadf26145b01a4419b8aea9a7f19c9a174f Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 18:16:44 -0700 Subject: [PATCH 14/17] Passthrough max --- ml-agents/mlagents/trainers/torch/layers.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index 58c549f0d5..eba616cd19 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -124,10 +124,19 @@ def forward(self, input_tensor, h0_c0): if m is None: m = h_half_subt else: - m = torch.max(m, h_half_subt) + m = AMRLMax.PassthroughMax.apply(m, 
h_half_subt) all_c.append(m) concat_c = torch.cat(all_c, dim=1) concat_out = torch.cat([concat_c, other_half], dim=-1) full_out = self.seq_layers(concat_out.reshape([-1, self.hidden_size])) full_out = full_out.reshape([-1, input_tensor.shape[1], self.hidden_size]) return concat_out, hidden + + class PassthroughMax(torch.autograd.Function): + @staticmethod + def forward(ctx, tensor1, tensor2): + return torch.max(tensor1, tensor2) + + @staticmethod + def backward(ctx, grad_output): + return grad_output.clone(), grad_output.clone() From bf485a2006adae9595b42c8be461b2b71d7c614c Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 14 Aug 2020 11:59:31 -0700 Subject: [PATCH 15/17] Memory size abstraction and fixes --- .../mlagents/trainers/policy/torch_policy.py | 1 + ml-agents/mlagents/trainers/torch/layers.py | 26 ++++++++++++------- ml-agents/mlagents/trainers/torch/networks.py | 26 ++++++++++++++----- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 9d935d4676..4d645e1fba 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -83,6 +83,7 @@ def __init__( conditional_sigma=self.condition_sigma_on_obs, tanh_squash=tanh_squash, ) + self.m_size = self.actor_critic.memory_size self.actor_critic.to(TestingConfiguration.device) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index b1303338fb..2d353db7df 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -125,24 +125,32 @@ def __init__( self.layers.append(Swish()) self.seq_layers = torch.nn.Sequential(*self.layers) - def forward(self, input_tensor, h0_c0): - hidden = h0_c0 + @property + def memory_size(self) -> int: + return self.hidden_size // 2 + 2 * self.hidden_size + + def forward(self, input_tensor, memories): + # memories is 1/2 * hidden_size (accumulant) + hidden_size/2 (h0) + hidden_size/2 (c0) + acc, h0, c0 = torch.split( + memories, + [self.hidden_size // 2, self.hidden_size, self.hidden_size], + dim=-1, + ) + hidden = (h0, c0) all_c = [] - m = None - lstm_out, hidden = self.lstm(input_tensor, hidden) + m = acc.permute([1, 0, 2]) + lstm_out, (h0_out, c0_out) = self.lstm(input_tensor, hidden) h_half, other_half = torch.split(lstm_out, self.hidden_size // 2, dim=-1) for t in range(h_half.shape[1]): h_half_subt = h_half[:, t : t + 1, :] - if m is None: - m = h_half_subt - else: - m = AMRLMax.PassthroughMax.apply(m, h_half_subt) + m = AMRLMax.PassthroughMax.apply(m, h_half_subt) all_c.append(m) concat_c = torch.cat(all_c, dim=1) concat_out = torch.cat([concat_c, other_half], dim=-1) full_out = self.seq_layers(concat_out.reshape([-1, self.hidden_size])) full_out = full_out.reshape([-1, input_tensor.shape[1], self.hidden_size]) - return concat_out, hidden + output_mem = torch.cat([m.permute([1, 0, 2]), h0_out, c0_out], dim=-1) + return concat_out, output_mem class PassthroughMax(torch.autograd.Function): @staticmethod diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 591568aece..87ff8c84cc 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -14,7 +14,7 @@ from mlagents.trainers.settings import NetworkSettings from mlagents.trainers.torch.utils import ModelUtils from mlagents.trainers.torch.decoders import ValueHeads 
-from mlagents.trainers.torch.layers import lstm_layer +from mlagents.trainers.torch.layers import AMRLMax ActivationFunction = Callable[[torch.Tensor], torch.Tensor] EncoderFunction = Callable[ @@ -51,7 +51,7 @@ def __init__( ) if self.use_lstm: - self.lstm = lstm_layer(self.h_size, self.m_size // 2, batch_first=True) + self.lstm = AMRLMax(self.h_size, self.m_size // 2, batch_first=True) else: self.lstm = None # type: ignore @@ -104,10 +104,10 @@ def forward( if self.use_lstm: # Resize to (batch, sequence length, encoding size) encoding = encoding.reshape([-1, sequence_length, self.h_size]) - memories = torch.split(memories, self.m_size // 2, dim=-1) + # memories = torch.split(memories, self.m_size // 2, dim=-1) encoding, memories = self.lstm(encoding, memories) encoding = encoding.reshape([-1, self.m_size // 2]) - memories = torch.cat(memories, dim=-1) + # memories = torch.cat(memories, dim=-1) return encoding, memories @@ -257,7 +257,7 @@ def __init__( self.act_type = act_type self.act_size = act_size self.version_number = torch.nn.Parameter(torch.Tensor([2.0])) - self.memory_size = torch.nn.Parameter(torch.Tensor([0])) + self.memory_size_param = torch.nn.Parameter(torch.Tensor([0])) self.is_continuous_int = torch.nn.Parameter( torch.Tensor([int(act_type == ActionType.CONTINUOUS)]) ) @@ -279,6 +279,13 @@ def __init__( self.encoding_size, act_size ) + @property + def memory_size(self) -> int: + if self.network_body.lstm is not None: + return self.network_body.lstm.memory_size + else: + return 0 + def update_normalization(self, vector_obs: List[torch.Tensor]) -> None: self.network_body.update_normalization(vector_obs) @@ -327,7 +334,7 @@ def forward( sampled_actions, dists[0].pdf(sampled_actions), self.version_number, - self.memory_size, + self.memory_size_param, self.is_continuous_int, self.act_size_vector, ) @@ -425,6 +432,13 @@ def __init__( stream_names, observation_shapes, use_network_settings ) + @property + def memory_size(self) -> int: + if self.network_body.lstm is not None: + return 2 * self.network_body.lstm.memory_size + else: + return 0 + def critic_pass( self, vec_inputs: List[torch.Tensor], From bd90e29c7b402f6a838d52d5b8baf645bc1bd16b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 14 Aug 2020 14:25:28 -0700 Subject: [PATCH 16/17] Fix SeparateActorCritic --- ml-agents/mlagents/trainers/torch/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 87ff8c84cc..8d73912667 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -449,7 +449,7 @@ def critic_pass( actor_mem, critic_mem = None, None if self.use_lstm: # Use only the back half of memories for critic - actor_mem, critic_mem = torch.split(memories, self.half_mem_size, -1) + actor_mem, critic_mem = torch.split(memories, self.memory_size, -1) value_outputs, critic_mem_out = self.critic( vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) @@ -470,7 +470,7 @@ def get_dist_and_value( ) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]: if self.use_lstm: # Use only the back half of memories for critic and actor - actor_mem, critic_mem = torch.split(memories, self.half_mem_size, dim=-1) + actor_mem, critic_mem = torch.split(memories, self.memory_size, dim=-1) else: critic_mem = None actor_mem = None From 7f4ea51b226759c3c620bc1aa2772fef6ead2b5a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: 
Fri, 14 Aug 2020 15:00:17 -0700 Subject: [PATCH 17/17] Fix SeparateActorCritic --- ml-agents/mlagents/trainers/torch/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 8d73912667..f606aafd81 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -449,7 +449,7 @@ def critic_pass( actor_mem, critic_mem = None, None if self.use_lstm: # Use only the back half of memories for critic - actor_mem, critic_mem = torch.split(memories, self.memory_size, -1) + actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1) value_outputs, critic_mem_out = self.critic( vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) @@ -470,7 +470,7 @@ def get_dist_and_value( ) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]: if self.use_lstm: # Use only the back half of memories for critic and actor - actor_mem, critic_mem = torch.split(memories, self.memory_size, dim=-1) + actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1) else: critic_mem = None actor_mem = None
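
The techniques this series leans on can be checked in isolation. First, patch 01 gives the SAC critics half of the policy's memory by rebuilding the network settings with attr.evolve instead of mutating them. Below is a minimal sketch of that pattern; the two settings classes are simplified stand-ins for the real ones in mlagents.trainers.settings, and their field names and defaults are assumptions made only for this sketch.

import attr

# Simplified stand-ins for mlagents.trainers.settings.NetworkSettings and its
# nested MemorySettings; fields and defaults here are assumptions for the sketch.
@attr.s(auto_attribs=True, frozen=True)
class MemorySettings:
    memory_size: int = 256
    sequence_length: int = 64

@attr.s(auto_attribs=True, frozen=True)
class NetworkSettings:
    hidden_units: int = 128
    memory: MemorySettings = None

policy_settings = NetworkSettings(memory=MemorySettings(memory_size=256))

# As in patch 01: critics get half of the policy memory. attr.evolve returns a
# modified copy, so the policy's own settings are left untouched.
critic_memory = attr.evolve(
    policy_settings.memory, memory_size=policy_settings.memory.memory_size // 2
)
value_settings = attr.evolve(policy_settings, memory=critic_memory)

print(value_settings.memory.memory_size)   # 128
print(policy_settings.memory.memory_size)  # 256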
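
Second, patches 05 through 09 replace plain means over per-timestep losses with ModelUtils.masked_mean, so the zero-padded timesteps produced by LSTM sequence batching do not dilute the loss. The helper's one-line body below is copied from the diff; the toy numbers are purely illustrative.

import torch

def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    # Same one-liner as ModelUtils.masked_mean in patch 07: average only the
    # entries whose mask is True.
    return (tensor * masks).sum() / masks.float().sum()

# Three real timesteps and one zero-padded one (the 99.0 stands in for whatever
# a padded step would otherwise contribute).
per_timestep_loss = torch.tensor([0.5, 1.5, 1.0, 99.0])
masks = torch.tensor([True, True, True, False])

print(masked_mean(per_timestep_loss, masks))  # tensor(1.)    padded step ignored
print(per_timestep_loss.mean())               # tensor(25.5)  what an unmasked mean gives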
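
Third, patches 10 through 14 add the AMRLMax layer, which max-aggregates half of the LSTM output over time and routes gradients through the max with a custom autograd function so earlier timesteps still learn. The sketch below isolates just that aggregation step with illustrative shapes; it omits the LSTM itself, the memory bookkeeping, and the post-aggregation linear layers of the actual layer.

import torch

class PassthroughMax(torch.autograd.Function):
    # Elementwise max whose backward pass sends the gradient to both inputs,
    # as in AMRLMax.PassthroughMax from patch 14.
    @staticmethod
    def forward(ctx, tensor1, tensor2):
        return torch.max(tensor1, tensor2)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.clone(), grad_output.clone()

def running_max_aggregate(half_hidden: torch.Tensor) -> torch.Tensor:
    # half_hidden: (batch, seq_len, features), the half of the LSTM output that
    # gets max-aggregated over time; in the real layer the other half is passed
    # through unchanged and concatenated back on.
    outputs = []
    m = half_hidden[:, 0:1, :]
    for t in range(half_hidden.shape[1]):
        m = PassthroughMax.apply(m, half_hidden[:, t : t + 1, :])
        outputs.append(m)
    return torch.cat(outputs, dim=1)

x = torch.randn(2, 5, 8, requires_grad=True)  # illustrative batch/seq/feature sizes
aggregated = running_max_aggregate(x)
aggregated.sum().backward()

print(aggregated.shape)    # torch.Size([2, 5, 8])
print(x.grad is not None)  # True: gradients reach every timestep despite the max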