From d523c8cc0ac0214963eee5323f717c67c647e540 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Fri, 7 Aug 2020 16:14:21 -0700 Subject: [PATCH 01/18] Running LSTM for SAC --- .../mlagents/trainers/sac/optimizer_torch.py | 100 +++++++++++++++--- 1 file changed, 85 insertions(+), 15 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 9c3ced80a7..84df073fc0 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -1,7 +1,8 @@ import numpy as np -from typing import Dict, List, Mapping, cast, Tuple +from typing import Dict, List, Mapping, cast, Tuple, Optional import torch from torch import nn +import attr from mlagents_envs.logging_util import get_logger from mlagents_envs.base_env import ActionType @@ -56,10 +57,24 @@ def forward( self, vec_inputs: List[torch.Tensor], vis_inputs: List[torch.Tensor], - actions: torch.Tensor = None, + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - q1_out, _ = self.q1_network(vec_inputs, vis_inputs, actions=actions) - q2_out, _ = self.q2_network(vec_inputs, vis_inputs, actions=actions) + q1_out, _ = self.q1_network( + vec_inputs, + vis_inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + q2_out, _ = self.q2_network( + vec_inputs, + vis_inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) return q1_out, q2_out def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): @@ -87,17 +102,28 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): for name in self.stream_names } + # Critics should have 1/2 of the memory of the policy + critic_memory = policy_network_settings.memory + if critic_memory is not None: + critic_memory = attr.evolve( + critic_memory, memory_size=critic_memory.memory_size // 2 + ) + value_network_settings = attr.evolve( + policy_network_settings, memory=critic_memory + ) + self.value_network = TorchSACOptimizer.PolicyValueNetwork( self.stream_names, self.policy.behavior_spec.observation_shapes, - policy_network_settings, + value_network_settings, self.policy.behavior_spec.action_type, self.act_size, ) + self.target_network = ValueNetwork( self.stream_names, self.policy.behavior_spec.observation_shapes, - policy_network_settings, + value_network_settings, ) self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0) @@ -232,7 +258,6 @@ def sac_value_loss( v_backup = min_policy_qs[name] - torch.sum( _ent_coef * log_probs, dim=1 ) - # print(log_probs, v_backup, _ent_coef, loss_masks) value_loss = 0.5 * torch.mean( loss_masks * torch.nn.functional.mse_loss(values[name], v_backup) ) @@ -369,12 +394,30 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: else: actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) - memories = [ + memories_list = [ ModelUtils.list_to_tensor(batch["memory"][i]) for i in range(0, len(batch["memory"]), self.policy.sequence_length) ] - if len(memories) > 0: - memories = torch.stack(memories).unsqueeze(0) + # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true. 
+ offset = 1 if self.policy.sequence_length > 1 else 0 + next_memories_list = [ + ModelUtils.list_to_tensor( + batch["memory"][i][: self.policy.m_size // 2] + ) # only pass value part of memory to target network + for i in range(offset, len(batch["memory"]), self.policy.sequence_length) + ] + + if len(memories_list) > 0: + memories = torch.stack(memories_list).unsqueeze(0) + next_memories = torch.stack(next_memories_list).unsqueeze(0) + else: + memories = None + next_memories = None + # Q network memories are 0'ed out, since we don't have them during inference. + q_memories = torch.zeros( + (memories.shape[0], memories.shape[1], memories.shape[2] // 2) + ) + vis_obs: List[torch.Tensor] = [] next_vis_obs: List[torch.Tensor] = [] if self.policy.use_vis_obs: @@ -415,18 +458,45 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: ) if self.policy.use_continuous_act: squeezed_actions = actions.squeeze(-1) - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs, sampled_actions) - q1_out, q2_out = self.value_network(vec_obs, vis_obs, squeezed_actions) + q1p_out, q2p_out = self.value_network( + vec_obs, + vis_obs, + sampled_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_out, q2_out = self.value_network( + vec_obs, + vis_obs, + squeezed_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) q1_stream, q2_stream = q1_out, q2_out else: with torch.no_grad(): - q1p_out, q2p_out = self.value_network(vec_obs, vis_obs) - q1_out, q2_out = self.value_network(vec_obs, vis_obs) + q1p_out, q2p_out = self.value_network( + vec_obs, + vis_obs, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + q1_out, q2_out = self.value_network( + vec_obs, + vis_obs, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) q1_stream = self._condense_q_streams(q1_out, actions) q2_stream = self._condense_q_streams(q2_out, actions) with torch.no_grad(): - target_values, _ = self.target_network(next_vec_obs, next_vis_obs) + target_values, _ = self.target_network( + next_vec_obs, + next_vis_obs, + memories=next_memories, + sequence_length=self.policy.sequence_length, + ) masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) use_discrete = not self.policy.use_continuous_act dones = ModelUtils.list_to_tensor(batch["done"]) From f2873b296fe60fefb86545405c6076e26d10af5e Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 10 Aug 2020 10:38:19 -0700 Subject: [PATCH 02/18] Use correct half of memories --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 84df073fc0..3ba92e6c1b 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -402,7 +402,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: offset = 1 if self.policy.sequence_length > 1 else 0 next_memories_list = [ ModelUtils.list_to_tensor( - batch["memory"][i][: self.policy.m_size // 2] + batch["memory"][i][self.policy.m_size // 2 :] ) # only pass value part of memory to target network for i in range(offset, len(batch["memory"]), self.policy.sequence_length) ] From b97b1e535af6006bd864249693c78460bc721877 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 10 Aug 2020 17:40:47 -0700 Subject: [PATCH 03/18] Fix policy memory storinig --- 
ml-agents/mlagents/trainers/policy/torch_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index d6dd822646..9d935d4676 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -186,7 +186,7 @@ def evaluate( run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0) run_out["learning_rate"] = 0.0 if self.use_recurrent: - run_out["memories"] = memories.detach().cpu().numpy() + run_out["memory_out"] = memories.detach().cpu().numpy().squeeze(0) return run_out def get_action( From cd509ddbbdc3e4025040105f9d521034a37fdf3a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 10 Aug 2020 18:12:27 -0700 Subject: [PATCH 04/18] Fix SeparateActorCritic and add test --- ml-agents/mlagents/trainers/tests/torch/test_networks.py | 6 +++++- ml-agents/mlagents/trainers/torch/networks.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_networks.py b/ml-agents/mlagents/trainers/tests/torch/test_networks.py index ff5209b676..19030aeafc 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_networks.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_networks.py @@ -203,7 +203,11 @@ def test_actor_critic(ac_type, lstm): assert value_out[stream].shape == (1,) # Test get_dist_and_value - dists, value_out, _ = actor.get_dist_and_value([sample_obs], [], memories=memories) + dists, value_out, mem_out = actor.get_dist_and_value( + [sample_obs], [], memories=memories + ) + if mem_out is not None: + assert mem_out.shape == memories.shape for dist in dists: assert isinstance(dist, GaussianDistInstance) for stream in stream_names: diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 15f0d92e2d..aff61ce0f6 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -463,7 +463,7 @@ def get_dist_and_value( vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) if self.use_lstm: - mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=1) + mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1) else: mem_out = None return dists, value_outputs, mem_out From 07bb4c0818887dab74e75236f51f6ca1533bde80 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 19:15:30 -0700 Subject: [PATCH 05/18] Use loss masks in PPO. --- .../mlagents/trainers/ppo/optimizer_torch.py | 44 +++++++++++++------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 9bbb7b51c8..13fdc12e01 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -61,12 +61,15 @@ def ppo_value_loss( old_values: Dict[str, torch.Tensor], returns: Dict[str, torch.Tensor], epsilon: float, + loss_masks: torch.Tensor, ) -> torch.Tensor: """ - Creates training-specific Tensorflow ops for PPO models. - :param returns: - :param old_values: - :param values: + Evaluates value loss for PPO. + :param values: Value output of the current network. + :param old_values: Value stored with experiences in buffer. + :param returns: Computed returns. + :param epsilon: Clipping value for value estimate. + :param loss_mask: Mask for losses. Used with LSTM to ignore 0'ed out experiences. 
""" value_losses = [] for name, head in values.items(): @@ -77,18 +80,25 @@ def ppo_value_loss( ) v_opt_a = (returns_tensor - head) ** 2 v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 - value_loss = torch.mean(torch.max(v_opt_a, v_opt_b)) + masked_loss = torch.max(v_opt_a, v_opt_b) * loss_masks + value_loss = torch.mean(masked_loss) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) return value_loss - def ppo_policy_loss(self, advantages, log_probs, old_log_probs, masks): + def ppo_policy_loss( + self, + advantages: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + loss_masks: torch.Tensor, + ) -> torch.Tensor: """ - Creates training-specific Tensorflow ops for PPO models. - :param masks: - :param advantages: + Evaluate PPO policy loss. + :param advantages: Computed advantages. :param log_probs: Current policy probabilities :param old_log_probs: Past policy probabilities + :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences. """ advantage = advantages.unsqueeze(-1) @@ -99,7 +109,8 @@ def ppo_policy_loss(self, advantages, log_probs, old_log_probs, masks): p_opt_b = ( torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage ) - policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b)) + masked_loss = torch.min(p_opt_a, p_opt_b) * loss_masks + policy_loss = -torch.mean(masked_loss) return policy_loss @timed @@ -153,14 +164,21 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=memories, seq_len=self.policy.sequence_length, ) - value_loss = self.ppo_value_loss(values, old_values, returns, decay_eps) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) + value_loss = self.ppo_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) policy_loss = self.ppo_policy_loss( ModelUtils.list_to_tensor(batch["advantages"]), log_probs, ModelUtils.list_to_tensor(batch["action_probs"]), - ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32), + loss_masks, + ) + loss = ( + policy_loss + + 0.5 * value_loss + - decay_bet * torch.mean(entropy * loss_masks) ) - loss = policy_loss + 0.5 * value_loss - decay_bet * torch.mean(entropy) # Set optimizer learning rate ModelUtils.update_learning_rate(self.optimizer, decay_lr) From 0a3c795cee4197003d5c7f5e96b03925e14f9896 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 19:46:34 -0700 Subject: [PATCH 06/18] Proper shape of masks --- ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 13fdc12e01..1232429f36 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -109,7 +109,7 @@ def ppo_policy_loss( p_opt_b = ( torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage ) - masked_loss = torch.min(p_opt_a, p_opt_b) * loss_masks + masked_loss = torch.min(p_opt_a, p_opt_b).flatten() * loss_masks policy_loss = -torch.mean(masked_loss) return policy_loss @@ -164,7 +164,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=memories, seq_len=self.policy.sequence_length, ) - loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.float32) value_loss = self.ppo_value_loss( values, old_values, 
returns, decay_eps, loss_masks ) @@ -177,7 +177,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: loss = ( policy_loss + 0.5 * value_loss - - decay_bet * torch.mean(entropy * loss_masks) + - decay_bet * torch.mean(entropy.flatten() * loss_masks) ) # Set optimizer learning rate From 2337d15c09e2cdd91ec5d0c08eb63a2ba5d5e297 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 20:04:48 -0700 Subject: [PATCH 07/18] Proper mask mean for PPO --- ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 14 +++++++------- ml-agents/mlagents/trainers/torch/utils.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 1232429f36..b330bce0fb 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -80,8 +80,7 @@ def ppo_value_loss( ) v_opt_a = (returns_tensor - head) ** 2 v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 - masked_loss = torch.max(v_opt_a, v_opt_b) * loss_masks - value_loss = torch.mean(masked_loss) + value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) return value_loss @@ -109,8 +108,9 @@ def ppo_policy_loss( p_opt_b = ( torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage ) - masked_loss = torch.min(p_opt_a, p_opt_b).flatten() * loss_masks - policy_loss = -torch.mean(masked_loss) + policy_loss = -1 * ModelUtils.masked_mean( + torch.min(p_opt_a, p_opt_b).flatten(), loss_masks + ) return policy_loss @timed @@ -138,7 +138,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: if self.policy.use_continuous_act: actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) else: - actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) + actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.bool) memories = [ ModelUtils.list_to_tensor(batch["memory"][i]) @@ -164,7 +164,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=memories, seq_len=self.policy.sequence_length, ) - loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.float32) + loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) value_loss = self.ppo_value_loss( values, old_values, returns, decay_eps, loss_masks ) @@ -177,7 +177,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: loss = ( policy_loss + 0.5 * value_loss - - decay_bet * torch.mean(entropy.flatten() * loss_masks) + - decay_bet * ModelUtils.masked_mean(entropy.flatten(), loss_masks) ) # Set optimizer learning rate diff --git a/ml-agents/mlagents/trainers/torch/utils.py b/ml-agents/mlagents/trainers/torch/utils.py index baa99c98ef..ba6b7d57a0 100644 --- a/ml-agents/mlagents/trainers/torch/utils.py +++ b/ml-agents/mlagents/trainers/torch/utils.py @@ -284,3 +284,13 @@ def get_probs_and_entropy( else: all_probs = torch.cat(all_probs_list, dim=-1) return log_probs, entropies, all_probs + + @staticmethod + def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: + """ + Returns the mean of the tensor but ignoring the values specified by masks. + Used for masking out loss functions. + :param tensor: Tensor which needs mean computation. + :param masks: Boolean tensor of masks with same dimension as tensor. 
+ """ + return (tensor * masks).sum() / masks.float().sum() From 1f69102f45dd5bfd7a0c8e63fdee701af7a25632 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 11 Aug 2020 20:17:47 -0700 Subject: [PATCH 08/18] Fix dtype for actions --- ml-agents/mlagents/trainers/ppo/optimizer_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index b330bce0fb..e162166481 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -138,7 +138,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: if self.policy.use_continuous_act: actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1) else: - actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.bool) + actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long) memories = [ ModelUtils.list_to_tensor(batch["memory"][i]) From c0a77f76b6f9b1c6a3187b9c56e49a7228605e7b Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 14:13:17 -0700 Subject: [PATCH 09/18] Proper initialization and SAC masking --- .../mlagents/trainers/sac/optimizer_torch.py | 27 +++++++++---------- ml-agents/mlagents/trainers/torch/layers.py | 24 +++++++++++++++++ ml-agents/mlagents/trainers/torch/networks.py | 13 +++++---- 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 3ba92e6c1b..6d110f99f5 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -194,11 +194,11 @@ def sac_q_loss( * self.gammas[i] * target_values[name] ) - _q1_loss = 0.5 * torch.mean( - loss_masks * torch.nn.functional.mse_loss(q_backup, q1_stream) + _q1_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q1_stream), loss_masks ) - _q2_loss = 0.5 * torch.mean( - loss_masks * torch.nn.functional.mse_loss(q_backup, q2_stream) + _q2_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q2_stream), loss_masks ) q1_losses.append(_q1_loss) @@ -258,8 +258,8 @@ def sac_value_loss( v_backup = min_policy_qs[name] - torch.sum( _ent_coef * log_probs, dim=1 ) - value_loss = 0.5 * torch.mean( - loss_masks * torch.nn.functional.mse_loss(values[name], v_backup) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup), loss_masks ) value_losses.append(value_loss) else: @@ -278,9 +278,9 @@ def sac_value_loss( v_backup = min_policy_qs[name] - torch.mean( branched_ent_bonus, axis=0 ) - value_loss = 0.5 * torch.mean( - loss_masks - * torch.nn.functional.mse_loss(values[name], v_backup.squeeze()) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), + loss_masks, ) value_losses.append(value_loss) value_loss = torch.mean(torch.stack(value_losses)) @@ -300,7 +300,7 @@ def sac_policy_loss( if not discrete: mean_q1 = mean_q1.unsqueeze(1) batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1) - policy_loss = torch.mean(loss_masks * batch_policy_loss) + policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks) else: action_probs = log_probs.exp() branched_per_action_ent = ModelUtils.break_into_branches( @@ -347,9 +347,8 @@ def sac_entropy_loss( target_current_diff = torch.squeeze( target_current_diff_branched, axis=2 ) - 
entropy_loss = -torch.mean( - loss_masks - * torch.mean(self._log_ent_coef * target_current_diff, axis=1) + entropy_loss = -1 * ModelUtils.masked_mean( + torch.mean(self._log_ent_coef * target_current_diff, axis=1), loss_masks ) return entropy_loss @@ -497,7 +496,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories=next_memories, sequence_length=self.policy.sequence_length, ) - masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.int32) + masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool) use_discrete = not self.policy.use_continuous_act dones = ModelUtils.list_to_tensor(batch["done"]) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index 8dbb1cbcb4..d1c68887df 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -46,3 +46,27 @@ def linear_layer( layer.weight.data *= kernel_gain _init_methods[bias_init](layer.bias.data) return layer + + +def lstm_layer( + input_size: int, + hidden_size: int, + num_layers: int = 1, + batch_first: bool = True, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, +) -> torch.nn.Module: + """ + Creates a torch.nn.LSTM and initializes its weights and biases. Provides a + forget_bias offset like is done in TensorFlow. + """ + lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first) + # Add forget_bias to forget gate bias + for name, param in lstm.named_parameters(): + if "weight" in name: + _init_methods[kernel_init](param.data) + elif "bias" in name: + _init_methods[bias_init](param.data) + param.data[hidden_size : 2 * hidden_size].add_(forget_bias) + return lstm diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index aff61ce0f6..b60cfbe543 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -14,6 +14,7 @@ from mlagents.trainers.settings import NetworkSettings from mlagents.trainers.torch.utils import ModelUtils from mlagents.trainers.torch.decoders import ValueHeads +from mlagents.trainers.torch.layers import lstm_layer ActivationFunction = Callable[[torch.Tensor], torch.Tensor] EncoderFunction = Callable[ @@ -50,7 +51,7 @@ def __init__( ) if self.use_lstm: - self.lstm = nn.LSTM(self.h_size, self.m_size // 2, 1) + self.lstm = lstm_layer(self.h_size, self.m_size // 2, batch_first=True) else: self.lstm = None @@ -101,13 +102,11 @@ def forward( raise Exception("No valid inputs to network.") if self.use_lstm: - encoding = encoding.view([sequence_length, -1, self.h_size]) + # Resize to (batch, sequence length, encoding size) + encoding = encoding.reshape([-1, sequence_length, self.h_size]) memories = torch.split(memories, self.m_size // 2, dim=-1) - encoding, memories = self.lstm( - encoding.contiguous(), - (memories[0].contiguous(), memories[1].contiguous()), - ) - encoding = encoding.view([-1, self.m_size // 2]) + encoding, memories = self.lstm(encoding, (memories[0], memories[1])) + encoding = encoding.reshape([-1, self.m_size // 2]) memories = torch.cat(memories, dim=-1) return encoding, memories From c7ea525591ce6a5469d2a3676074399fee9ac9f3 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 18:36:23 -0700 Subject: [PATCH 10/18] Redundant indexing --- ml-agents/mlagents/trainers/torch/networks.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index b60cfbe543..0d76a85107 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -105,7 +105,7 @@ def forward( # Resize to (batch, sequence length, encoding size) encoding = encoding.reshape([-1, sequence_length, self.h_size]) memories = torch.split(memories, self.m_size // 2, dim=-1) - encoding, memories = self.lstm(encoding, (memories[0], memories[1])) + encoding, memories = self.lstm(encoding, memories) encoding = encoding.reshape([-1, self.m_size // 2]) memories = torch.cat(memories, dim=-1) return encoding, memories From d8844301a218c1af432bc5b65488adb1c4f3ca9a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 18:41:31 -0700 Subject: [PATCH 11/18] Add test for lstm layer --- .../trainers/tests/torch/test_layers.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_layers.py b/ml-agents/mlagents/trainers/tests/torch/test_layers.py index 499d0de285..6d1132aa2e 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_layers.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_layers.py @@ -1,6 +1,11 @@ import torch -from mlagents.trainers.torch.layers import Swish, linear_layer, Initialization +from mlagents.trainers.torch.layers import ( + Swish, + linear_layer, + lstm_layer, + Initialization, +) def test_swish(): @@ -18,3 +23,18 @@ def test_initialization_layer(): ) assert torch.all(torch.eq(layer.weight.data, torch.zeros_like(layer.weight.data))) assert torch.all(torch.eq(layer.bias.data, torch.zeros_like(layer.bias.data))) + + +def test_lstm_layer(): + torch.manual_seed(0) + # Test zero for LSTM + layer = lstm_layer( + 4, 4, kernel_init=Initialization.Zero, bias_init=Initialization.Zero + ) + for name, param in layer.named_parameters(): + if "weight" in name: + assert torch.all(torch.eq(param.data, torch.zeros_like(param.data))) + elif "bias" in name: + assert torch.all( + torch.eq(param.data[4:8], torch.ones_like(param.data[4:8])) + ) From 5ef36484cf13cb2c0572994d4bf51704c3b097c8 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 19:38:05 -0700 Subject: [PATCH 12/18] Fix per-block lstm initialization --- ml-agents/mlagents/trainers/torch/layers.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/layers.py b/ml-agents/mlagents/trainers/torch/layers.py index d1c68887df..707d4748a5 100644 --- a/ml-agents/mlagents/trainers/torch/layers.py +++ b/ml-agents/mlagents/trainers/torch/layers.py @@ -64,9 +64,21 @@ def lstm_layer( lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first) # Add forget_bias to forget gate bias for name, param in lstm.named_parameters(): + # Each weight and bias is a concatenation of 4 matrices if "weight" in name: - _init_methods[kernel_init](param.data) - elif "bias" in name: - _init_methods[bias_init](param.data) - param.data[hidden_size : 2 * hidden_size].add_(forget_bias) + for idx in range(4): + block_size = param.shape[0] // 4 + _init_methods[kernel_init]( + param.data[idx * block_size : (idx + 1) * block_size] + ) + if "bias" in name: + for idx in range(4): + block_size = param.shape[0] // 4 + _init_methods[bias_init]( + param.data[idx * block_size : (idx + 1) * block_size] + ) + if idx == 1: + param.data[idx * block_size : (idx + 1) * block_size].add_( + forget_bias + ) 
return lstm From 767f399319a40dcfdff60d6a362ba2d62a505698 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 12 Aug 2020 23:55:18 -0700 Subject: [PATCH 13/18] Add memories and sequence length to critic_pass --- .../trainers/optimizer/torch_optimizer.py | 42 +++---------------- ml-agents/mlagents/trainers/torch/networks.py | 25 +++++++---- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index 7b47173db8..422295d811 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -1,7 +1,6 @@ from typing import Dict, Optional, Tuple, List import torch import numpy as np -from mlagents_envs.base_env import DecisionSteps from mlagents.trainers.buffer import AgentBuffer from mlagents.trainers.components.bc.module import BCModule @@ -10,7 +9,6 @@ from mlagents.trainers.policy.torch_policy import TorchPolicy from mlagents.trainers.optimizer import Optimizer from mlagents.trainers.settings import TrainerSettings -from mlagents.trainers.trajectory import SplitObservations from mlagents.trainers.torch.utils import ModelUtils @@ -42,35 +40,6 @@ def create_reward_signals(self, reward_signal_configs): reward_signal, self.policy.behavior_spec, settings ) - def get_value_estimates( - self, decision_requests: DecisionSteps, idx: int, done: bool - ) -> Dict[str, float]: - """ - Generates value estimates for bootstrapping. - :param decision_requests: - :param idx: Index in BrainInfo of agent. - :param done: Whether or not this is the last element of the episode, - in which case the value estimate will be 0. - :return: The value estimate dictionary with key being the name of the reward signal - and the value the corresponding value estimate. - """ - vec_vis_obs = SplitObservations.from_observations(decision_requests.obs) - - value_estimates = self.policy.actor_critic.critic_pass( - np.expand_dims(vec_vis_obs.vector_observations[idx], 0), - np.expand_dims(vec_vis_obs.visual_observations[idx], 0), - ) - - value_estimates = {k: float(v) for k, v in value_estimates.items()} - - # If we're done, reassign all of the value estimates that need terminal states. 
- if done: - for k in value_estimates: - if not self.reward_signals[k].ignore_done: - value_estimates[k] = 0.0 - - return value_estimates - def get_trajectory_value_estimates( self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]: @@ -85,18 +54,17 @@ def get_trajectory_value_estimates( else: visual_obs = [] - memory = torch.zeros([1, len(vector_obs[0]), self.policy.m_size]) + memory = torch.zeros([1, 1, self.policy.m_size]) next_obs = np.concatenate(next_obs, axis=-1) next_obs = [ModelUtils.list_to_tensor(next_obs).unsqueeze(0)] - next_memory = torch.zeros([1, 1, self.policy.m_size]) - value_estimates = self.policy.actor_critic.critic_pass( - vector_obs, visual_obs, memory + value_estimates, next_memory = self.policy.actor_critic.critic_pass( + vector_obs, visual_obs, memory, sequence_length=batch.num_experiences ) - next_value_estimate = self.policy.actor_critic.critic_pass( - next_obs, next_obs, next_memory + next_value_estimate, _ = self.policy.actor_critic.critic_pass( + next_obs, next_obs, next_memory, sequence_length=1 ) for name, estimate in value_estimates.items(): diff --git a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 0d76a85107..0e206ab4a6 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -209,7 +209,8 @@ def critic_pass( vec_inputs: List[torch.Tensor], vis_inputs: List[torch.Tensor], memories: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: """ Get value outputs for the given obs. :param vec_inputs: List of vector inputs as tensors. @@ -359,9 +360,12 @@ def critic_pass( vec_inputs: List[torch.Tensor], vis_inputs: List[torch.Tensor], memories: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: - encoding, _ = self.network_body(vec_inputs, vis_inputs, memories=memories) - return self.value_heads(encoding) + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + encoding, memories_out = self.network_body( + vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length + ) + return self.value_heads(encoding), memories_out def get_dist_and_value( self, @@ -426,16 +430,19 @@ def critic_pass( vec_inputs: List[torch.Tensor], vis_inputs: List[torch.Tensor], memories: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: if self.use_lstm: # Use only the back half of memories for critic - _, critic_mem = torch.split(memories, self.half_mem_size, -1) + actor_mem, critic_mem = torch.split(memories, self.half_mem_size, -1) else: critic_mem = None - value_outputs, _memories = self.critic( - vec_inputs, vis_inputs, memories=critic_mem + value_outputs, critic_mem_out = self.critic( + vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) - return value_outputs + # Make memories with the actor mem unchanged + memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1) + return value_outputs, memories_out def get_dist_and_value( self, From c104409413d6eb3ff4fff6b895d08a193dc4a8c6 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 13 Aug 2020 10:03:37 -0700 Subject: [PATCH 14/18] Fix memory logic in SeparateActorCritic --- ml-agents/mlagents/trainers/torch/networks.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/ml-agents/mlagents/trainers/torch/networks.py b/ml-agents/mlagents/trainers/torch/networks.py index 0e206ab4a6..bfe9c0ade3 100644 --- a/ml-agents/mlagents/trainers/torch/networks.py +++ b/ml-agents/mlagents/trainers/torch/networks.py @@ -432,16 +432,18 @@ def critic_pass( memories: Optional[torch.Tensor] = None, sequence_length: int = 1, ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + actor_mem, critic_mem = None, None if self.use_lstm: # Use only the back half of memories for critic actor_mem, critic_mem = torch.split(memories, self.half_mem_size, -1) - else: - critic_mem = None value_outputs, critic_mem_out = self.critic( vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length ) - # Make memories with the actor mem unchanged - memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1) + if actor_mem is not None: + # Make memories with the actor mem unchanged + memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1) + else: + memories_out = None return value_outputs, memories_out def get_dist_and_value( From ca77ea5564ac8ba1971e9d559fe0819f239bfcea Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 13 Aug 2020 11:19:25 -0700 Subject: [PATCH 15/18] Fix LSTM tests --- .../mlagents/trainers/tests/torch/test_networks.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_networks.py b/ml-agents/mlagents/trainers/tests/torch/test_networks.py index ba954f8a97..06f8b1ab25 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_networks.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_networks.py @@ -45,16 +45,16 @@ def test_networkbody_lstm(): obs_size = 4 seq_len = 16 network_settings = NetworkSettings( - memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=4) + memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12) ) obs_shapes = [(obs_size,)] networkbody = NetworkBody(obs_shapes, network_settings) - optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3) + optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4) sample_obs = torch.ones((1, seq_len, obs_size)) - for _ in range(100): - encoded, _ = networkbody([sample_obs], [], memories=torch.ones(1, seq_len, 4)) + for _ in range(200): + encoded, _ = networkbody([sample_obs], [], memories=torch.ones(1, seq_len, 12)) # Try to force output to 1 loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape)) optimizer.zero_grad() @@ -196,10 +196,11 @@ def test_actor_critic(ac_type, lstm): # memories isn't always set to None, the network should be able to # deal with that. 
# Test critic pass - value_out = actor.critic_pass([sample_obs], [], memories=memories) + value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories) for stream in stream_names: if lstm: assert value_out[stream].shape == (network_settings.memory.sequence_length,) + assert memories_out.shape == memories.shape else: assert value_out[stream].shape == (1,) From fd1a4ff1227dd6534116c1cc88e2f20c7554bd0a Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 13 Aug 2020 16:04:44 -0700 Subject: [PATCH 16/18] Fix next_obs in get_trajectory_value_estimates --- .../mlagents/trainers/optimizer/torch_optimizer.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index 422295d811..8261b363d7 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -3,6 +3,7 @@ import numpy as np from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.trajectory import SplitObservations from mlagents.trainers.components.bc.module import BCModule from mlagents.trainers.torch.components.reward_providers import create_reward_provider @@ -56,15 +57,21 @@ def get_trajectory_value_estimates( memory = torch.zeros([1, 1, self.policy.m_size]) - next_obs = np.concatenate(next_obs, axis=-1) - next_obs = [ModelUtils.list_to_tensor(next_obs).unsqueeze(0)] + vec_vis_obs = SplitObservations.from_observations(next_obs) + next_vec_obs = [ + ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0) + ] + next_vis_obs = [ + ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0) + for _vis_ob in vec_vis_obs.visual_observations + ] value_estimates, next_memory = self.policy.actor_critic.critic_pass( vector_obs, visual_obs, memory, sequence_length=batch.num_experiences ) next_value_estimate, _ = self.policy.actor_critic.critic_pass( - next_obs, next_obs, next_memory, sequence_length=1 + next_vec_obs, next_vis_obs, next_memory, sequence_length=1 ) for name, estimate in value_estimates.items(): From 545dd9cf010dcef66925c89bc552d5686af009f0 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 13 Aug 2020 16:12:22 -0700 Subject: [PATCH 17/18] Fix and test for masked_mean --- .../mlagents/trainers/tests/torch/test_utils.py | 16 ++++++++++++++++ ml-agents/mlagents/trainers/torch/utils.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_utils.py b/ml-agents/mlagents/trainers/tests/torch/test_utils.py index 70306a89f3..0275581d08 100644 --- a/ml-agents/mlagents/trainers/tests/torch/test_utils.py +++ b/ml-agents/mlagents/trainers/tests/torch/test_utils.py @@ -198,3 +198,19 @@ def test_get_probs_and_entropy(): assert entropies.shape == (1, len(dist_list)) # Make sure the first action has high probability than the others. 
assert log_probs.flatten()[0] > log_probs.flatten()[1] + + +def test_masked_mean(): + test_input = torch.tensor([1, 2, 3, 4, 5]) + masks = torch.ones_like(test_input).bool() + mean = ModelUtils.masked_mean(test_input, masks=masks) + assert mean == 3.0 + + masks = torch.tensor([False, False, True, True, True]) + mean = ModelUtils.masked_mean(test_input, masks=masks) + assert mean == 4.0 + + # Make sure it works if all masks are off + masks = torch.tensor([False, False, False, False, False]) + mean = ModelUtils.masked_mean(test_input, masks=masks) + assert mean == 0.0 diff --git a/ml-agents/mlagents/trainers/torch/utils.py b/ml-agents/mlagents/trainers/torch/utils.py index ba6b7d57a0..0e855ea79b 100644 --- a/ml-agents/mlagents/trainers/torch/utils.py +++ b/ml-agents/mlagents/trainers/torch/utils.py @@ -293,4 +293,4 @@ def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: :param tensor: Tensor which needs mean computation. :param masks: Boolean tensor of masks with same dimension as tensor. """ - return (tensor * masks).sum() / masks.float().sum() + return (tensor * masks).sum() / torch.clamp(masks.float().sum(), min=1.0) From 790ce905cd25f4765bfff390853ff6e2244fca63 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Thu, 13 Aug 2020 16:17:50 -0700 Subject: [PATCH 18/18] Use zeros_like --- ml-agents/mlagents/trainers/sac/optimizer_torch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index 6d110f99f5..2a26d715f5 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -413,9 +413,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: memories = None next_memories = None # Q network memories are 0'ed out, since we don't have them during inference. - q_memories = torch.zeros( - (memories.shape[0], memories.shape[1], memories.shape[2] // 2) - ) + q_memories = torch.zeros_like(next_memories) vis_obs: List[torch.Tensor] = [] next_vis_obs: List[torch.Tensor] = []
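
A note on the two ideas this series leans on, kept outside the patches themselves. The recurring change across PATCH 05-17 is replacing plain torch.mean losses with ModelUtils.masked_mean so that zero-padded timesteps at the end of an LSTM sequence do not dilute the loss. Below is a minimal sketch of that behaviour, assuming only torch and restating the helper exactly as it reads after PATCH 17; the example tensors are illustrative and not taken from the trainers.

import torch

def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    # Same body as ModelUtils.masked_mean after PATCH 17: masked-out entries are
    # dropped from both numerator and denominator, and the clamp prevents a
    # divide-by-zero when an entire sequence is padding.
    return (tensor * masks).sum() / torch.clamp(masks.float().sum(), min=1.0)

# Per-timestep losses for one padded sequence; the last two steps are padding.
per_step_loss = torch.tensor([1.0, 2.0, 3.0, 0.0, 0.0])
masks = torch.tensor([True, True, True, False, False])

print(masked_mean(per_step_loss, masks))  # tensor(2.) - padding ignored
print(per_step_loss.mean())               # tensor(1.2000) - a plain mean is dragged down by the padding
print(masked_mean(per_step_loss, torch.zeros(5, dtype=torch.bool)))  # tensor(0.) - all-padding edge case

Similarly, the block indexing in PATCH 12's lstm_layer relies on PyTorch's documented LSTM parameter layout: each weight and bias tensor is a concatenation of the (input, forget, cell, output) gate blocks, so block index 1 (elements [hidden_size : 2 * hidden_size]) is the forget gate that receives the +1.0 forget_bias, which is what test_lstm_layer's param.data[4:8] assertion checks for hidden_size=4.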