From a1c442760b5ca169a17da38df6ed3dbdce6ff7e2 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Mon, 17 Aug 2020 16:34:27 -0700
Subject: [PATCH 1/4] Policy bugfixes and policy tests

---
 .../mlagents/trainers/policy/torch_policy.py |  20 +--
 .../trainers/tests/torch/test_policy.py      | 151 ++++++++++++++++++
 2 files changed, 162 insertions(+), 9 deletions(-)
 create mode 100644 ml-agents/mlagents/trainers/tests/torch/test_policy.py

diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py
index 5fa135f6a2..e191ea8054 100644
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple
 import numpy as np
 import torch
@@ -85,7 +85,9 @@ def __init__(
 
         self.actor_critic.to(TestingConfiguration.device)
 
-    def split_decision_step(self, decision_requests):
+    def _split_decision_step(
+        self, decision_requests: DecisionSteps
+    ) -> Tuple[SplitObservations, np.ndarray]:
         vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
         mask = None
         if not self.use_continuous_act:
@@ -94,7 +96,7 @@ def split_decision_step(self, decision_requests):
                 mask = torch.as_tensor(
                     1 - np.concatenate(decision_requests.action_mask, axis=1)
                 )
-        return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask
+        return vec_vis_obs, mask
 
     def update_normalization(self, vector_obs: np.ndarray) -> None:
         """
@@ -145,9 +147,7 @@ def evaluate_actions(
         dists, value_heads, _ = self.actor_critic.get_dist_and_value(
             vec_obs, vis_obs, masks, memories, seq_len
         )
-        if len(actions.shape) <= 2:
-            actions = actions.unsqueeze(-1)
-        action_list = [actions[..., i] for i in range(actions.shape[2])]
+        action_list = [actions[..., i] for i in range(actions.shape[-1])]
         log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
         return log_probs, entropies, value_heads
 
@@ -162,9 +162,11 @@ def evaluate(
         :param decision_requests: DecisionStep object containing inputs.
         :return: Outputs from network as defined by self.inference_dict.
""" - vec_obs, vis_obs, masks = self.split_decision_step(decision_requests) - vec_obs = [torch.as_tensor(vec_obs)] - vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vis_obs] + vec_vis_obs, masks = self._split_decision_step(decision_requests) + vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)] + vis_obs = [ + torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations + ] memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze( 0 ) diff --git a/ml-agents/mlagents/trainers/tests/torch/test_policy.py b/ml-agents/mlagents/trainers/tests/torch/test_policy.py new file mode 100644 index 0000000000..d875056627 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/torch/test_policy.py @@ -0,0 +1,151 @@ +import pytest +from typing import Tuple + +import torch +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.tests import mock_brain as mb +from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.torch.utils import ModelUtils + +VECTOR_ACTION_SPACE = 2 +VECTOR_OBS_SPACE = 8 +DISCRETE_ACTION_SPACE = [3, 3, 3, 2] +BUFFER_INIT_SAMPLES = 32 +NUM_AGENTS = 12 +EPSILON = 1e-7 + + +def create_policy_mock( + dummy_config: TrainerSettings, + use_rnn: bool = False, + use_discrete: bool = True, + use_visual: bool = False, + seed: int = 0, +) -> Tuple[TorchPolicy, TrainerSettings]: + mock_spec = mb.setup_test_behavior_specs( + use_discrete, + use_visual, + vector_action_space=DISCRETE_ACTION_SPACE + if use_discrete + else VECTOR_ACTION_SPACE, + vector_obs_space=VECTOR_OBS_SPACE, + ) + + trainer_settings = dummy_config + trainer_settings.keep_checkpoints = 3 + trainer_settings.network_settings.memory = ( + NetworkSettings.MemorySettings() if use_rnn else None + ) + policy = TorchPolicy(seed, mock_spec, trainer_settings) + return policy, trainer_settings + + +@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) +@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) +@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) +def test_policy_evaluate(rnn, visual, discrete): + # Test evaluate + policy, _ = create_policy_mock( + TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual + ) + decision_step, terminal_step = mb.create_steps_from_behavior_spec( + policy.behavior_spec, num_agents=NUM_AGENTS + ) + + run_out = policy.evaluate(decision_step, list(decision_step.agent_id)) + if discrete: + run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE)) + else: + assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE) + + +@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"]) +@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"]) +@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"]) +def test_evaluate_actions(rnn, visual, discrete): + policy, trainer_settings = create_policy_mock( + TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual + ) + buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size) + vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])] + act_masks = ModelUtils.list_to_tensor(buffer["action_mask"]) + if policy.use_continuous_act: + actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1) + else: + actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long) + vis_obs = [] + for idx, _ in 
+        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
+        vis_obs.append(vis_ob)
+
+    memories = [
+        ModelUtils.list_to_tensor(buffer["memory"][i])
+        for i in range(0, len(buffer["memory"]), policy.sequence_length)
+    ]
+    if len(memories) > 0:
+        memories = torch.stack(memories).unsqueeze(0)
+
+    log_probs, entropy, values = policy.evaluate_actions(
+        vec_obs,
+        vis_obs,
+        masks=act_masks,
+        actions=actions,
+        memories=memories,
+        seq_len=policy.sequence_length,
+    )
+    assert log_probs.shape == (64, policy.behavior_spec.action_size)
+    assert entropy.shape == (64, policy.behavior_spec.action_size)
+    for val in values.values():
+        assert val.shape == (64,)
+
+
+@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
+@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
+@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
+def test_sample_actions(rnn, visual, discrete):
+    policy, trainer_settings = create_policy_mock(
+        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
+    )
+    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
+    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
+    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
+
+    vis_obs = []
+    for idx, _ in enumerate(policy.actor_critic.network_body.visual_encoders):
+        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
+        vis_obs.append(vis_ob)
+
+    memories = [
+        ModelUtils.list_to_tensor(buffer["memory"][i])
+        for i in range(0, len(buffer["memory"]), policy.sequence_length)
+    ]
+    if len(memories) > 0:
+        memories = torch.stack(memories).unsqueeze(0)
+
+    (
+        sampled_actions,
+        log_probs,
+        entropies,
+        sampled_values,
+        memories,
+    ) = policy.sample_actions(
+        vec_obs,
+        vis_obs,
+        masks=act_masks,
+        memories=memories,
+        seq_len=policy.sequence_length,
+        all_log_probs=not policy.use_continuous_act,
+    )
+    if discrete:
+        assert log_probs.shape == (
+            64,
+            sum(policy.behavior_spec.discrete_action_branches),
+        )
+    else:
+        assert log_probs.shape == (64, policy.behavior_spec.action_shape)
+    assert entropies.shape == (64, policy.behavior_spec.action_size)
+    for val in sampled_values.values():
+        assert val.shape == (64,)
+
+    if rnn:
+        assert memories.shape == (1, 1, policy.m_size)

From dfea837b082e3f7a6645b037fe796dda5b60b4ca Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Mon, 17 Aug 2020 16:37:29 -0700
Subject: [PATCH 2/4] Typing for torch policy

---
 .../mlagents/trainers/policy/torch_policy.py | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py
index e191ea8054..8b17a98221 100644
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Optional
 import numpy as np
 import torch
@@ -110,13 +110,15 @@ def update_normalization(self, vector_obs: np.ndarray) -> None:
     @timed
     def sample_actions(
         self,
-        vec_obs,
-        vis_obs,
-        masks=None,
-        memories=None,
-        seq_len=1,
-        all_log_probs=False,
-    ):
+        vec_obs: List[torch.Tensor],
+        vis_obs: List[torch.Tensor],
+        masks: Optional[torch.Tensor] = None,
+        memories: Optional[torch.Tensor] = None,
+        seq_len: int = 1,
+        all_log_probs: bool = False,
+    ) -> Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
+    ]:
         """
         :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
         """
@@ -142,8 +144,14 @@ def sample_actions(
         )
 
     def evaluate_actions(
-        self, vec_obs, vis_obs, actions, masks=None, memories=None, seq_len=1
-    ):
+        self,
+        vec_obs: torch.Tensor,
+        vis_obs: torch.Tensor,
+        actions: torch.Tensor,
+        masks: Optional[torch.Tensor] = None,
+        memories: Optional[torch.Tensor] = None,
+        seq_len: int = 1,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
         dists, value_heads, _ = self.actor_critic.get_dist_and_value(
             vec_obs, vis_obs, masks, memories, seq_len
         )

From f954a73b9ec5893aee52d84e645f38bd580ad231 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Mon, 17 Aug 2020 16:43:13 -0700
Subject: [PATCH 3/4] Remove some unneeded stuff

---
 ml-agents/mlagents/trainers/tests/torch/test_policy.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ml-agents/mlagents/trainers/tests/torch/test_policy.py b/ml-agents/mlagents/trainers/tests/torch/test_policy.py
index d875056627..a849a948a4 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_policy.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_policy.py
@@ -37,7 +37,7 @@ def create_policy_mock(
         NetworkSettings.MemorySettings() if use_rnn else None
     )
     policy = TorchPolicy(seed, mock_spec, trainer_settings)
-    return policy, trainer_settings
+    return policy
 
 
 @pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@@ -45,7 +45,7 @@ def create_policy_mock(
 @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
 def test_policy_evaluate(rnn, visual, discrete):
     # Test evaluate
-    policy, _ = create_policy_mock(
+    policy = create_policy_mock(
         TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
     )
     decision_step, terminal_step = mb.create_steps_from_behavior_spec(
@@ -63,7 +63,7 @@ def test_policy_evaluate(rnn, visual, discrete):
 @pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
 @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
 def test_evaluate_actions(rnn, visual, discrete):
-    policy, trainer_settings = create_policy_mock(
+    policy = create_policy_mock(
         TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
     )
     buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
@@ -103,7 +103,7 @@ def test_evaluate_actions(rnn, visual, discrete):
 @pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
 @pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
 def test_sample_actions(rnn, visual, discrete):
-    policy, trainer_settings = create_policy_mock(
+    policy = create_policy_mock(
         TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
     )
     buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)

From 01219e62bc3e55147d07fd14f537b162b1db1d2d Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Mon, 17 Aug 2020 18:02:16 -0700
Subject: [PATCH 4/4] Fix test typing

---
 ml-agents/mlagents/trainers/tests/torch/test_policy.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ml-agents/mlagents/trainers/tests/torch/test_policy.py b/ml-agents/mlagents/trainers/tests/torch/test_policy.py
index a849a948a4..208a208b77 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_policy.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_policy.py
@@ -1,5 +1,4 @@
 import pytest
-from typing import Tuple
 
 import torch
 from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -21,7 +20,7 @@ def create_policy_mock(
     use_discrete: bool = True,
     use_visual: bool = False,
     seed: int = 0,
-) -> Tuple[TorchPolicy, TrainerSettings]:
+) -> TorchPolicy:
     mock_spec = mb.setup_test_behavior_specs(
         use_discrete,
         use_visual,