diff --git a/ml-agents/mlagents/trainers/tests/test_environments/__init__.py b/ml-agents/mlagents/trainers/tests/test_environments/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tests/test_environments/test_simple.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
similarity index 81%
rename from ml-agents/mlagents/trainers/tests/test_environments/test_simple.py
rename to ml-agents/mlagents/trainers/tests/test_simple_rl.py
index b9ebe4db1a..cc3a5bed0e 100644
--- a/ml-agents/mlagents/trainers/tests/test_environments/test_simple.py
+++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -1,6 +1,8 @@
-import yaml
 import math
+import random
 import tempfile
+import pytest
+import yaml
 
 from typing import Any, Dict
 
@@ -31,21 +33,25 @@ class Simple1DEnvironment(BaseUnityEnvironment):
     it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
     """
 
-    def __init__(self):
+    def __init__(self, use_discrete):
+        super().__init__()
+        self.discrete = use_discrete
         self._brains: Dict[str, BrainParameters] = {}
         self._brains[BRAIN_NAME] = BrainParameters(
             brain_name=BRAIN_NAME,
             vector_observation_space_size=OBS_SIZE,
             num_stacked_vector_observations=1,
             camera_resolutions=[],
-            vector_action_space_size=[1],
+            vector_action_space_size=[2] if use_discrete else [1],
             vector_action_descriptions=["moveDirection"],
-            vector_action_space_type=1,  # "continuous"
+            vector_action_space_type=0 if use_discrete else 1,
         )
 
         # state
         self.position = 0.0
         self.step_count = 0
+        self.random = random.Random(str(self._brains))
+        self.goal = self.random.choice([-1, 1])
 
     def step(
         self,
@@ -56,21 +62,23 @@ def step(
     ) -> AllBrainInfo:
         assert vector_action is not None
 
-        delta = vector_action[BRAIN_NAME][0][0]
+        if self.discrete:
+            act = vector_action[BRAIN_NAME][0][0]
+            delta = 1 if act else -1
+        else:
+            delta = vector_action[BRAIN_NAME][0][0]
         delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
         self.position += delta
         self.position = clamp(self.position, -1, 1)
         self.step_count += 1
         done = self.position >= 1.0 or self.position <= -1.0
         if done:
-            reward = SUCCESS_REWARD * self.position
+            reward = SUCCESS_REWARD * self.position * self.goal
         else:
             reward = -TIME_PENALTY
 
         agent_info = AgentInfoProto(
-            stacked_vector_observation=[self.position] * OBS_SIZE,
-            reward=reward,
-            done=done,
+            stacked_vector_observation=[self.goal] * OBS_SIZE, reward=reward, done=done
        )
 
         if done:
@@ -85,6 +93,7 @@ def step(
     def _reset_agent(self):
         self.position = 0.0
         self.step_count = 0
+        self.goal = self.random.choice([-1, 1])
 
     def reset(
         self,
@@ -95,7 +104,7 @@ def reset(
         self._reset_agent()
 
         agent_info = AgentInfoProto(
-            stacked_vector_observation=[self.position] * OBS_SIZE,
+            stacked_vector_observation=[self.goal] * OBS_SIZE,
             done=False,
             max_step_reached=False,
         )
@@ -121,7 +130,7 @@ def close(self):
         pass
 
 
-def test_simple():
+def _check_environment_trains(env):
     config = """
     default:
         trainer: ppo
@@ -167,7 +176,6 @@ def test_simple():
     )
 
     # Begin training
-    env = Simple1DEnvironment()
     env_manager = SimpleEnvManager(env)
     trainer_config = yaml.safe_load(config)
     tc.start_learning(env_manager, trainer_config)
@@ -175,3 +183,9 @@ def test_simple():
     for brain_name, mean_reward in tc._get_measure_vals().items():
         assert not math.isnan(mean_reward)
         assert mean_reward > 0.99
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_rl(use_discrete):
+    env = Simple1DEnvironment(use_discrete=use_discrete)
+    _check_environment_trains(env)
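
As a quick orientation for the change above, here is a minimal, self-contained sketch of the step logic the parametrized test now exercises: the discrete branch maps an action index to a ±1 move, and the terminal reward is scaled by the goal sign so the agent has to read the goal out of its observation. The `step_1d`/`clamp` helpers and the constant values are stand-ins chosen for illustration; the real constants (`STEP_SIZE`, `SUCCESS_REWARD`, `TIME_PENALTY`) are defined in `test_simple_rl.py` but not shown in this diff.

```python
import random

# Stand-in values; the real constants live in test_simple_rl.py and are not
# shown in this diff.
STEP_SIZE = 0.1
SUCCESS_REWARD = 1.0
TIME_PENALTY = 0.01


def clamp(x, lo, hi):
    return max(lo, min(x, hi))


def step_1d(position, goal, raw_action, use_discrete):
    """One step of the toy 1-D environment: returns (new_position, reward, done)."""
    if use_discrete:
        # Discrete branch: action index 1 moves right, 0 moves left.
        delta = 1 if raw_action else -1
    else:
        # Continuous branch: the action value is used directly.
        delta = raw_action
    delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
    position = clamp(position + delta, -1, 1)
    done = position >= 1.0 or position <= -1.0
    if done:
        # Positive reward only when the agent ends up on the goal side.
        reward = SUCCESS_REWARD * position * goal
    else:
        reward = -TIME_PENALTY
    return position, reward, done


if __name__ == "__main__":
    rng = random.Random(0)
    goal = rng.choice([-1, 1])
    pos, done = 0.0, False
    while not done:
        # A "perfect" discrete policy: always step toward the goal.
        action = 1 if goal > 0 else 0
        pos, reward, done = step_1d(pos, goal, action, use_discrete=True)
    assert reward > 0
```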