178 changes: 178 additions & 0 deletions ml-agents-envs/mlagents_envs/gym_to_unity_wrapper.py
@@ -0,0 +1,178 @@
from mlagents_envs.base_env import (
BaseEnv,
DecisionSteps,
TerminalSteps,
BehaviorSpec,
BehaviorName,
AgentId,
ActionType,
BehaviorMapping,
)
from mlagents_envs.exception import UnityActionException, UnityObservationException

from typing import Tuple, Union, Optional

import numpy as np

import gym


class GymToUnityWrapper(BaseEnv):
_DEFAULT_BEHAVIOR_NAME = "gym_behavior_name"
_AGENT_ID = 1

def __init__(self, gym_env: gym.Env, name: Optional[str] = None):
"""
Wrapper construction. Creates an implementation of a Unity BaseEnv from a gym
environment.
:gym.Env gym_env: The gym environment that will be wrapped.
:str name: [Optional] The name of the gym environment. This will become the
name of the behavior for the BaseEnv.
"""
self._gym_env = gym_env
self._first_message = True
if name is None:
self._behavior_name = self._DEFAULT_BEHAVIOR_NAME
else:
self._behavior_name = name
action_type = ActionType.CONTINUOUS
action_shape: Union[Tuple[int, ...], int] = 0
if isinstance(self._gym_env.action_space, gym.spaces.Box):
action_type = ActionType.CONTINUOUS
action_shape = int(np.prod(self._gym_env.action_space.shape))
self._act_ratio = np.maximum(
self._gym_env.action_space.high, -self._gym_env.action_space.low
)
self._act_ratio[self._act_ratio > 1e38] = 1
elif isinstance(self._gym_env.action_space, gym.spaces.Discrete):
action_shape = (self._gym_env.action_space.n,)
action_type = ActionType.DISCRETE
else:
raise UnityActionException(
f"Unknown action type {self._gym_env.action_space}"
)
if not isinstance(self._gym_env.observation_space, gym.spaces.Box):
raise UnityObservationException(
f"Unknown observation type {self._gym_env.observation_space}"
)
self._obs_ratio = np.maximum(
self._gym_env.observation_space.high, -self._gym_env.observation_space.low
)
# If the range is infinity, just don't normalize
self._obs_ratio[self._obs_ratio > 1e38] = 1
self._behavior_specs = BehaviorSpec(
observation_shapes=[self._gym_env.observation_space.shape],
action_type=action_type,
action_shape=action_shape,
)
self._g_action: Optional[Union[int, np.ndarray]] = None
self._current_steps: Tuple[DecisionSteps, TerminalSteps] = (
DecisionSteps.empty(self._behavior_specs),
TerminalSteps.empty(self._behavior_specs),
)

@property
def behavior_specs(self) -> BehaviorMapping:
return BehaviorMapping({self._behavior_name: self._behavior_specs})

def step(self) -> None:
if self._first_message:
self.reset()
return
obs, rew, done, info = self._gym_env.step(self._g_action)
if not done:
self._current_steps = (
DecisionSteps(
obs=[np.expand_dims(obs / self._obs_ratio, axis=0)],
reward=np.array([rew], dtype=np.float32),
agent_id=np.array([self._AGENT_ID], dtype=np.int32),
action_mask=None,
),
TerminalSteps.empty(self._behavior_specs),
)
else:
self._first_message = True
self._current_steps = (
DecisionSteps.empty(self._behavior_specs),
TerminalSteps(
obs=[np.expand_dims(obs / self._obs_ratio, axis=0)],
reward=np.array([rew], dtype=np.float32),
interrupted=np.array(
[info.get("TimeLimit.truncated", False)], dtype=bool
),
agent_id=np.array([self._AGENT_ID], dtype=np.int32),
),
)

def reset(self) -> None:
self._first_message = False
obs = self._gym_env.reset()
self._current_steps = (
DecisionSteps(
obs=[np.expand_dims(obs / self._obs_ratio, axis=0)],
reward=np.array([0], dtype=np.float32),
agent_id=np.array([self._AGENT_ID], dtype=np.int32),
action_mask=None,
),
TerminalSteps.empty(self._behavior_specs),
)

def close(self) -> None:
self._gym_env.close()

def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
assert behavior_name == self._behavior_name
spec = self._behavior_specs
expected_type = np.float32 if spec.is_action_continuous() else np.int32
n_agents = len(self._current_steps[0])
if n_agents == 0:
return
expected_shape = (n_agents, spec.action_size)
if action.shape != expected_shape:
raise UnityActionException(
"The behavior {0} needs an input of dimension {1} but received input of dimension {2}".format(
behavior_name, expected_shape, action.shape
)
)
if action.dtype != expected_type:
action = action.astype(expected_type)
if isinstance(self._gym_env.action_space, gym.spaces.Discrete):
self._g_action = int(action[0, 0])
elif isinstance(self._gym_env.action_space, gym.spaces.Box):
self._g_action = action[0] / self._act_ratio
else:
raise UnityActionException(
f"Unknown action type {self._gym_env.action_space}"
)

def set_action_for_agent(
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
) -> None:
assert behavior_name == self._behavior_name
assert agent_id == self._AGENT_ID
spec = self._behavior_specs
expected_shape = (spec.action_size,)
if action.shape != expected_shape:
raise UnityActionException(
f"The Agent {0} with BehaviorName {1} needs an input of dimension "
f"{2} but received input of dimension {3}".format(
agent_id, behavior_name, expected_shape, action.shape
)
)
expected_type = np.float32 if spec.is_action_continuous() else np.int32
if action.dtype != expected_type:
action = action.astype(expected_type)
if isinstance(self._gym_env.action_space, gym.spaces.Discrete):
self._g_action = int(action[0])
elif isinstance(self._gym_env.action_space, gym.spaces.Box):
self._g_action = action / self._act_ratio
else:
raise UnityActionException(
f"Unknown action type {self._gym_env.action_space}"
)

def get_steps(
self, behavior_name: BehaviorName
) -> Tuple[DecisionSteps, TerminalSteps]:
assert behavior_name == self._behavior_name
return self._current_steps
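For reference, a minimal driver loop for this wrapper might look like the sketch below. It only uses the BaseEnv surface defined above plus the behavior spec's create_empty_action helper (also exercised by the tests further down); the environment id and behavior name are illustrative, not part of this diff.

import gym

from mlagents_envs.gym_to_unity_wrapper import GymToUnityWrapper

# Wrap a gym environment and drive it through the BaseEnv interface.
env = GymToUnityWrapper(gym.make("CartPole-v1"), "cartpole")
behavior_name = list(env.behavior_specs.keys())[0]
spec = env.behavior_specs[behavior_name]
env.reset()
for _ in range(100):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # Single-agent wrapper: send a zero-filled action of the expected shape and dtype.
    env.set_actions(behavior_name, spec.create_empty_action(len(decision_steps)))
    env.step()  # the wrapper resets the underlying gym env after an episode ends
env.close()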
40 changes: 40 additions & 0 deletions ml-agents-envs/mlagents_envs/tests/test_gym_to_unity_wrapper.py
@@ -0,0 +1,40 @@
from mlagents_envs.gym_to_unity_wrapper import GymToUnityWrapper
from mlagents_envs.base_env import ActionType
import gym

import pytest


GYM_ENVS = ["CartPole-v1", "MountainCar-v0"]


@pytest.mark.parametrize("name", GYM_ENVS, ids=GYM_ENVS)
def test_creation(name):
env = GymToUnityWrapper(gym.make(name), name)
env.close()


@pytest.mark.parametrize("name", GYM_ENVS, ids=GYM_ENVS)
def test_specs(name):
gym_env = gym.make(name)
env = GymToUnityWrapper(gym_env, name)
assert list(env.behavior_specs.keys()) == [name]
if isinstance(gym_env.action_space, gym.spaces.Box):
assert env.behavior_specs[name].action_type == ActionType.CONTINUOUS
elif isinstance(gym_env.action_space, gym.spaces.Discrete):
assert env.behavior_specs[name].action_type == ActionType.DISCRETE
else:
raise NotImplementedError("Test for this action space type not implemented")
env.close()


@pytest.mark.parametrize("name", GYM_ENVS, ids=GYM_ENVS)
def test_steps(name):
env = GymToUnityWrapper(gym.make(name), name)
spec = env.behavior_specs[name]
env.reset()
for _ in range(200):
d_steps, t_steps = env.get_steps(name)
env.set_actions(name, spec.create_empty_action(len(d_steps)))
env.step()
env.close()
1 change: 1 addition & 0 deletions ml-agents-envs/setup.py
@@ -46,6 +46,7 @@ def run(self):
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
zip_safe=False,
install_requires=[
"gym",
"cloudpickle",
"grpcio>=1.11.0",
"numpy>=1.14.1,<2.0",
45 changes: 41 additions & 4 deletions ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -13,6 +13,7 @@
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
@@ -36,6 +37,10 @@
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous

import gym
from mlagents_envs.gym_to_unity_wrapper import GymToUnityWrapper
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig

BRAIN_NAME = "1D"


@@ -138,15 +143,18 @@ def _check_environment_trains(
train=True,
training_seed=seed,
)

# Begin training
tc.start_learning(env_manager)
env_manager.close()
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
if hasattr(env, "final_rewards"):
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
else:
processed_rewards = list(debug_writer.get_last_rewards().values())
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)

@@ -497,3 +505,32 @@ def test_gail_visual_sac(simple_record, use_discrete):
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.gym
@pytest.mark.parametrize(
"gym_name,target_return",
[
pytest.param("CartPole-v0", 150), # optimal 200
# pytest.param("MountainCar-v0", -199), # solved if more than -200
# pytest.param("MountainCarContinuous-v0", 0), # optimal 90
],
)
def test_sac_gym_training(gym_name, target_return, pytestconfig):
if "gym" not in pytestconfig.getoption(name="-m", skip=False):
pytest.skip(
"The gym tests were not requested; add the gym marker (-m gym) to run these tests"
)
env = GymToUnityWrapper(gym.make(gym_name), BRAIN_NAME)
hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, learning_rate=3e-4, buffer_size=1000
)
config = attr.evolve(SAC_CONFIG, hyperparameters=hyperparams, max_steps=50000)

def factory(worker_id, side_channels):
return GymToUnityWrapper(gym.make(gym_name), BRAIN_NAME)

manager = SubprocessEnvManager(factory, EngineConfig.default_config(), 30)
_check_environment_trains(
env, {BRAIN_NAME: config}, success_threshold=target_return, env_manager=manager
)
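As a usage note (assuming a standard pytest setup for this repo), the gym-marked test above only runs when the marker is requested explicitly, e.g. pytest -m gym ml-agents/mlagents/trainers/tests/test_simple_rl.py; without -m gym it skips itself via the marker check at the top of the test.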