178 changes: 178 additions & 0 deletions ml-agents-envs/mlagents_envs/gym_to_unity_wrapper.py
@@ -0,0 +1,178 @@
from mlagents_envs.base_env import (
BaseEnv,
DecisionSteps,
TerminalSteps,
BehaviorSpec,
BehaviorName,
AgentId,
ActionType,
BehaviorMapping,
)
from mlagents_envs.exception import UnityActionException, UnityObservationException

from typing import Tuple, Union, Optional

import numpy as np

import gym


class GymToUnityWrapper(BaseEnv):
_DEFAULT_BEHAVIOR_NAME = "gym_behavior_name"
_AGENT_ID = 1

def __init__(self, gym_env: gym.Env, name: Optional[str] = None):
"""
Wrapper construction. Creates an implementation of a Unity BaseEnv from a gym
environment.
:gym.Env gym_env: The gym environment that will be wrapped.
:str name: [Optional] The name of the gym environment. This will become the
name of the behavior for the BaseEnv.
"""
self._gym_env = gym_env
self._first_message = True
if name is None:
self._behavior_name = self._DEFAULT_BEHAVIOR_NAME
else:
self._behavior_name = name
action_type = ActionType.CONTINUOUS
action_shape: Union[Tuple[int, ...], int] = 0
if isinstance(self._gym_env.action_space, gym.spaces.Box):
action_type = ActionType.CONTINUOUS
action_shape = int(np.prod(self._gym_env.action_space.shape))
self._act_ratio = np.maximum(
self._gym_env.action_space.high, -self._gym_env.action_space.low
)
self._act_ratio[self._act_ratio > 1e38] = 1
elif isinstance(self._gym_env.action_space, gym.spaces.Discrete):
action_shape = (self._gym_env.action_space.n,)
action_type = ActionType.DISCRETE
else:
raise UnityActionException(
f"Unknown action type {self._gym_env.action_space}"
)
if not isinstance(self._gym_env.observation_space, gym.spaces.Box):
raise UnityObservationException(
f"Unknown observation type {self._gym_env.observation_space}"
)
self._obs_ratio = np.maximum(
self._gym_env.observation_space.high, -self._gym_env.observation_space.low
)
# If the range is infinity, just don't normalize
self._obs_ratio[self._obs_ratio > 1e38] = 1
self._behavior_specs = BehaviorSpec(
observation_shapes=[self._gym_env.observation_space.shape],
action_type=action_type,
action_shape=action_shape,
)
self._g_action: Optional[Union[int, np.ndarray]] = None
self._current_steps: Tuple[DecisionSteps, TerminalSteps] = (
DecisionSteps.empty(self._behavior_specs),
TerminalSteps.empty(self._behavior_specs),
)

@property
def behavior_specs(self) -> BehaviorMapping:
return BehaviorMapping({self._behavior_name: self._behavior_specs})

def step(self) -> None:
if self._first_message:
self.reset()
return
obs, rew, done, info = self._gym_env.step(self._g_action)
if not done:
self._current_steps = (
DecisionSteps(
obs=[np.expand_dims(obs / self._obs_ratio, axis=0)],
reward=np.array([rew], dtype=np.float32),
agent_id=np.array([self._AGENT_ID], dtype=np.int32),
action_mask=None,
),
TerminalSteps.empty(self._behavior_specs),
)
else:
self._first_message = True
self._current_steps = (
DecisionSteps.empty(self._behavior_specs),
TerminalSteps(
obs=[np.expand_dims(obs / self._obs_ratio, axis=0)],
reward=np.array([rew], dtype=np.float32),
interrupted=np.array(
[info.get("TimeLimit.truncated", False)], dtype=bool
),
agent_id=np.array([self._AGENT_ID], dtype=np.int32),
),
)

def reset(self) -> None:
self._first_message = False
obs = self._gym_env.reset()
self._current_steps = (
DecisionSteps(
obs=[np.expand_dims(obs / self._obs_ratio, axis=0)],
reward=np.array([0], dtype=np.float32),
agent_id=np.array([self._AGENT_ID], dtype=np.int32),
action_mask=None,
),
TerminalSteps.empty(self._behavior_specs),
)

def close(self) -> None:
self._gym_env.close()

def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
assert behavior_name == self._behavior_name
spec = self._behavior_specs
expected_type = np.float32 if spec.is_action_continuous() else np.int32
n_agents = len(self._current_steps[0])
if n_agents == 0:
return
expected_shape = (n_agents, spec.action_size)
if action.shape != expected_shape:
raise UnityActionException(
"The behavior {0} needs an input of dimension {1} but received input of dimension {2}".format(
behavior_name, expected_shape, action.shape
)
)
if action.dtype != expected_type:
action = action.astype(expected_type)
if isinstance(self._gym_env.action_space, gym.spaces.Discrete):
self._g_action = int(action[0, 0])
elif isinstance(self._gym_env.action_space, gym.spaces.Box):
self._g_action = action[0] / self._act_ratio
else:
raise UnityActionException(
f"Unknown action type {self._gym_env.action_space}"
)

def set_action_for_agent(
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
) -> None:
assert behavior_name == self._behavior_name
assert agent_id == self._AGENT_ID
spec = self._behavior_specs
expected_shape = (spec.action_size,)
if action.shape != expected_shape:
raise UnityActionException(
f"The Agent {0} with BehaviorName {1} needs an input of dimension "
f"{2} but received input of dimension {3}".format(
agent_id, behavior_name, expected_shape, action.shape
)
)
expected_type = np.float32 if spec.is_action_continuous() else np.int32
if action.dtype != expected_type:
action = action.astype(expected_type)
if isinstance(self._gym_env.action_space, gym.spaces.Discrete):
self._g_action = int(action[0])
elif isinstance(self._gym_env.action_space, gym.spaces.Box):
self._g_action = action / self._act_ratio
else:
raise UnityActionException(
f"Unknown action type {self._gym_env.action_space}"
)

def get_steps(
self, behavior_name: BehaviorName
) -> Tuple[DecisionSteps, TerminalSteps]:
assert behavior_name == self._behavior_name
return self._current_steps
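For reference, a minimal driver loop for this wrapper might look like the sketch below. It only uses the BaseEnv surface defined above plus the behavior spec's create_empty_action helper (also exercised by the tests further down); the environment id and behavior name are illustrative, not part of this diff.

import gym

from mlagents_envs.gym_to_unity_wrapper import GymToUnityWrapper

# Wrap a gym environment and drive it through the BaseEnv interface.
env = GymToUnityWrapper(gym.make("CartPole-v1"), "cartpole")
behavior_name = list(env.behavior_specs.keys())[0]
spec = env.behavior_specs[behavior_name]
env.reset()
for _ in range(100):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # Single-agent wrapper: send a zero-filled action of the expected shape and dtype.
    env.set_actions(behavior_name, spec.create_empty_action(len(decision_steps)))
    env.step()  # the wrapper resets the underlying gym env after an episode ends
env.close()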
40 changes: 40 additions & 0 deletions ml-agents-envs/mlagents_envs/tests/test_gym_to_unity_wrapper.py
@@ -0,0 +1,40 @@
from mlagents_envs.gym_to_unity_wrapper import GymToUnityWrapper
from mlagents_envs.base_env import ActionType
import gym

import pytest


GYM_ENVS = ["CartPole-v1", "MountainCar-v0"]


@pytest.mark.parametrize("name", GYM_ENVS, ids=GYM_ENVS)
def test_creation(name):
env = GymToUnityWrapper(gym.make(name), name)
env.close()


@pytest.mark.parametrize("name", GYM_ENVS, ids=GYM_ENVS)
def test_specs(name):
gym_env = gym.make(name)
env = GymToUnityWrapper(gym_env, name)
assert list(env.behavior_specs.keys()) == [name]
if isinstance(gym_env.action_space, gym.spaces.Box):
assert env.behavior_specs[name].action_type == ActionType.CONTINUOUS
elif isinstance(gym_env.action_space, gym.spaces.Discrete):
assert env.behavior_specs[name].action_type == ActionType.DISCRETE
else:
raise NotImplementedError("Test for this action space type not implemented")
env.close()


@pytest.mark.parametrize("name", GYM_ENVS, ids=GYM_ENVS)
def test_steps(name):
env = GymToUnityWrapper(gym.make(name), name)
spec = env.behavior_specs[name]
env.reset()
for _ in range(200):
d_steps, t_steps = env.get_steps(name)
env.set_actions(name, spec.create_empty_action(len(d_steps)))
env.step()
env.close()
1 change: 1 addition & 0 deletions ml-agents-envs/setup.py
@@ -46,6 +46,7 @@ def run(self):
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
zip_safe=False,
install_requires=[
"gym",
"cloudpickle",
"grpcio>=1.11.0",
"numpy>=1.14.1,<2.0",
45 changes: 41 additions & 4 deletions ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -13,6 +13,7 @@
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
@@ -36,6 +37,10 @@
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous

import gym
from mlagents_envs.gym_to_unity_wrapper import GymToUnityWrapper
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig

BRAIN_NAME = "1D"


@@ -138,15 +143,18 @@ def _check_environment_trains(
train=True,
training_seed=seed,
)

# Begin training
tc.start_learning(env_manager)
env_manager.close()
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
if hasattr(env, "final_rewards"):
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
else:
processed_rewards = list(debug_writer.get_last_rewards().values())
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)

@@ -497,3 +505,32 @@ def test_gail_visual_sac(simple_record, use_discrete):
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.gym
@pytest.mark.parametrize(
"gym_name,target_return",
[
pytest.param("CartPole-v0", 150), # optimal 200
# pytest.param("MountainCar-v0", -199), # solved if more than -200
# pytest.param("MountainCarContinuous-v0", 0), # optimal 90
],
)
def test_sac_gym_training(gym_name, target_return, pytestconfig):
if "gym" not in pytestconfig.getoption(name="-m", skip=False):
pytest.skip(
"The gym tests were not requested; add the gym marker (-m gym) to run these tests"
)
env = GymToUnityWrapper(gym.make(gym_name), BRAIN_NAME)
hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, learning_rate=3e-4, buffer_size=1000
)
config = attr.evolve(SAC_CONFIG, hyperparameters=hyperparams, max_steps=50000)

def factory(worker_id, side_channels):
return GymToUnityWrapper(gym.make(gym_name), BRAIN_NAME)

manager = SubprocessEnvManager(factory, EngineConfig.default_config(), 30)
_check_environment_trains(
env, {BRAIN_NAME: config}, success_threshold=target_return, env_manager=manager
)
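As a usage note (assuming a standard pytest setup for this repo), the gym-marked test above only runs when the marker is requested explicitly, e.g. pytest -m gym ml-agents/mlagents/trainers/tests/test_simple_rl.py; without -m gym it skips itself via the marker check at the top of the test.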