diff --git a/.circleci/config.yml b/.circleci/config.yml
index dfb28bd6fd..e7a56ffd24 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -70,7 +70,7 @@ jobs:
             . venv/bin/activate
             mkdir test-reports
             pip freeze > test-reports/pip_versions.txt
-            pytest -n 2 --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings
+            pytest --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings

       - run:
           name: Verify there are no hidden/missing metafiles.
diff --git a/experiment_torch.py b/experiment_torch.py
index 4fcdf8a2de..6fe65c15d7 100644
--- a/experiment_torch.py
+++ b/experiment_torch.py
@@ -98,8 +98,8 @@ def run_experiment(
         evaluate_count = evaluate["TorchPolicy.evaluate"]["count"]
     else:
         if algo == "ppo":
-            update_total = update["TFPPOOptimizer.update"]["total"]
-            update_count = update["TFPPOOptimizer.update"]["count"]
+            update_total = update["PPOOptimizer.update"]["total"]
+            update_count = update["PPOOptimizer.update"]["count"]
         else:
             update_total = update["SACTrainer._update_policy"]["total"]
             update_count = update["SACTrainer._update_policy"]["count"]
diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py
index 41f2c3d87a..8f82751d3f 100644
--- a/ml-agents/mlagents/trainers/cli_utils.py
+++ b/ml-agents/mlagents/trainers/cli_utils.py
@@ -168,6 +168,13 @@ def _create_parser() -> argparse.ArgumentParser:
         action=DetectDefaultStoreTrue,
         help="Forces training using CPU only",
     )
+    argparser.add_argument(
+        "--torch",
+        default=False,
+        action=DetectDefaultStoreTrue,
+        help="(Experimental) Use the PyTorch framework instead of TensorFlow. Install PyTorch "
+        "before using this option",
+    )

     eng_conf = argparser.add_argument_group(title="Engine Configuration")
     eng_conf.add_argument(
diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_tf.py b/ml-agents/mlagents/trainers/ppo/optimizer_tf.py
index 51116f2302..323ca640fe 100644
--- a/ml-agents/mlagents/trainers/ppo/optimizer_tf.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_tf.py
@@ -9,7 +9,7 @@
 from mlagents.trainers.settings import TrainerSettings, PPOSettings


-class TFPPOOptimizer(TFOptimizer):
+class PPOOptimizer(TFOptimizer):
     def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
         """
         Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
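For context, the --torch option registered in cli_utils.py above is consumed by the trainer CLI. A sketch of an invocation (the config path and run ID are placeholders, not taken from this diff):

    mlagents-learn <trainer_config.yaml> --run-id=<run_id> --torch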
diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index a06396a617..b0ffae03c8 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -10,19 +10,25 @@
 from mlagents_envs.logging_util import get_logger
 from mlagents_envs.base_env import BehaviorSpec
 from mlagents.trainers.trainer.rl_trainer import RLTrainer
-from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.policy import Policy
 from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
-from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
+from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
 from mlagents.trainers.trajectory import Trajectory
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.settings import (
     TrainerSettings,
     PPOSettings,
     TestingConfiguration,
+    FrameworkType,
 )

+try:
+    from mlagents.trainers.policy.torch_policy import TorchPolicy
+    from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
+except ModuleNotFoundError:
+    TorchPolicy = None  # type: ignore
+    TorchPPOOptimizer = None  # type: ignore
+

 logger = get_logger(__name__)

@@ -58,7 +64,6 @@ def __init__(
         )
         self.load = load
         self.seed = seed
-        self.framework = "torch" if TestingConfiguration.use_torch else "tf"
         if TestingConfiguration.max_steps > 0:
             self.trainer_settings.max_steps = TestingConfiguration.max_steps
         self.policy: Policy = None  # type: ignore
@@ -254,12 +259,12 @@ def add_policy(
             )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        if self.framework == "torch":
+        if self.framework == FrameworkType.PYTORCH:
             self.optimizer = TorchPPOOptimizer(  # type: ignore
                 self.policy, self.trainer_settings  # type: ignore
             )  # type: ignore
         else:
-            self.optimizer = TFPPOOptimizer(  # type: ignore
+            self.optimizer = PPOOptimizer(  # type: ignore
                 self.policy, self.trainer_settings  # type: ignore
             )  # type: ignore
         for _reward_signal in self.optimizer.reward_signals.keys():
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index fea60e143d..13dd46ecc0 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -18,10 +18,14 @@
 from mlagents.trainers.trainer.rl_trainer import RLTrainer
 from mlagents.trainers.trajectory import Trajectory, SplitObservations
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-from mlagents.trainers.policy.torch_policy import TorchPolicy
-from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
-from mlagents.trainers.settings import TrainerSettings, SACSettings
+from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
+
+try:
+    from mlagents.trainers.policy.torch_policy import TorchPolicy
+    from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
+except ModuleNotFoundError:
+    TorchPolicy = None  # type: ignore
+    TorchSACOptimizer = None  # type: ignore

 logger = get_logger(__name__)

@@ -353,7 +357,7 @@ def add_policy(
             )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        if self.framework == "torch":
+        if self.framework == FrameworkType.PYTORCH:
             self.optimizer = TorchSACOptimizer(  # type: ignore
                 self.policy, self.trainer_settings  # type: ignore
             )  # type: ignore
diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py
index fc6f341970..bb170349ec 100644
--- a/ml-agents/mlagents/trainers/settings.py
+++ b/ml-agents/mlagents/trainers/settings.py
@@ -524,6 +524,11 @@ def to_settings(self) -> type:
         return _mapping[self]


+class FrameworkType(Enum):
+    TENSORFLOW: str = "tensorflow"
+    PYTORCH: str = "pytorch"
+
+
 @attr.s(auto_attribs=True)
 class TrainerSettings(ExportableSettings):
     trainer_type: TrainerType = TrainerType.PPO
@@ -546,6 +551,7 @@ def _set_default_hyperparameters(self):
     threaded: bool = True
     self_play: Optional[SelfPlaySettings] = None
     behavioral_cloning: Optional[BehavioralCloningSettings] = None
+    framework: FrameworkType = FrameworkType.TENSORFLOW

     cattr.register_structure_hook(
         Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure
@@ -713,7 +719,13 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions":
                 configured_dict["engine_settings"][key] = val
             else:  # Base options
                 configured_dict[key] = val
-        return RunOptions.from_dict(configured_dict)
+
+        # Apply --torch retroactively
+        final_runoptions = RunOptions.from_dict(configured_dict)
+        if "torch" in DetectDefault.non_default_args:
+            for trainer_set in final_runoptions.behaviors.values():
+                trainer_set.framework = FrameworkType.PYTORCH
+        return final_runoptions

     @staticmethod
     def from_dict(options_dict: Dict[str, Any]) -> "RunOptions":
diff --git a/ml-agents/mlagents/trainers/tests/test_ppo.py b/ml-agents/mlagents/trainers/tests/test_ppo.py
index 5ea9f11bd5..11496ad3bd 100644
--- a/ml-agents/mlagents/trainers/tests/test_ppo.py
+++ b/ml-agents/mlagents/trainers/tests/test_ppo.py
@@ -8,7 +8,7 @@
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
-from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
+from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.agent_processor import AgentManagerQueue
 from mlagents.trainers.tests import mock_brain as mb
@@ -52,7 +52,7 @@ def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visu
     policy = TFPolicy(
         0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
     )
-    optimizer = TFPPOOptimizer(policy, trainer_settings)
+    optimizer = PPOOptimizer(policy, trainer_settings)
     return optimizer
diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
index 5ccbfe8836..44daf6713e 100644
--- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py
+++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
@@ -4,7 +4,7 @@
 import mlagents.trainers.tests.mock_brain as mb
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.sac.optimizer import SACOptimizer
-from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
+from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
 from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG
 from mlagents.trainers.settings import (
     GAILSettings,
@@ -75,7 +75,7 @@ def create_optimizer_mock(
     if trainer_settings.trainer_type == TrainerType.SAC:
         optimizer = SACOptimizer(policy, trainer_settings)
     else:
-        optimizer = TFPPOOptimizer(policy, trainer_settings)
+        optimizer = PPOOptimizer(policy, trainer_settings)
     return optimizer
diff --git a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
index 9736df2fd8..6aac069310 100644
--- a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
+++ b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
@@ -25,7 +25,10 @@ def _update_policy(self):
     def add_policy(self, mock_behavior_id, mock_policy):
         self.policies[mock_behavior_id] = mock_policy

-    def create_policy(self):
+    def create_tf_policy(self):
+        return mock.Mock()
+
+    def create_torch_policy(self):
         return mock.Mock()

     def _process_trajectory(self, trajectory):
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_networks.py b/ml-agents/mlagents/trainers/tests/torch/test_networks.py
index b6b93698f1..ff5209b676 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_networks.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_networks.py
@@ -40,6 +40,7 @@ def test_networkbody_vector():


 def test_networkbody_lstm():
+    torch.manual_seed(0)
     obs_size = 4
     seq_len = 16
     network_settings = NetworkSettings(
@@ -64,6 +65,7 @@


 def test_networkbody_visual():
+    torch.manual_seed(0)
     vec_obs_size = 4
     obs_size = (84, 84, 3)
     network_settings = NetworkSettings()
@@ -89,6 +91,7 @@


 def test_valuenetwork():
+    torch.manual_seed(0)
     obs_size = 4
     num_outputs = 2
     network_settings = NetworkSettings()
diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
index fdd90e11b3..9768a03ebf 100644
--- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
@@ -19,13 +19,18 @@
 from mlagents_envs.timers import hierarchical_timer
 from mlagents_envs.base_env import BehaviorSpec
 from mlagents.trainers.policy.policy import Policy
-from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.agent_processor import AgentManagerQueue
 from mlagents.trainers.trajectory import Trajectory
-from mlagents.trainers.settings import TestingConfiguration
+from mlagents.trainers.settings import TestingConfiguration, FrameworkType
 from mlagents.trainers.stats import StatsPropertyType
+from mlagents.trainers.exception import UnityTrainerException
+
+try:
+    from mlagents.trainers.policy.torch_policy import TorchPolicy
+except ModuleNotFoundError:
+    TorchPolicy = None  # type: ignore


 RewardSignalResults = Dict[str, RewardSignalResult]

@@ -50,7 +55,8 @@ def __init__(self, *args, **kwargs):
         self._stats_reporter.add_property(
             StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
         )
-        self.framework = "torch" if TestingConfiguration.use_torch else "tf"
+        self.framework = self.trainer_settings.framework
+        logger.debug(f"Using framework {self.framework.value}")
         if TestingConfiguration.max_steps > 0:
             self.trainer_settings.max_steps = TestingConfiguration.max_steps
         self._next_save_step = 0
@@ -99,7 +105,11 @@ def _is_ready_update(self):
     def create_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
     ) -> Policy:
-        if self.framework == "torch":
+        if self.framework == FrameworkType.PYTORCH and TorchPolicy is None:
+            raise UnityTrainerException(
+                "To use the experimental PyTorch backend, install the PyTorch Python package first."
+            )
+        elif self.framework == FrameworkType.PYTORCH:
             return self.create_torch_policy(parsed_behavior_id, behavior_spec)
         else:
             return self.create_tf_policy(parsed_behavior_id, behavior_spec)
@@ -165,6 +175,9 @@ def save_model(self) -> None:
             logger.warning(
                 "Trainer has multiple policies, but default behavior only saves the first."
             )
+        elif n_policies == 0:
+            logger.warning("Trainer has no policies, not saving anything.")
+            return
         policy = list(self.policies.values())[0]
         settings = SerializationSettings(policy.model_path, self.brain_name)
         model_checkpoint = self._checkpoint()
diff --git a/test_requirements.txt b/test_requirements.txt
index 08193c7ac9..967c4f635e 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -3,5 +3,8 @@ pytest>4.0.0,<6.0.0
 pytest-cov==2.6.1
 pytest-xdist

+# PyTorch tests are here for the time being, before they are used in the codebase.
+torch>=1.5.0
+
 # onnx doesn't currently have a wheel for 3.8
 tf2onnx>=1.5.5;python_version<'3.8'
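For reference, a minimal sketch of how the new framework setting behaves once this diff is applied. Only TrainerSettings, FrameworkType, and the framework field come from the diff; constructing TrainerSettings with all defaults here is an assumption for illustration:

    from mlagents.trainers.settings import TrainerSettings, FrameworkType

    # The framework field defaults to TensorFlow; the --torch CLI flag flips every
    # behavior's TrainerSettings to PyTorch in RunOptions.from_argparse.
    settings = TrainerSettings()
    assert settings.framework == FrameworkType.TENSORFLOW

    settings.framework = FrameworkType.PYTORCH
    # RLTrainer.create_policy then builds a Torch policy, or raises
    # UnityTrainerException if the torch package is not installed.
    print(settings.framework.value)  # "pytorch"

The try/except import guards added in ppo/trainer.py, sac/trainer.py, and rl_trainer.py are what keep a TensorFlow-only install working: torch only needs to be importable once a behavior actually selects the PyTorch framework.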