From 27c85b6094d5fa1bc44d4602c89192f490f976a1 Mon Sep 17 00:00:00 2001
From: Ervin Teng
Date: Fri, 16 Aug 2019 11:51:46 -0700
Subject: [PATCH] Fix bug in add_rewards_outputs and add test

The indices used in add_rewards_outputs were swapped: the scaled reward
for a transition is stored at the next buffer index (agent_next_idx),
while the value estimate belongs to the current index (agent_idx). Swap
the two lookups and add a regression test that fails under the old
indexing.
---
 ml-agents/mlagents/trainers/ppo/trainer.py    |  4 +--
 ml-agents/mlagents/trainers/tests/test_ppo.py | 31 +++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index 596c566480..3e06f86396 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -205,10 +205,10 @@ def add_rewards_outputs(
         for name, reward_result in rewards_out.reward_signals.items():
             # 0 because we use the scaled reward to train the agent
             self.training_buffer[agent_id]["{}_rewards".format(name)].append(
-                reward_result.scaled_reward[agent_idx]
+                reward_result.scaled_reward[agent_next_idx]
             )
             self.training_buffer[agent_id]["{}_value_estimates".format(name)].append(
-                values[name][agent_next_idx][0]
+                values[name][agent_idx][0]
             )
 
     def is_ready_update(self):
diff --git a/ml-agents/mlagents/trainers/tests/test_ppo.py b/ml-agents/mlagents/trainers/tests/test_ppo.py
index ec8c1c759d..ba18c27cf5 100644
--- a/ml-agents/mlagents/trainers/tests/test_ppo.py
+++ b/ml-agents/mlagents/trainers/tests/test_ppo.py
@@ -8,6 +8,8 @@
 from mlagents.trainers.ppo.models import PPOModel
 from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
 from mlagents.trainers.ppo.policy import PPOPolicy
+from mlagents.trainers.rl_trainer import AllRewardsOutput
+from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs import UnityEnvironment, BrainParameters
 from mlagents.envs.mock_communicator import MockCommunicator
 
@@ -355,5 +357,34 @@ def test_trainer_increment_step():
     assert trainer.step == 10
 
 
+def test_add_rewards_output(dummy_config):
+    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
+    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
+    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
+    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
+    rewardsout = AllRewardsOutput(
+        reward_signals={
+            "extrinsic": RewardSignalResult(
+                scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
+            )
+        },
+        environment=np.array([1.0, 1.0]),
+    )
+    values = {"extrinsic": np.array([[2.0]])}
+    agent_id = "123"
+    idx = 0
+    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
+    next_idx = 1
+    trainer.add_rewards_outputs(
+        rewardsout,
+        values=values,
+        agent_id=agent_id,
+        agent_idx=idx,
+        agent_next_idx=next_idx,
+    )
+    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
+    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
+
+
 if __name__ == "__main__":
     pytest.main()