ml-agents/mlagents/trainers/ppo/trainer.py (2 additions, 2 deletions)

@@ -205,10 +205,10 @@ def add_rewards_outputs(
         for name, reward_result in rewards_out.reward_signals.items():
             # 0 because we use the scaled reward to train the agent
             self.training_buffer[agent_id]["{}_rewards".format(name)].append(
-                reward_result.scaled_reward[agent_idx]
+                reward_result.scaled_reward[agent_next_idx]
             )
             self.training_buffer[agent_id]["{}_value_estimates".format(name)].append(
-                values[name][agent_next_idx][0]
+                values[name][agent_idx][0]
             )

     def is_ready_update(self):
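The swapped indices above are the entire fix: the reward stored in the training buffer is taken from agent_next_idx, while the value estimate is taken from agent_idx. Presumably the reward for an action arrives together with the *next* observation, whereas V(s_t) is estimated from the current one. A minimal sketch of that alignment, using hypothetical per-step arrays in place of the trainer's real experience buffers:

# Sketch only: hypothetical arrays, not the trainer's real buffers.
import numpy as np

# values[t] is V(s_t), estimated from observation t.
# rewards[t] is the reward delivered alongside observation t,
# i.e. the payoff for the action taken at step t - 1.
values = np.array([0.9, 0.7, 0.2])
rewards = np.array([0.0, 1.0, -0.5])

agent_idx, agent_next_idx = 0, 1
stored_reward = rewards[agent_next_idx]  # 1.0: reward for acting at step 0
stored_value = values[agent_idx]         # 0.9: V(s_0), paired with that reward

Before the fix the two indices were crossed, pairing each value estimate with the reward from the wrong timestep.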
ml-agents/mlagents/trainers/tests/test_ppo.py (31 additions, 0 deletions)

@@ -8,6 +8,8 @@
 from mlagents.trainers.ppo.models import PPOModel
 from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
 from mlagents.trainers.ppo.policy import PPOPolicy
+from mlagents.trainers.rl_trainer import AllRewardsOutput
+from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs import UnityEnvironment, BrainParameters
 from mlagents.envs.mock_communicator import MockCommunicator
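For context, the two newly imported names are lightweight result containers whose fields match the keyword arguments the test constructs them with. A stand-in sketch, assuming the namedtuple-style shape implied by that usage (the real definitions live in the modules imported above):

from collections import namedtuple

# Illustrative stand-ins only; field names inferred from the test below.
RewardSignalResult = namedtuple(
    "RewardSignalResult", ["scaled_reward", "unscaled_reward"]
)
AllRewardsOutput = namedtuple(
    "AllRewardsOutput", ["reward_signals", "environment"]
)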

@@ -355,5 +357,34 @@ def test_trainer_increment_step():
     assert trainer.step == 10


+def test_add_rewards_output(dummy_config):
+    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
+    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
+    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
+    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
+    rewardsout = AllRewardsOutput(
+        reward_signals={
+            "extrinsic": RewardSignalResult(
+                scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
+            )
+        },
+        environment=np.array([1.0, 1.0]),
+    )
+    values = {"extrinsic": np.array([[2.0]])}
+    agent_id = "123"
+    idx = 0
+    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
+    next_idx = 1
+    trainer.add_rewards_outputs(
+        rewardsout,
+        values=values,
+        agent_id=agent_id,
+        agent_idx=idx,
+        agent_next_idx=next_idx,
+    )
+    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
+    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
+
+
 if __name__ == "__main__":
     pytest.main()
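Running the file directly invokes pytest.main() over the whole module; to exercise only the new test, pytest's standard node selection also works:

pytest ml-agents/mlagents/trainers/tests/test_ppo.py::test_add_rewards_output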