diff --git a/ml-agents/mlagents/trainers/components/bc/module.py b/ml-agents/mlagents/trainers/components/bc/module.py
index 7b1592e2a7..798d2f65c5 100644
--- a/ml-agents/mlagents/trainers/components/bc/module.py
+++ b/ml-agents/mlagents/trainers/components/bc/module.py
@@ -133,7 +133,7 @@ def _update_batch(
         else:
             feed_dict[self.policy.model.action_masks] = np.ones(
                 (
-                    self.n_sequences,
+                    self.n_sequences * self.policy.sequence_length,
                     sum(self.policy.model.brain.vector_action_space_size),
                 )
             )
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
index 38efa886d7..0283ea9e38 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
@@ -1,8 +1,7 @@
 from typing import Any, Dict, List
 import numpy as np
-from mlagents.envs.brain import BrainInfo
-
 import tensorflow as tf
+from mlagents.envs.brain import BrainInfo
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 
@@ -56,25 +55,40 @@ def evaluate(
         :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
         """
         if len(current_info.agents) == 0:
-            return []
+            return RewardSignalResult([], [])
+        mini_batch: Dict[str, np.array] = {}
+        # Construct the batch and use evaluate_batch
+        mini_batch["actions"] = next_info.previous_vector_actions
+        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
+        for i in range(len(current_info.visual_observations)):
+            mini_batch["visual_obs%d" % i] = current_info.visual_observations[i]
+            mini_batch["next_visual_obs%d" % i] = next_info.visual_observations[i]
+        if self.policy.use_vec_obs:
+            mini_batch["vector_obs"] = current_info.vector_observations
+            mini_batch["next_vector_in"] = next_info.vector_observations
 
-        feed_dict = {
-            self.policy.model.batch_size: len(next_info.vector_observations),
-            self.policy.model.sequence_length: 1,
+        result = self.evaluate_batch(mini_batch)
+        return result
+
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.policy.model.batch_size: len(mini_batch["actions"]),
+            self.policy.model.sequence_length: self.policy.sequence_length,
         }
-        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
+        if self.policy.use_vec_obs:
+            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
+        if self.policy.model.vis_obs_size > 0:
+            for i in range(len(self.policy.model.visual_in)):
+                _obs = mini_batch["visual_obs%d" % i]
+                _next_obs = mini_batch["next_visual_obs%d" % i]
+                feed_dict[self.policy.model.visual_in[i]] = _obs
+                feed_dict[self.model.next_visual_in[i]] = _next_obs
+
         if self.policy.use_continuous_act:
-            feed_dict[
-                self.policy.model.selected_actions
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[
-                self.policy.model.action_holder
-            ] = next_info.previous_vector_actions
-        for i in range(self.policy.model.vis_obs_size):
-            feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
-        if self.policy.use_vec_obs:
-            feed_dict[self.model.next_vector_in] = next_info.vector_observations
+            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
         unscaled_reward = self.policy.sess.run(
             self.model.intrinsic_reward, feed_dict=feed_dict
         )
@@ -110,8 +124,6 @@ def prepare_update(
             policy_model.batch_size: num_sequences,
             policy_model.sequence_length: self.policy.sequence_length,
             policy_model.mask_input: mini_batch["masks"],
-            policy_model.advantage: mini_batch["advantages"],
-            policy_model.all_old_log_probs: mini_batch["action_probs"],
         }
         if self.policy.use_continuous_act:
             feed_dict[policy_model.output_pre] = mini_batch["actions_pre"]
@@ -121,12 +133,10 @@ def prepare_update(
             feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
         if policy_model.vis_obs_size > 0:
-            for i, _ in enumerate(policy_model.visual_in):
-                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
-            for i, _ in enumerate(policy_model.visual_in):
-                feed_dict[self.model.next_visual_in[i]] = mini_batch[
-                    "next_visual_obs%d" % i
-                ]
+            for i, vis_in in enumerate(policy_model.visual_in):
+                feed_dict[vis_in] = mini_batch["visual_obs%d" % i]
+            for i, next_vis_in in enumerate(self.model.next_visual_in):
+                feed_dict[next_vis_in] = mini_batch["next_visual_obs%d" % i]
 
         self.has_updated = True
         return feed_dict
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
index e307d3d466..4b59c5d5f5 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
@@ -36,11 +36,6 @@ def check_config(
         param_keys = ["strength", "gamma"]
         super().check_config(config_dict, param_keys)
 
-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
-        env_rews = mini_batch["environment_rewards"]
-
-        return RewardSignalResult(self.strength * env_rews, env_rews)
-
     def evaluate(
         self, current_info: BrainInfo, next_info: BrainInfo
     ) -> RewardSignalResult:
@@ -53,3 +48,7 @@ def evaluate(
         unscaled_reward = np.array(next_info.rewards)
         scaled_reward = self.strength * unscaled_reward
         return RewardSignalResult(scaled_reward, unscaled_reward)
+
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        env_rews = np.array(mini_batch["environment_rewards"])
+        return RewardSignalResult(self.strength * env_rews, env_rews)
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
index 9721fd08b8..a64b3e04f2 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
@@ -224,6 +224,8 @@ def create_network(self) -> None:
             self.done_policy,
             reuse=True,
         )
+        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
+        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
         self.discriminator_score = tf.reshape(
             self.policy_estimate, [-1], name="gail_reward"
         )
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
index bb8d920e0d..62f0ab2870 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
@@ -52,8 +52,8 @@ def __init__(
         self.update_dict: Dict[str, tf.Tensor] = {
             "gail_loss": self.model.loss,
             "gail_update_batch": self.model.update_batch,
-            "gail_policy_estimate": self.model.policy_estimate,
-            "gail_expert_estimate": self.model.expert_estimate,
+            "gail_policy_estimate": self.model.mean_policy_estimate,
+            "gail_expert_estimate": self.model.mean_expert_estimate,
         }
         if self.model.use_vail:
             self.update_dict["kl_loss"] = self.model.kl_loss
@@ -62,31 +62,51 @@
             self.update_dict["z_mean_policy"] = self.model.z_mean_policy
             self.update_dict["beta_update"] = self.model.update_beta
 
-        self.stats_name_to_update_name = {"Losses/GAIL Loss": "gail_loss"}
+        self.stats_name_to_update_name = {
+            "Losses/GAIL Loss": "gail_loss",
+            "Policy/GAIL Policy Estimate": "gail_policy_estimate",
+            "Policy/GAIL Expert Estimate": "gail_expert_estimate",
+        }
 
     def evaluate(
         self, current_info: BrainInfo, next_info: BrainInfo
     ) -> RewardSignalResult:
         if len(current_info.agents) == 0:
-            return []
+            return RewardSignalResult([], [])
+        mini_batch: Dict[str, np.array] = {}
+        # Construct the batch
+        mini_batch["actions"] = next_info.previous_vector_actions
+        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
+        for i, obs in enumerate(current_info.visual_observations):
+            mini_batch["visual_obs%d" % i] = obs
+        if self.policy.use_vec_obs:
+            mini_batch["vector_obs"] = current_info.vector_observations
+
+        result = self.evaluate_batch(mini_batch)
+        return result
 
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
         feed_dict: Dict[tf.Tensor, Any] = {
-            self.policy.model.batch_size: len(next_info.vector_observations),
-            self.policy.model.sequence_length: 1,
+            self.policy.model.batch_size: len(mini_batch["actions"]),
+            self.policy.model.sequence_length: self.policy.sequence_length,
         }
         if self.model.use_vail:
             feed_dict[self.model.use_noise] = [0]
-        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
-        feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
+        if self.policy.use_vec_obs:
+            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+        if self.policy.model.vis_obs_size > 0:
+            for i in range(len(self.policy.model.visual_in)):
+                _obs = mini_batch["visual_obs%d" % i]
+                feed_dict[self.policy.model.visual_in[i]] = _obs
+
         if self.policy.use_continuous_act:
-            feed_dict[
-                self.policy.model.selected_actions
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[
-                self.policy.model.action_holder
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+        feed_dict[self.model.done_policy_holder] = np.array(
+            mini_batch["done"]
+        ).flatten()
         unscaled_reward = self.policy.sess.run(
             self.model.intrinsic_reward, feed_dict=feed_dict
         )
@@ -123,11 +143,10 @@ def prepare_update(
         # If num_sequences is less, we need to shorten the input batch.
         for key, element in mini_batch_policy.items():
             mini_batch_policy[key] = element[:max_num_experiences]
-        # Get demo buffer
-        self.demonstration_buffer.update_buffer.shuffle(1)
-        # TODO: Replace with SAC sample method
-        mini_batch_demo = self.demonstration_buffer.update_buffer.make_mini_batch(
-            0, len(mini_batch_policy["actions"])
+
+        # Get batch from demo buffer
+        mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
+            len(mini_batch_policy["actions"]), 1
         )
 
         feed_dict: Dict[tf.Tensor, Any] = {
diff --git a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py
index 26b69e6d0c..828cb60739 100644
--- a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py
@@ -63,6 +63,21 @@ def evaluate(
             np.zeros(len(current_info.agents)),
         )
 
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        """
+        Evaluates the reward for the data present in the Dict mini_batch. Note the distinction
+        from evaluate(), which takes in two BrainInfos. This reflects the different data formats
+        (i.e. from the Buffer vs. before being placed into the Buffer). Use this when evaluating
+        a reward function drawn straight from a Buffer.
+        :param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
+            when drawing from the update buffer.
+        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
+        """
+        mini_batch_len = len(next(iter(mini_batch.values())))
+        return RewardSignalResult(
+            self.strength * np.zeros(mini_batch_len), np.zeros(mini_batch_len)
+        )
+
     def prepare_update(
         self,
         policy_model: LearningModel,
diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index e89cc0f459..596c566480 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -13,7 +13,7 @@
 from mlagents.trainers.ppo.policy import PPOPolicy
 from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
 from mlagents.trainers.trainer import UnityTrainerException
-from mlagents.trainers.rl_trainer import RLTrainer
+from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput
 from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs.action_info import ActionInfoOutputs
 
@@ -193,8 +193,8 @@ def add_policy_outputs(
 
     def add_rewards_outputs(
         self,
-        value: Dict[str, Any],
-        rewards_dict: Dict[str, RewardSignalResult],
+        rewards_out: AllRewardsOutput,
+        values: Dict[str, np.ndarray],
         agent_id: str,
         agent_idx: int,
         agent_next_idx: int,
@@ -202,27 +202,15 @@
         """
        Takes the value output of the last action and store it into the training buffer.
         """
-        for name, reward_result in rewards_dict.items():
+        for name, reward_result in rewards_out.reward_signals.items():
             # 0 because we use the scaled reward to train the agent
             self.training_buffer[agent_id]["{}_rewards".format(name)].append(
                 reward_result.scaled_reward[agent_idx]
             )
             self.training_buffer[agent_id]["{}_value_estimates".format(name)].append(
-                value[name][agent_next_idx][0]
+                values[name][agent_next_idx][0]
             )
 
-    def end_episode(self):
-        """
-        A signal that the Episode has ended. The buffer must be reset.
-        Get only called when the academy resets.
-        """
-        self.training_buffer.reset_local_buffers()
-        for agent_id in self.episode_steps:
-            self.episode_steps[agent_id] = 0
-        for rewards in self.collected_rewards.values():
-            for agent_id in rewards:
-                rewards[agent_id] = 0
-
     def is_ready_update(self):
         """
         Returns whether or not the trainer has enough elements to run update model
diff --git a/ml-agents/mlagents/trainers/rl_trainer.py b/ml-agents/mlagents/trainers/rl_trainer.py
index f62aab9a86..6316b86e73 100644
--- a/ml-agents/mlagents/trainers/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/rl_trainer.py
@@ -1,6 +1,6 @@
 # # Unity ML-Agents Toolkit
 import logging
-from typing import Dict, List, Deque, Any
+from typing import Dict, List, Deque, Any, Optional, NamedTuple
 import os
 import tensorflow as tf
 import numpy as np
@@ -10,10 +10,23 @@
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.tf_policy import Policy
 from mlagents.trainers.trainer import Trainer, UnityTrainerException
+from mlagents.trainers.components.reward_signals.reward_signal import RewardSignalResult
 from mlagents.envs import BrainParameters
 
 LOGGER = logging.getLogger("mlagents.trainers")
 
+RewardSignalResults = Dict[str, RewardSignalResult]
+
+
+class AllRewardsOutput(NamedTuple):
+    """
+    This class stores all of the outputs of the reward signals,
+    as well as the raw reward from the environment.
+    """
+
+    reward_signals: RewardSignalResults
+    environment: np.ndarray
+
 
 class RLTrainer(Trainer):
     """
@@ -141,9 +154,16 @@ def add_experiences(
         else:
             curr_to_use = curr_info
 
-        tmp_rewards_dict = {}
+        # Evaluate and store the reward signals
+        tmp_reward_signal_outs = {}
         for name, signal in self.policy.reward_signals.items():
-            tmp_rewards_dict[name] = signal.evaluate(curr_to_use, next_info)
+            tmp_reward_signal_outs[name] = signal.evaluate(curr_to_use, next_info)
+        # Store the environment reward
+        tmp_environment = np.array(next_info.rewards)
+
+        rewards_out = AllRewardsOutput(
+            reward_signals=tmp_reward_signal_outs, environment=tmp_environment
+        )
 
         for agent_id in next_info.agents:
             stored_info = self.training_buffer[agent_id].last_brain_info
@@ -193,9 +213,10 @@
                     )
 
                     values = stored_take_action_outputs["value_heads"]
 
+                    # Add the value outputs if needed
                     self.add_rewards_outputs(
-                        values, tmp_rewards_dict, agent_id, idx, next_idx
+                        rewards_out, values, agent_id, idx, next_idx
                     )
 
                 for name, rewards in self.collected_rewards.items():
@@ -203,18 +224,30 @@
                         rewards[agent_id] = 0
                     if name == "environment":
                         # Report the reward from the environment
-                        rewards[agent_id] += np.array(next_info.rewards)[next_idx]
+                        rewards[agent_id] += rewards_out.environment[next_idx]
                     else:
                         # Report the reward signals
-                        rewards[agent_id] += tmp_rewards_dict[name].scaled_reward[
-                            next_idx
-                        ]
+                        rewards[agent_id] += rewards_out.reward_signals[
+                            name
+                        ].scaled_reward[next_idx]
                 if not next_info.local_done[next_idx]:
                     if agent_id not in self.episode_steps:
                         self.episode_steps[agent_id] = 0
                     self.episode_steps[agent_id] += 1
         self.trainer_metrics.end_experience_collection_timer()
 
+    def end_episode(self) -> None:
+        """
+        A signal that the Episode has ended. The buffer must be reset.
+        Only called when the academy resets.
+        """
+        self.training_buffer.reset_local_buffers()
+        for agent_id in self.episode_steps:
+            self.episode_steps[agent_id] = 0
+        for rewards in self.collected_rewards.values():
+            for agent_id in rewards:
+                rewards[agent_id] = 0
+
     def add_policy_outputs(
         self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
     ) -> None:
@@ -232,8 +265,8 @@
 
     def add_rewards_outputs(
         self,
-        value: Dict[str, Any],
-        rewards_dict: Dict[str, float],
+        rewards_out: AllRewardsOutput,
+        values: Dict[str, np.ndarray],
         agent_id: str,
         agent_idx: int,
         agent_next_idx: int,
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
index 5afe7fa6d2..be36fe253c 100644
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -107,7 +107,7 @@ def simulate_rollout(env, policy, buffer_init_samples):
     return buffer
 
 
-def create_buffer(brain_infos, brain_params, sequence_length):
+def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
     buffer = Buffer()
     # Make a buffer
     for idx, experience in enumerate(brain_infos):
@@ -144,13 +144,64 @@
         buffer[0]["random_normal_epsilon"].append(
             np.ones(buffer[0]["actions"][0].shape)
         )
-        buffer[0]["action_mask"].append(np.ones(buffer[0]["actions"][0].shape))
-        buffer[0]["memory"].append(np.ones(8))
+        buffer[0]["action_mask"].append(
+            np.ones(np.sum(brain_params.vector_action_space_size))
+        )
+        buffer[0]["memory"].append(np.ones(memory_size))
     buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)
     return buffer
 
 
+def setup_mock_env_and_brains(
+    mock_env,
+    use_discrete,
+    use_visual,
+    num_agents=12,
+    discrete_action_space=[3, 3, 3, 2],
+    vector_action_space=[2],
+    vector_obs_space=8,
+):
+    if not use_visual:
+        mock_brain = create_mock_brainparams(
+            vector_action_space_type="discrete" if use_discrete else "continuous",
+            vector_action_space_size=discrete_action_space
+            if use_discrete
+            else vector_action_space,
+            vector_observation_space_size=vector_obs_space,
+        )
+        mock_braininfo = create_mock_braininfo(
+            num_agents=num_agents,
+            num_vector_observations=vector_obs_space,
+            num_vector_acts=sum(
+                discrete_action_space if use_discrete else vector_action_space
+            ),
+            discrete=use_discrete,
+            num_discrete_branches=len(discrete_action_space),
+        )
+    else:
+        mock_brain = create_mock_brainparams(
+            vector_action_space_type="discrete" if use_discrete else "continuous",
+            vector_action_space_size=discrete_action_space
+            if use_discrete
+            else vector_action_space,
+            vector_observation_space_size=0,
+            number_visual_observations=1,
+        )
+        mock_braininfo = create_mock_braininfo(
+            num_agents=num_agents,
+            num_vis_observations=1,
+            num_vector_acts=sum(
+                discrete_action_space if use_discrete else vector_action_space
+            ),
+            discrete=use_discrete,
+            num_discrete_branches=len(discrete_action_space),
+        )
+    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
+    env = mock_env()
+    return env, mock_brain, mock_braininfo
+
+
 def create_mock_3dball_brain():
     mock_brain = create_mock_brainparams(
         vector_action_space_type="continuous",
diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
index c1bc97bef9..d8af771966 100644
--- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py
+++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
@@ -73,44 +73,15 @@ def curiosity_dummy_config():
 def create_ppo_policy_mock(
     mock_env, dummy_config, reward_signal_config, use_rnn, use_discrete, use_visual
 ):
-
-    if not use_visual:
-        mock_brain = mb.create_mock_brainparams(
-            vector_action_space_type="discrete" if use_discrete else "continuous",
-            vector_action_space_size=DISCRETE_ACTION_SPACE
-            if use_discrete
-            else VECTOR_ACTION_SPACE,
-            vector_observation_space_size=VECTOR_OBS_SPACE,
-        )
-        mock_braininfo = mb.create_mock_braininfo(
-            num_agents=NUM_AGENTS,
-            num_vector_observations=VECTOR_OBS_SPACE,
-            num_vector_acts=sum(
-                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
-            ),
-            discrete=use_discrete,
-            num_discrete_branches=len(DISCRETE_ACTION_SPACE),
-        )
-    else:
-        mock_brain = mb.create_mock_brainparams(
-            vector_action_space_type="discrete" if use_discrete else "continuous",
-            vector_action_space_size=DISCRETE_ACTION_SPACE
-            if use_discrete
-            else VECTOR_ACTION_SPACE,
-            vector_observation_space_size=0,
-            number_visual_observations=1,
-        )
-        mock_braininfo = mb.create_mock_braininfo(
-            num_agents=NUM_AGENTS,
-            num_vis_observations=1,
-            num_vector_acts=sum(
-                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
-            ),
-            discrete=use_discrete,
-            num_discrete_branches=len(DISCRETE_ACTION_SPACE),
-        )
-    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
-    env = mock_env()
+    env, mock_brain, _ = mb.setup_mock_env_and_brains(
+        mock_env,
+        use_discrete,
+        use_visual,
+        num_agents=NUM_AGENTS,
+        vector_action_space=VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+        discrete_action_space=DISCRETE_ACTION_SPACE,
+    )
 
     trainer_parameters = dummy_config
     model_path = env.brain_names[0]
diff --git a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
index be23670111..617337350e 100644
--- a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
+++ b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
@@ -79,3 +79,12 @@ def test_rl_trainer(add_policy_outputs, add_rewards_outputs):
 
     # assert construct_curr_info worked properly
     assert len(brain_info.agents) == 1
+
+    # Test end episode
+    trainer.end_episode()
+    for agent_id in trainer.episode_steps:
+        assert trainer.episode_steps[agent_id] == 0
+        assert len(trainer.training_buffer[agent_id]["action"]) == 0
+    for rewards in trainer.collected_rewards.values():
+        for agent_id in rewards:
+            assert rewards[agent_id] == 0
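For reference, here is a minimal, self-contained sketch of the evaluate_batch() contract this patch introduces. The RewardSignalResult tuple shape, the "environment_rewards" buffer key, and the scaled/unscaled behaviour are taken from the diff above; the ToyExtrinsicSignal class, its constructor, and the literal reward values are hypothetical stand-ins for illustration, not the real ExtrinsicRewardSignal class.

# Hypothetical stand-in, not ML-Agents code: shows the batched reward-signal contract.
from typing import Dict, NamedTuple

import numpy as np


class RewardSignalResult(NamedTuple):
    # Same (scaled, unscaled) pair the reward signals in the diff return.
    scaled_reward: np.ndarray
    unscaled_reward: np.ndarray


class ToyExtrinsicSignal:
    """Mirrors the logic of ExtrinsicRewardSignal.evaluate_batch shown above."""

    def __init__(self, strength: float = 1.0):
        self.strength = strength

    def evaluate_batch(self, mini_batch: Dict[str, np.ndarray]) -> RewardSignalResult:
        # The mini-batch is the Buffer's flat dict-of-arrays format, so the
        # environment reward is just another keyed array.
        env_rews = np.array(mini_batch["environment_rewards"])
        return RewardSignalResult(self.strength * env_rews, env_rews)


mini_batch = {"environment_rewards": np.array([0.0, 1.0, -0.5])}
result = ToyExtrinsicSignal(strength=0.5).evaluate_batch(mini_batch)
print(result.scaled_reward)    # strength * raw rewards
print(result.unscaled_reward)  # raw environment rewards, unchanged

The point of the split is that evaluate() still serves live BrainInfo pairs during experience collection, while evaluate_batch() consumes data already flattened into the update buffer — the same Dict[str, np.array] format the curiosity and GAIL signals now build inside evaluate() before delegating — so batch-time reward computation does not need to reconstruct BrainInfo objects.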