From f5c0b2a0c36233e5d16dddf25fd918ec5a85477e Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Tue, 28 Jul 2020 11:00:02 -0700 Subject: [PATCH 01/10] Added Reward Providers for Torch --- .../trainers/reward_providers/__init__.py | 15 + .../reward_providers/base_reward_provider.py | 70 +++++ .../curiosity_reward_provider.py | 260 ++++++++++++++++++ .../extrinsic_reward_provider.py | 12 + .../reward_providers/gail_reward_provider.py | 226 +++++++++++++++ .../reward_provider_factory.py | 39 +++ .../test_reward_providers/test_curiosity.py | 106 +++++++ .../test_reward_providers/test_extrinsic.py | 54 ++++ .../tests/test_reward_providers/test_gail.py | 129 +++++++++ .../tests/test_reward_providers/utils.py | 31 +++ 10 files changed, 942 insertions(+) create mode 100644 ml-agents/mlagents/trainers/reward_providers/__init__.py create mode 100644 ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py create mode 100644 ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py create mode 100644 ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py create mode 100644 ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py create mode 100644 ml-agents/mlagents/trainers/reward_providers/reward_provider_factory.py create mode 100644 ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py create mode 100644 ml-agents/mlagents/trainers/tests/test_reward_providers/test_extrinsic.py create mode 100644 ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py create mode 100644 ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py diff --git a/ml-agents/mlagents/trainers/reward_providers/__init__.py b/ml-agents/mlagents/trainers/reward_providers/__init__.py new file mode 100644 index 0000000000..06e0d07eeb --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/__init__.py @@ -0,0 +1,15 @@ +from mlagents.trainers.reward_providers.base_reward_provider import ( # noqa F401 + BaseRewardProvider, +) +from mlagents.trainers.reward_providers.extrinsic_reward_provider import ( # noqa F401 + ExtrinsicRewardProvider, +) +from mlagents.trainers.reward_providers.curiosity_reward_provider import ( # noqa F401 + CuriosityRewardProvider, +) +from mlagents.trainers.reward_providers.gail_reward_provider import ( # noqa F401 + GAILRewardProvider, +) +from mlagents.trainers.reward_providers.reward_provider_factory import ( # noqa F401 + create_reward_provider, +) diff --git a/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py new file mode 100644 index 0000000000..83e3afb0c1 --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py @@ -0,0 +1,70 @@ +import numpy as np +from abc import ABC, abstractmethod + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import RewardSignalSettings +from mlagents_envs.base_env import BehaviorSpec + + +class BaseRewardProvider(ABC): + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + self._policy_specs = specs + self._gamma = settings.gamma + self._strength = settings.strength + self._ignore_done = False + + @property + def gamma(self) -> float: + """ + The discount factor for the reward signal + """ + return self._gamma + + @property + def strength(self) -> float: + """ + The strength multiplier of the reward provider + """ + return self._strength + + @property + def name(self) -> str: + """ + 
The name of the reward provider. Is used for reporting and identification + """ + class_name = self.__class__.__name__ + return class_name.replace("RewardProvider", "") + + @property + def ignore_done(self) -> bool: + """ + If true, when the agent is done, the rewards of the next episode must be + used to calculate the return of the current episode. + Is used to mitigate the positive bias in rewards with no natural end. + """ + return self._ignore_done + + @abstractmethod + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + """ + Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: a np.ndarray of rewards generated by the reward provider + """ + raise NotImplementedError( + "The reward provider's evaluate method has not been implemented " + ) + + @abstractmethod + def update(self, mini_batch: AgentBuffer) -> None: + """ + Update the reward for the data present in the Dict mini_batch. Use this when updating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + """ + raise NotImplementedError( + "The reward provider's update method has not been implemented " + ) diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py new file mode 100644 index 0000000000..e74c83bd77 --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -0,0 +1,260 @@ +from typing import List +import numpy as np +import torch + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.reward_providers.base_reward_provider import BaseRewardProvider +from mlagents.trainers.settings import CuriositySettings + +from mlagents_envs.base_env import BehaviorSpec + + +def swish(x): + """ + TODO : MOVE SOMEWHERE BETTER + """ + return x * torch.sigmoid(x) + + +class Swish(torch.nn.Module): + """ + TODO : MOVE SOMEWHERE BETTER + """ + + def forward(self, data: torch.Tensor) -> torch.Tensor: + return torch.mul(data, torch.sigmoid(data)) + + +def actions_to_onehot( + discrete_actions: torch.Tensor, action_size: List[torch.Tensor] +) -> List[torch.Tensor]: + """ + Splits a discrete action Tensor (of integers) into its one hot representations. + Returns a list of Tensors (One Tensor per branch) + :param discrete_actions: A Tensor of discrete actions. + :param action_size: List of ints containing the number of possible actions for each branch. + :return: A list of one hot Tensors (one or each branch). + """ + onehot_branches = [ + torch.nn.functional.one_hot(_act.T, action_size[i]).float() + for i, _act in enumerate(discrete_actions.long().T) + ] + return onehot_branches + + +def break_into_branches( + concatenated_logits: torch.Tensor, action_size: List[torch.Tensor] +) -> List[torch.Tensor]: + """ + Takes a concatenated set of logits that represent multiple discrete action branches + and breaks it up into one Tensor per branch. + :param concatenated_logits: Tensor that represents the concatenated action branches + :param action_size: List of ints containing the number of possible actions for each branch. + :return: A List of Tensors containing one tensor per branch. 
+ """ + action_idx = [0] + list(np.cumsum(action_size)) + branched_logits = [ + concatenated_logits[:, action_idx[i] : action_idx[i + 1]] + for i in range(len(action_size)) + ] + return branched_logits + + +def dynamic_partition( + data: torch.Tensor, partitions: torch.Tensor, num_partitions: int +) -> List[torch.Tensor]: + """ + Torch implementation of dynamic_partition : + https://www.tensorflow.org/api_docs/python/tf/dynamic_partition + Splits the data Tensor input into num_partitions Tensors according to the indices in + partitions. + :param data: The Tensor data that will be split into partitions. + :param partitions: An indices tensor that determines in which partition each element + of data will be in. + :param num_partitions: The number of partitions to output. Corresponds to the + maximum possible index in the partitions argument. + :return: A list of Tensor partitions (Their indices correspond to their partition index). + """ + res: List[torch.Tensor] = [] + for i in range(num_partitions): + res += [data[(partitions == i).nonzero().squeeze(1)]] + return res + + +class CuriosityRewardProvider(BaseRewardProvider): + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__(specs, settings) + self._network = CuriosityNetwork(specs, settings) + params = list(self._network.parameters()) + self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + rewards = self._network.compute_reward(mini_batch) + return rewards.detach().cpu().numpy() + + def update(self, mini_batch: AgentBuffer) -> None: + loss = self._network.compute_losses(mini_batch) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + +class CuriosityNetwork(torch.nn.Module): + EPSILON = 1e-10 + forward_loss_weight = 2.0 + inverse_loss_weight = 8.0 + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__() + vec_obs_size = sum( + shape[0] for shape in specs.observation_shapes if len(shape) == 1 + ) + # vis_obs_shapes = [shape for shape in specs.observation_shapes if len(shape) == 3] + self._policy_specs = specs + + obs_size = vec_obs_size # Only vector for now + if obs_size > 0: + self.vec_encode_1 = torch.nn.Linear(obs_size, settings.encoding_size) + self.vec_encode_last = torch.nn.Linear( + settings.encoding_size, settings.encoding_size + ) + # TODO : The vector obs (Use networkBody from models_torch.py) + if self._policy_specs.is_action_continuous(): + self.inverse_model_action_predition = torch.nn.Linear( + 2 * settings.encoding_size, self._policy_specs.action_size + ) + self.forward_model_next_state_prediction = torch.nn.Linear( + settings.encoding_size + self._policy_specs.action_size, + settings.encoding_size, + ) + else: + self.inverse_model_action_predition = torch.nn.Linear( + 2 * settings.encoding_size, + sum(self._policy_specs.discrete_action_branches), + ) + self.forward_model_next_state_prediction = torch.nn.Linear( + settings.encoding_size + + sum(self._policy_specs.discrete_action_branches), + settings.encoding_size, + ) + + def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the current state embedding from a mini_batch. 
+ """ + hidden = self.vec_encode_1( + torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float) + ) + # TODO do visual + hidden = swish(hidden) + hidden = self.vec_encode_last(hidden) + return hidden + + def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the next state embedding from a mini_batch. + """ + hidden = self.vec_encode_1( + torch.as_tensor(mini_batch["next_vector_in"], dtype=torch.float) + ) + # TODO do visual + hidden = swish(hidden) + hidden = self.vec_encode_last(hidden) + return hidden + + def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + In the continuous case, returns the predicted action. + In the discrete case, returns the logits. + """ + inverse_model_input = torch.cat( + (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1 + ) + inverse_model_input = swish(inverse_model_input) + hidden = self.inverse_model_action_predition(inverse_model_input) + if self._policy_specs.is_action_continuous(): + return hidden + else: + branches = break_into_branches( + hidden, self._policy_specs.discrete_action_branches + ) + branches = [torch.softmax(b, dim=1) for b in branches] + return torch.cat(branches, dim=1) + + def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Uses the current state embedding and the action of the mini_batch to predict + the next state embedding. + """ + if self._policy_specs.is_action_continuous(): + action = torch.as_tensor(mini_batch["actions"], dtype=torch.float) + else: + action = torch.cat( + actions_to_onehot( + torch.as_tensor(mini_batch["actions"], dtype=torch.long), + self._policy_specs.discrete_action_branches, + ), + dim=1, + ) + forward_model_input = torch.cat( + (self.get_current_state(mini_batch), action), dim=1 + ) + forward_model_input = swish(forward_model_input) + return self.forward_model_next_state_prediction(forward_model_input) + + def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the inverse loss for a mini_batch. Corresponds to the error on the + action prediction (given the current and next state). + """ + predicted_action = self.predict_action(mini_batch) + if self._policy_specs.is_action_continuous(): + sq_difference = ( + torch.as_tensor(mini_batch["actions"], dtype=torch.float) + - predicted_action + ) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + return torch.mean(sq_difference) + else: + true_action = torch.cat( + actions_to_onehot( + torch.as_tensor(mini_batch["actions"], dtype=torch.long), + self._policy_specs.discrete_action_branches, + ), + dim=1, + ) + cross_entropy = torch.sum( + -torch.log(predicted_action + self.EPSILON) * true_action, dim=1 + ) + return torch.mean( + dynamic_partition( + cross_entropy, + torch.as_tensor(mini_batch["action_mask"], dtype=torch.float), + 2, + )[1] + ) + + def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Calculates the curiosity reward for the mini_batch. Corresponds to the error + between the predicted and actual next state. 
+ """ + predicted_next_state = self.predict_next_state(mini_batch) + sq_difference = (self.get_next_state(mini_batch) - predicted_next_state) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + return sq_difference + + def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the loss for the next state prediction + """ + return torch.mean(self.compute_reward(mini_batch)) + + def compute_losses(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the weighted sum of inverse and forward loss. + """ + return self.forward_loss_weight * self.compute_forward_loss( + mini_batch + ) + self.inverse_loss_weight * self.compute_inverse_loss(mini_batch) diff --git a/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py new file mode 100644 index 0000000000..d57cd2f893 --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py @@ -0,0 +1,12 @@ +import numpy as np + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.reward_providers.base_reward_provider import BaseRewardProvider + + +class ExtrinsicRewardProvider(BaseRewardProvider): + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + return np.array(mini_batch["environment_rewards"], dtype=np.float32) + + def update(self, mini_batch: AgentBuffer) -> None: + pass diff --git a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py new file mode 100644 index 0000000000..9c114afe1f --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py @@ -0,0 +1,226 @@ +from typing import Optional +import numpy as np +import torch + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.reward_providers.base_reward_provider import BaseRewardProvider +from mlagents.trainers.settings import GAILSettings +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.reward_providers.curiosity_reward_provider import ( + actions_to_onehot, + Swish, +) +from mlagents.trainers.demo_loader import demo_to_buffer + + +class GAILRewardProvider(BaseRewardProvider): + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__(specs, settings) + self._discriminator_network = DiscriminatorNetwork(specs, settings) + _, self._demo_buffer = demo_to_buffer( + settings.demo_path, 1, specs + ) # This is supposed to be the sequence length but we do not have access here + params = list(self._discriminator_network.parameters()) + self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + estimates, _ = self._discriminator_network.compute_estimate( + mini_batch, use_vail_noise=False + ) + return -torch.log( + 1.0 - estimates * (1.0 - self._discriminator_network.EPSILON) + ) + + def update(self, mini_batch: AgentBuffer) -> None: + expert_batch = self._demo_buffer.sample_mini_batch( + mini_batch.num_experiences, 1 + ) + loss = self._discriminator_network.compute_loss(mini_batch, expert_batch) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + +class DiscriminatorNetwork(torch.nn.Module): + gradient_penalty_weight = 10.0 + z_size = 128 + alpha = 0.0005 + mutual_information = 0.5 + EPSILON = 1e-7 + initial_beta = 1.0 + + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: 
+ super().__init__() + self._policy_specs = specs + self._settings = settings + + # create the encoders for observations, actions and done + vec_obs_size = sum( + shape[0] for shape in specs.observation_shapes if len(shape) == 1 + ) + # vis_obs_shapes = [ + # shape for shape in specs.observation_shapes if len(shape) == 3 + # ] + + encoder_input_size = vec_obs_size # + visuals later and normalization + if settings.use_actions and specs.is_action_continuous(): + encoder_input_size += specs.action_size + 1 # + 1 is for done + if settings.use_actions and specs.is_action_discrete(): + encoder_input_size += ( + sum(specs.discrete_action_branches) + 1 + ) # + 1 is for done + + self.encoder = torch.nn.ModuleList( + [ + torch.nn.Linear(encoder_input_size, settings.encoding_size), + Swish(), # use swish + torch.nn.Linear(settings.encoding_size, settings.encoding_size), + ] + ) + estimator_input_size = settings.encoding_size + if settings.use_vail: + estimator_input_size = self.z_size + self.z_sigma = torch.nn.Parameter( + torch.ones((self.z_size), dtype=torch.float), requires_grad=True + ) + self.z_mu_layer = torch.nn.Linear(settings.encoding_size, self.z_size) + # self.mu_layer.weight.data Needs a variance scale initializer + self.beta = torch.tensor(self.initial_beta) + self.estimator = torch.nn.ModuleList( + [torch.nn.Linear(estimator_input_size, 1), torch.nn.Sigmoid()] + ) + + def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Creates the action Tensor. In continuous case, corresponds to the action. In + the discrete case, corresponds to the concatenation of one hot action Tensors. + """ + if self._policy_specs.is_action_continuous(): + return torch.as_tensor(mini_batch["actions"], dtype=torch.float) + if self._policy_specs.is_action_discrete(): + return torch.cat( + actions_to_onehot( + torch.as_tensor(mini_batch["actions"], dtype=torch.long), + self._policy_specs.discrete_action_branches, + ), + dim=1, + ) + + def get_obs_input(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Creates the observation input. + """ + # TODO : Visual observations + return torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float) + + def compute_estimate( + self, mini_batch: AgentBuffer, use_vail_noise: bool = False + ) -> torch.Tensor: + """ + Given a mini_batch, computes the estimate (How much the discriminator believes + the data was sampled from the demonstration data). + :param mini_batch: The AgentBuffer of data + :param use_vail_noise: Only when using VAIL : If true, will sample the code, if + false, will return the mean of the code. + """ + encoder_input = self.get_obs_input(mini_batch) + if self._settings.use_actions: + actions = self.get_action_input(mini_batch) + dones = torch.as_tensor(mini_batch["done"], dtype=torch.float) + encoder_input = torch.cat([encoder_input, actions, dones], dim=1) + for layer in self.encoder: + encoder_input = layer(encoder_input) + hidden = encoder_input + z_mu: Optional[torch.Tensor] = None + if self._settings.use_vail: + z_mu = self.z_mu_layer(hidden) + hidden = torch.normal(z_mu, self.z_sigma * use_vail_noise) + for layer in self.estimator: + hidden = layer(hidden) + estimate = hidden + return estimate, z_mu + + def compute_loss( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Given a policy mini_batch and an expert mini_batch, computes the loss of the discriminator. 
+ """ + policy_estimate, policy_mu = self.compute_estimate( + policy_batch, use_vail_noise=True + ) + expert_estimate, expert_mu = self.compute_estimate( + expert_batch, use_vail_noise=True + ) + loss = -( + (expert_estimate * (1 - self.EPSILON)).log() + + (1.0 - policy_estimate * (1 - self.EPSILON)).log() + ).mean() + if self._settings.use_vail: + # KL divergence loss (encourage latent representation to be normal) + kl_loss = torch.mean( + -torch.sum( + 1 + + (self.z_sigma ** 2).log() + - 0.5 * expert_mu ** 2 + - 0.5 * policy_mu ** 2 + - (self.z_sigma ** 2), + dim=1, + ) + ) + vail_loss = self.beta * (kl_loss - self.mutual_information) + with torch.no_grad(): + self.beta = torch.max( + self.beta + self.alpha * (kl_loss - self.mutual_information), + torch.tensor(0.0), + ) + loss += vail_loss + if self.gradient_penalty_weight > 0.0: + loss += self.gradient_penalty_weight * self.compute_gradient_magnitude( + policy_batch, expert_batch + ) + return loss + + def compute_gradient_magnitude( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp. + for off-policy. Compute gradients w.r.t randomly interpolated input. + """ + policy_obs = self.get_obs_input(policy_batch) + expert_obs = self.get_obs_input(expert_batch) + obs_epsilon = torch.rand(policy_obs.shape) + encoder_input = obs_epsilon * policy_obs + (1 - obs_epsilon) * expert_obs + if self._settings.use_actions: + policy_action = self.get_action_input(policy_batch) + expert_action = self.get_action_input(policy_batch) + action_epsilon = torch.rand(policy_action.shape) + policy_dones = torch.as_tensor(policy_batch["done"], dtype=torch.float) + expert_dones = torch.as_tensor(expert_batch["done"], dtype=torch.float) + dones_epsilon = torch.rand(policy_dones.shape) + encoder_input = torch.cat( + [ + encoder_input, + action_epsilon * policy_action + + (1 - action_epsilon) * expert_action, + dones_epsilon * policy_dones + (1 - dones_epsilon) * expert_dones, + ], + dim=1, + ) + for layer in self.encoder: + encoder_input = layer(encoder_input) + hidden = encoder_input + if self._settings.use_vail: + use_vail_noise = True + z_mu = self.z_mu_layer(hidden) + hidden = torch.normal(z_mu, self.z_sigma * use_vail_noise) + for layer in self.estimator: + hidden = layer(hidden) + estimate = torch.mean(torch.sum(hidden, dim=1)) + gradient = torch.autograd.grad(estimate, encoder_input)[0] + # Norm's gradient could be NaN at 0. 
Use our own safe_norm + safe_norm = (torch.sum(gradient ** 2, dim=1) + self.EPSILON).sqrt() + gradient_mag = torch.mean((safe_norm - 1) ** 2) + return gradient_mag diff --git a/ml-agents/mlagents/trainers/reward_providers/reward_provider_factory.py b/ml-agents/mlagents/trainers/reward_providers/reward_provider_factory.py new file mode 100644 index 0000000000..6a74c8cc9d --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/reward_provider_factory.py @@ -0,0 +1,39 @@ +from typing import Dict, Type +from mlagents.trainers.exception import UnityTrainerException + +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType + +from mlagents.trainers.reward_providers.base_reward_provider import BaseRewardProvider +from mlagents.trainers.reward_providers.extrinsic_reward_provider import ( + ExtrinsicRewardProvider, +) +from mlagents.trainers.reward_providers.curiosity_reward_provider import ( + CuriosityRewardProvider, +) +from mlagents.trainers.reward_providers.gail_reward_provider import GAILRewardProvider + +from mlagents_envs.base_env import BehaviorSpec + +NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = { + RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider, + RewardSignalType.CURIOSITY: CuriosityRewardProvider, + RewardSignalType.GAIL: GAILRewardProvider, +} + + +def create_reward_provider( + name: RewardSignalType, specs: BehaviorSpec, settings: RewardSignalSettings +) -> BaseRewardProvider: + """ + Creates a reward provider class based on the name and config entry provided as a dict. + :param name: The name of the reward signal + :param specs: The BehaviorSpecs of the policy + :param settings: The RewardSignalSettings for that reward signal + :return: The reward signal class instantiated + """ + rcls = NAME_TO_CLASS.get(name) + if not rcls: + raise UnityTrainerException(f"Unknown reward signal type {name}") + + class_inst = rcls(specs, settings) + return class_inst diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py new file mode 100644 index 0000000000..503fec3051 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest +import torch +from mlagents.trainers.reward_providers import ( + CuriosityRewardProvider, + create_reward_provider, +) +from mlagents_envs.base_env import BehaviorSpec, ActionType +from mlagents.trainers.settings import CuriositySettings, RewardSignalType +from mlagents.trainers.tests.test_reward_providers.utils import create_agent_buffer + +SEED = [42] + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_construction(behavior_spec: BehaviorSpec) -> None: + curiosity_settings = CuriositySettings(256, 0.01) + curiosity_settings.strength = 0.1 + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + assert curiosity_rp.strength == 0.1 + assert curiosity_rp.name == "Curiosity" + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_factory(behavior_spec: BehaviorSpec) -> None: + curiosity_settings = CuriositySettings(256, 0.01) + curiosity_rp = create_reward_provider( + RewardSignalType.CURIOSITY, behavior_spec, curiosity_settings + ) + assert curiosity_rp.name == 
"Curiosity" + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), + ], +) +def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + curiosity_settings = CuriositySettings(256, 0.01) + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + buffer = create_agent_buffer(behavior_spec, 1000) + reward_old = curiosity_rp.evaluate(buffer)[0] + for _ in range(10): + curiosity_rp.update(buffer) + reward_new = curiosity_rp.evaluate(buffer)[0] + assert reward_new < reward_old + reward_old = reward_new + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", [BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5)] +) +def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + curiosity_settings = CuriositySettings(256, 0.1) + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + buffer = create_agent_buffer(behavior_spec, 1000) + for _ in range(200): + curiosity_rp.update(buffer) + prediction = curiosity_rp._network.predict_action(buffer)[0].detach() + target = buffer["actions"][0] + error = float(torch.mean((prediction - target) ** 2)) + assert error < 0.001 + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), + ], +) +def test_next_state_prediction(behavior_spec: BehaviorSpec, seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + curiosity_settings = CuriositySettings(256, 0.1) + curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) + buffer = create_agent_buffer(behavior_spec, 1000) + for _ in range(100): + curiosity_rp.update(buffer) + prediction = curiosity_rp._network.predict_next_state(buffer)[0] + target = curiosity_rp._network.get_next_state(buffer)[0] + error = float(torch.mean((prediction - target) ** 2).detach()) + assert error < 0.001 diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_extrinsic.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_extrinsic.py new file mode 100644 index 0000000000..813aad20a1 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_extrinsic.py @@ -0,0 +1,54 @@ +import pytest +from mlagents.trainers.reward_providers import ( + ExtrinsicRewardProvider, + create_reward_provider, +) +from mlagents_envs.base_env import BehaviorSpec, ActionType +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType +from mlagents.trainers.tests.test_reward_providers.utils import create_agent_buffer + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_construction(behavior_spec: BehaviorSpec) -> None: + settings = RewardSignalSettings() + settings.gamma = 0.2 + extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings) + assert extrinsic_rp.gamma == 0.2 + assert extrinsic_rp.name == "Extrinsic" + + +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], 
ActionType.DISCRETE, (2, 3)), + ], +) +def test_factory(behavior_spec: BehaviorSpec) -> None: + settings = RewardSignalSettings() + extrinsic_rp = create_reward_provider( + RewardSignalType.EXTRINSIC, behavior_spec, settings + ) + assert extrinsic_rp.name == "Extrinsic" + + +@pytest.mark.parametrize("reward", [2.0, 3.0, 4.0]) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + ], +) +def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None: + buffer = create_agent_buffer(behavior_spec, 1000, reward) + settings = RewardSignalSettings() + extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings) + generated_rewards = extrinsic_rp.evaluate(buffer) + assert (generated_rewards == reward).all() diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py new file mode 100644 index 0000000000..ac6c473272 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py @@ -0,0 +1,129 @@ +from typing import Any +import numpy as np +import pytest +from unittest.mock import patch +import torch +import os +from mlagents.trainers.reward_providers import ( + GAILRewardProvider, + create_reward_provider, +) +from mlagents_envs.base_env import BehaviorSpec, ActionType +from mlagents.trainers.settings import GAILSettings, RewardSignalType +from mlagents.trainers.tests.test_reward_providers.utils import create_agent_buffer +from mlagents.trainers.reward_providers.gail_reward_provider import DiscriminatorNetwork + +CONTINUOUS_PATH = ( + os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir) + "/test.demo" +) +DISCRETE_PATH = ( + os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir) + + "/testdcvis.demo" +) +SEED = [42] + + +@pytest.mark.parametrize( + "behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)] +) +def test_construction(behavior_spec: BehaviorSpec) -> None: + gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH) + gail_rp = GAILRewardProvider(behavior_spec, gail_settings) + assert gail_rp.name == "GAIL" + + +@pytest.mark.parametrize( + "behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)] +) +def test_factory(behavior_spec: BehaviorSpec) -> None: + gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH) + gail_rp = create_reward_provider( + RewardSignalType.GAIL, behavior_spec, gail_settings + ) + assert gail_rp.name == "GAIL" + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2), + BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)), + ], +) +@pytest.mark.parametrize("use_actions", [False, True]) +@patch("mlagents.trainers.reward_providers.gail_reward_provider.demo_to_buffer") +def test_reward_decreases( + demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int +) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + buffer_expert = create_agent_buffer(behavior_spec, 1000) + buffer_policy = create_agent_buffer(behavior_spec, 1000) + demo_to_buffer.return_value = None, buffer_expert + gail_settings = GAILSettings( + demo_path="", learning_rate=0.05, use_vail=False, use_actions=use_actions + ) + gail_rp = create_reward_provider( + RewardSignalType.GAIL, behavior_spec, gail_settings + ) + + init_reward_expert = 
gail_rp.evaluate(buffer_expert)[0] + init_reward_policy = gail_rp.evaluate(buffer_policy)[0] + + for _ in range(10): + gail_rp.update(buffer_policy) + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert >= 0 # GAIL / VAIL reward always positive + assert reward_policy >= 0 + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert > reward_policy # Expert reward greater than non-expert reward + assert ( + reward_expert > init_reward_expert + ) # Expert reward getting better as network trains + assert ( + reward_policy < init_reward_policy + ) # Non-expert reward getting worse as network trains + + +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize( + "behavior_spec", + [ + BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2), + BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)), + ], +) +@pytest.mark.parametrize("use_actions", [False, True]) +@patch("mlagents.trainers.reward_providers.gail_reward_provider.demo_to_buffer") +def test_reward_decreases_vail( + demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int +) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + buffer_expert = create_agent_buffer(behavior_spec, 1000) + buffer_policy = create_agent_buffer(behavior_spec, 1000) + demo_to_buffer.return_value = None, buffer_expert + gail_settings = GAILSettings( + demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions + ) + DiscriminatorNetwork.initial_beta = 0.0 + # we must set the initial value of beta to 0 for testing + # If we do not, the kl-loss will dominate early and will block the estimator + gail_rp = create_reward_provider( + RewardSignalType.GAIL, behavior_spec, gail_settings + ) + + for _ in range(100): + gail_rp.update(buffer_policy) + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert >= 0 # GAIL / VAIL reward always positive + assert reward_policy >= 0 + reward_expert = gail_rp.evaluate(buffer_expert)[0] + reward_policy = gail_rp.evaluate(buffer_policy)[0] + assert reward_expert > reward_policy # Expert reward greater than non-expert reward diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py new file mode 100644 index 0000000000..704225950d --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py @@ -0,0 +1,31 @@ +import numpy as np +from mlagents.trainers.buffer import AgentBuffer +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.trajectory import SplitObservations + + +def create_agent_buffer( + behavior_spec: BehaviorSpec, number: int, reward: float = 0.0 +) -> AgentBuffer: + buffer = AgentBuffer() + curr_observations = [ + np.random.normal(size=shape) for shape in behavior_spec.observation_shapes + ] + next_observations = [ + np.random.normal(size=shape) for shape in behavior_spec.observation_shapes + ] + action = behavior_spec.create_random_action(1)[0, :] + for _ in range(number): + curr_split_obs = SplitObservations.from_observations(curr_observations) + next_split_obs = SplitObservations.from_observations(next_observations) + for i, _ in enumerate(curr_split_obs.visual_observations): + buffer["visual_obs%d" % i].append(curr_split_obs.visual_observations[i]) + buffer["next_visual_obs%d" % 
i].append( + next_split_obs.visual_observations[i] + ) + buffer["vector_obs"].append(curr_split_obs.vector_observations) + buffer["next_vector_in"].append(next_split_obs.vector_observations) + buffer["actions"].append(action) + buffer["done"].append(np.zeros(1)) + buffer["reward"].append(np.ones(1) * reward) + return buffer From 2df477d904eb07ed6c77269bd1f7bccfdbbf7ad3 Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Wed, 29 Jul 2020 15:33:06 -0700 Subject: [PATCH 02/10] Use NetworkBody to encode state in the reward providers --- .../curiosity_reward_provider.py | 168 +++++------------- .../reward_providers/gail_reward_provider.py | 95 +++++----- .../test_reward_providers/test_curiosity.py | 24 +-- .../tests/test_reward_providers/test_gail.py | 4 +- ml-agents/mlagents/trainers/torch/utils.py | 53 +++++- 5 files changed, 158 insertions(+), 186 deletions(-) diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py index e74c83bd77..245769fb63 100644 --- a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -1,4 +1,3 @@ -from typing import List import numpy as np import torch @@ -7,86 +6,18 @@ from mlagents.trainers.settings import CuriositySettings from mlagents_envs.base_env import BehaviorSpec - - -def swish(x): - """ - TODO : MOVE SOMEWHERE BETTER - """ - return x * torch.sigmoid(x) - - -class Swish(torch.nn.Module): - """ - TODO : MOVE SOMEWHERE BETTER - """ - - def forward(self, data: torch.Tensor) -> torch.Tensor: - return torch.mul(data, torch.sigmoid(data)) - - -def actions_to_onehot( - discrete_actions: torch.Tensor, action_size: List[torch.Tensor] -) -> List[torch.Tensor]: - """ - Splits a discrete action Tensor (of integers) into its one hot representations. - Returns a list of Tensors (One Tensor per branch) - :param discrete_actions: A Tensor of discrete actions. - :param action_size: List of ints containing the number of possible actions for each branch. - :return: A list of one hot Tensors (one or each branch). - """ - onehot_branches = [ - torch.nn.functional.one_hot(_act.T, action_size[i]).float() - for i, _act in enumerate(discrete_actions.long().T) - ] - return onehot_branches - - -def break_into_branches( - concatenated_logits: torch.Tensor, action_size: List[torch.Tensor] -) -> List[torch.Tensor]: - """ - Takes a concatenated set of logits that represent multiple discrete action branches - and breaks it up into one Tensor per branch. - :param concatenated_logits: Tensor that represents the concatenated action branches - :param action_size: List of ints containing the number of possible actions for each branch. - :return: A List of Tensors containing one tensor per branch. - """ - action_idx = [0] + list(np.cumsum(action_size)) - branched_logits = [ - concatenated_logits[:, action_idx[i] : action_idx[i + 1]] - for i in range(len(action_size)) - ] - return branched_logits - - -def dynamic_partition( - data: torch.Tensor, partitions: torch.Tensor, num_partitions: int -) -> List[torch.Tensor]: - """ - Torch implementation of dynamic_partition : - https://www.tensorflow.org/api_docs/python/tf/dynamic_partition - Splits the data Tensor input into num_partitions Tensors according to the indices in - partitions. - :param data: The Tensor data that will be split into partitions. 
- :param partitions: An indices tensor that determines in which partition each element - of data will be in. - :param num_partitions: The number of partitions to output. Corresponds to the - maximum possible index in the partitions argument. - :return: A list of Tensor partitions (Their indices correspond to their partition index). - """ - res: List[torch.Tensor] = [] - for i in range(num_partitions): - res += [data[(partitions == i).nonzero().squeeze(1)]] - return res +from mlagents.trainers.torch.utils import ModelUtils +from mlagents.trainers.torch.networks import NetworkBody +from mlagents.trainers.settings import NetworkSettings, EncoderType class CuriosityRewardProvider(BaseRewardProvider): def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: super().__init__(specs, settings) self._network = CuriosityNetwork(specs, settings) - params = list(self._network.parameters()) - self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate) + self.optimizer = torch.optim.Adam( + self._network.parameters(), lr=settings.learning_rate + ) def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: with torch.no_grad(): @@ -107,60 +38,57 @@ class CuriosityNetwork(torch.nn.Module): def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: super().__init__() - vec_obs_size = sum( - shape[0] for shape in specs.observation_shapes if len(shape) == 1 - ) - # vis_obs_shapes = [shape for shape in specs.observation_shapes if len(shape) == 3] self._policy_specs = specs + state_encoder_settings = NetworkSettings( + normalize=False, + hidden_units=settings.encoding_size, + num_layers=2, + vis_encode_type=EncoderType.SIMPLE, + memory=None, + ) + self._state_encoder = NetworkBody( + specs.observation_shapes, state_encoder_settings + ) - obs_size = vec_obs_size # Only vector for now - if obs_size > 0: - self.vec_encode_1 = torch.nn.Linear(obs_size, settings.encoding_size) - self.vec_encode_last = torch.nn.Linear( - settings.encoding_size, settings.encoding_size - ) - # TODO : The vector obs (Use networkBody from models_torch.py) - if self._policy_specs.is_action_continuous(): - self.inverse_model_action_predition = torch.nn.Linear( - 2 * settings.encoding_size, self._policy_specs.action_size - ) - self.forward_model_next_state_prediction = torch.nn.Linear( - settings.encoding_size + self._policy_specs.action_size, - settings.encoding_size, - ) - else: - self.inverse_model_action_predition = torch.nn.Linear( - 2 * settings.encoding_size, - sum(self._policy_specs.discrete_action_branches), - ) - self.forward_model_next_state_prediction = torch.nn.Linear( - settings.encoding_size - + sum(self._policy_specs.discrete_action_branches), - settings.encoding_size, - ) + self._action_flattener = ModelUtils.ActionFlattener(specs) + + self.inverse_model_action_predition = torch.nn.Linear( + 2 * settings.encoding_size, self._action_flattener.flattened_size + ) + + self.forward_model_next_state_prediction = torch.nn.Linear( + settings.encoding_size + self._action_flattener.flattened_size, + settings.encoding_size, + ) def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Extracts the current state embedding from a mini_batch. 
""" - hidden = self.vec_encode_1( - torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float) + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float)], + vis_inputs=[ + torch.as_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float) + for i in range(n_vis) + ], ) - # TODO do visual - hidden = swish(hidden) - hidden = self.vec_encode_last(hidden) return hidden def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Extracts the next state embedding from a mini_batch. """ - hidden = self.vec_encode_1( - torch.as_tensor(mini_batch["next_vector_in"], dtype=torch.float) + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[ + torch.as_tensor(mini_batch["next_vector_in"], dtype=torch.float) + ], + vis_inputs=[ + torch.as_tensor(mini_batch["next_visual_obs%d" % i], dtype=torch.float) + for i in range(n_vis) + ], ) - # TODO do visual - hidden = swish(hidden) - hidden = self.vec_encode_last(hidden) return hidden def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -171,12 +99,12 @@ def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: inverse_model_input = torch.cat( (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1 ) - inverse_model_input = swish(inverse_model_input) + inverse_model_input = ModelUtils.swish(inverse_model_input) hidden = self.inverse_model_action_predition(inverse_model_input) if self._policy_specs.is_action_continuous(): return hidden else: - branches = break_into_branches( + branches = ModelUtils.break_into_branches( hidden, self._policy_specs.discrete_action_branches ) branches = [torch.softmax(b, dim=1) for b in branches] @@ -191,7 +119,7 @@ def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: action = torch.as_tensor(mini_batch["actions"], dtype=torch.float) else: action = torch.cat( - actions_to_onehot( + ModelUtils.actions_to_onehot( torch.as_tensor(mini_batch["actions"], dtype=torch.long), self._policy_specs.discrete_action_branches, ), @@ -200,7 +128,7 @@ def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: forward_model_input = torch.cat( (self.get_current_state(mini_batch), action), dim=1 ) - forward_model_input = swish(forward_model_input) + forward_model_input = ModelUtils.swish(forward_model_input) return self.forward_model_next_state_prediction(forward_model_input) def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -218,7 +146,7 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: return torch.mean(sq_difference) else: true_action = torch.cat( - actions_to_onehot( + ModelUtils.actions_to_onehot( torch.as_tensor(mini_batch["actions"], dtype=torch.long), self._policy_specs.discrete_action_branches, ), @@ -228,7 +156,7 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: -torch.log(predicted_action + self.EPSILON) * true_action, dim=1 ) return torch.mean( - dynamic_partition( + ModelUtils.dynamic_partition( cross_entropy, torch.as_tensor(mini_batch["action_mask"], dtype=torch.float), 2, diff --git a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py index 9c114afe1f..a079c38c95 100644 --- a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py @@ 
-6,10 +6,9 @@ from mlagents.trainers.reward_providers.base_reward_provider import BaseRewardProvider from mlagents.trainers.settings import GAILSettings from mlagents_envs.base_env import BehaviorSpec -from mlagents.trainers.reward_providers.curiosity_reward_provider import ( - actions_to_onehot, - Swish, -) +from mlagents.trainers.torch.utils import ModelUtils +from mlagents.trainers.torch.networks import NetworkBody +from mlagents.trainers.settings import NetworkSettings, EncoderType from mlagents.trainers.demo_loader import demo_to_buffer @@ -55,28 +54,29 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: self._policy_specs = specs self._settings = settings - # create the encoders for observations, actions and done - vec_obs_size = sum( - shape[0] for shape in specs.observation_shapes if len(shape) == 1 + state_encoder_settings = NetworkSettings( + normalize=False, + hidden_units=settings.encoding_size, + num_layers=2, + vis_encode_type=EncoderType.SIMPLE, + memory=None, ) - # vis_obs_shapes = [ - # shape for shape in specs.observation_shapes if len(shape) == 3 - # ] - - encoder_input_size = vec_obs_size # + visuals later and normalization - if settings.use_actions and specs.is_action_continuous(): - encoder_input_size += specs.action_size + 1 # + 1 is for done - if settings.use_actions and specs.is_action_discrete(): + self._state_encoder = NetworkBody( + specs.observation_shapes, state_encoder_settings + ) + + self._action_flattener = ModelUtils.ActionFlattener(specs) + + encoder_input_size = settings.encoding_size + if settings.use_actions: encoder_input_size += ( - sum(specs.discrete_action_branches) + 1 + self._action_flattener.flattened_size + 1 ) # + 1 is for done - self.encoder = torch.nn.ModuleList( - [ - torch.nn.Linear(encoder_input_size, settings.encoding_size), - Swish(), # use swish - torch.nn.Linear(settings.encoding_size, settings.encoding_size), - ] + self.encoder = torch.nn.Sequential( + torch.nn.Linear(encoder_input_size, settings.encoding_size), + ModelUtils.SwishLayer(), + torch.nn.Linear(settings.encoding_size, settings.encoding_size), ) estimator_input_size = settings.encoding_size if settings.use_vail: @@ -87,8 +87,8 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: self.z_mu_layer = torch.nn.Linear(settings.encoding_size, self.z_size) # self.mu_layer.weight.data Needs a variance scale initializer self.beta = torch.tensor(self.initial_beta) - self.estimator = torch.nn.ModuleList( - [torch.nn.Linear(estimator_input_size, 1), torch.nn.Sigmoid()] + self.estimator = torch.nn.Sequential( + torch.nn.Linear(estimator_input_size, 1), torch.nn.Sigmoid() ) def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -96,23 +96,23 @@ def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: Creates the action Tensor. In continuous case, corresponds to the action. In the discrete case, corresponds to the concatenation of one hot action Tensors. 
""" - if self._policy_specs.is_action_continuous(): - return torch.as_tensor(mini_batch["actions"], dtype=torch.float) - if self._policy_specs.is_action_discrete(): - return torch.cat( - actions_to_onehot( - torch.as_tensor(mini_batch["actions"], dtype=torch.long), - self._policy_specs.discrete_action_branches, - ), - dim=1, - ) + return self._action_flattener.forward( + torch.as_tensor(mini_batch["actions"], dtype=torch.float) + ) - def get_obs_input(self, mini_batch: AgentBuffer) -> torch.Tensor: + def get_state_encoding(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Creates the observation input. """ - # TODO : Visual observations - return torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float) + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float)], + vis_inputs=[ + torch.as_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float) + for i in range(n_vis) + ], + ) + return hidden def compute_estimate( self, mini_batch: AgentBuffer, use_vail_noise: bool = False @@ -124,21 +124,17 @@ def compute_estimate( :param use_vail_noise: Only when using VAIL : If true, will sample the code, if false, will return the mean of the code. """ - encoder_input = self.get_obs_input(mini_batch) + encoder_input = self.get_state_encoding(mini_batch) if self._settings.use_actions: actions = self.get_action_input(mini_batch) dones = torch.as_tensor(mini_batch["done"], dtype=torch.float) encoder_input = torch.cat([encoder_input, actions, dones], dim=1) - for layer in self.encoder: - encoder_input = layer(encoder_input) - hidden = encoder_input + hidden = self.encoder(encoder_input) z_mu: Optional[torch.Tensor] = None if self._settings.use_vail: z_mu = self.z_mu_layer(hidden) hidden = torch.normal(z_mu, self.z_sigma * use_vail_noise) - for layer in self.estimator: - hidden = layer(hidden) - estimate = hidden + estimate = self.estimator(hidden) return estimate, z_mu def compute_loss( @@ -189,8 +185,8 @@ def compute_gradient_magnitude( Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp. for off-policy. Compute gradients w.r.t randomly interpolated input. """ - policy_obs = self.get_obs_input(policy_batch) - expert_obs = self.get_obs_input(expert_batch) + policy_obs = self.get_state_encoding(policy_batch) + expert_obs = self.get_state_encoding(expert_batch) obs_epsilon = torch.rand(policy_obs.shape) encoder_input = obs_epsilon * policy_obs + (1 - obs_epsilon) * expert_obs if self._settings.use_actions: @@ -209,15 +205,12 @@ def compute_gradient_magnitude( ], dim=1, ) - for layer in self.encoder: - encoder_input = layer(encoder_input) - hidden = encoder_input + hidden = self.encoder(encoder_input) if self._settings.use_vail: use_vail_noise = True z_mu = self.z_mu_layer(hidden) hidden = torch.normal(z_mu, self.z_sigma * use_vail_noise) - for layer in self.estimator: - hidden = layer(hidden) + hidden = self.estimator(hidden) estimate = torch.mean(torch.sum(hidden, dim=1)) gradient = torch.autograd.grad(estimate, encoder_input)[0] # Norm's gradient could be NaN at 0. 
Use our own safe_norm diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py index 503fec3051..45ac0e3d9e 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py @@ -20,7 +20,7 @@ ], ) def test_construction(behavior_spec: BehaviorSpec) -> None: - curiosity_settings = CuriositySettings(256, 0.01) + curiosity_settings = CuriositySettings(32, 0.01) curiosity_settings.strength = 0.1 curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) assert curiosity_rp.strength == 0.1 @@ -31,11 +31,13 @@ def test_construction(behavior_spec: BehaviorSpec) -> None: "behavior_spec", [ BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), - BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,), (64, 66, 1)], ActionType.DISCRETE, (2, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), ], ) def test_factory(behavior_spec: BehaviorSpec) -> None: - curiosity_settings = CuriositySettings(256, 0.01) + curiosity_settings = CuriositySettings(32, 0.01) curiosity_rp = create_reward_provider( RewardSignalType.CURIOSITY, behavior_spec, curiosity_settings ) @@ -46,7 +48,7 @@ def test_factory(behavior_spec: BehaviorSpec) -> None: @pytest.mark.parametrize( "behavior_spec", [ - BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ActionType.CONTINUOUS, 5), BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), ], @@ -54,9 +56,9 @@ def test_factory(behavior_spec: BehaviorSpec) -> None: def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None: np.random.seed(seed) torch.manual_seed(seed) - curiosity_settings = CuriositySettings(256, 0.01) + curiosity_settings = CuriositySettings(32, 0.01) curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) - buffer = create_agent_buffer(behavior_spec, 1000) + buffer = create_agent_buffer(behavior_spec, 5) reward_old = curiosity_rp.evaluate(buffer)[0] for _ in range(10): curiosity_rp.update(buffer) @@ -72,9 +74,9 @@ def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None: def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> None: np.random.seed(seed) torch.manual_seed(seed) - curiosity_settings = CuriositySettings(256, 0.1) + curiosity_settings = CuriositySettings(32, 0.1) curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) - buffer = create_agent_buffer(behavior_spec, 1000) + buffer = create_agent_buffer(behavior_spec, 5) for _ in range(200): curiosity_rp.update(buffer) prediction = curiosity_rp._network.predict_action(buffer)[0].detach() @@ -87,7 +89,7 @@ def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> @pytest.mark.parametrize( "behavior_spec", [ - BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5), + BehaviorSpec([(10,), (64, 66, 3)], ActionType.CONTINUOUS, 5), BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)), BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)), ], @@ -95,9 +97,9 @@ def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> def test_next_state_prediction(behavior_spec: BehaviorSpec, seed: int) -> None: np.random.seed(seed) torch.manual_seed(seed) - curiosity_settings = CuriositySettings(256, 0.1) + 
curiosity_settings = CuriositySettings(32, 0.1) curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) - buffer = create_agent_buffer(behavior_spec, 1000) + buffer = create_agent_buffer(behavior_spec, 5) for _ in range(100): curiosity_rp.update(buffer) prediction = curiosity_rp._network.predict_next_state(buffer)[0] diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py index ac6c473272..323ba7f3c9 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_gail.py @@ -47,7 +47,7 @@ def test_factory(behavior_spec: BehaviorSpec) -> None: @pytest.mark.parametrize( "behavior_spec", [ - BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2), + BehaviorSpec([(8,), (24, 26, 1)], ActionType.CONTINUOUS, 2), BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)), BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)), ], @@ -94,7 +94,7 @@ def test_reward_decreases( "behavior_spec", [ BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2), - BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)), + BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3, 3, 3)), BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)), ], ) diff --git a/ml-agents/mlagents/trainers/torch/utils.py b/ml-agents/mlagents/trainers/torch/utils.py index d628ac3ebf..c8d92d12c1 100644 --- a/ml-agents/mlagents/trainers/torch/utils.py +++ b/ml-agents/mlagents/trainers/torch/utils.py @@ -12,6 +12,7 @@ ) from mlagents.trainers.settings import EncoderType from mlagents.trainers.exception import UnityTrainerException +from mlagents_envs.base_env import BehaviorSpec class ModelUtils: @@ -28,6 +29,33 @@ def swish(input_activation: torch.Tensor) -> torch.Tensor: """Swish activation function. For more info: https://arxiv.org/abs/1710.05941""" return torch.mul(input_activation, torch.sigmoid(input_activation)) + class SwishLayer(torch.nn.Module): + def forward(self, data: torch.Tensor) -> torch.Tensor: + return torch.mul(data, torch.sigmoid(data)) + + class ActionFlattener: + def __init__(self, behavior_spec: BehaviorSpec): + self._specs = behavior_spec + + @property + def flattened_size(self) -> int: + if self._specs.is_action_continuous(): + return self._specs.action_size + else: + return sum(self._specs.discrete_action_branches) + + def forward(self, action: torch.Tensor) -> torch.Tensor: + if self._specs.is_action_continuous(): + return action + else: + return torch.cat( + ModelUtils.actions_to_onehot( + torch.as_tensor(action, dtype=torch.long), + self._specs.discrete_action_branches, + ), + dim=1, + ) + @staticmethod def get_encoder_for_type(encoder_type: EncoderType) -> nn.Module: ENCODER_FUNCTION_BY_TYPE = { @@ -135,7 +163,28 @@ def actions_to_onehot( discrete_actions: torch.Tensor, action_size: List[int] ) -> List[torch.Tensor]: onehot_branches = [ - torch.nn.functional.one_hot(_act.T, action_size[i]) - for i, _act in enumerate(discrete_actions.T) + torch.nn.functional.one_hot(_act.T, action_size[i]).float() + for i, _act in enumerate(discrete_actions.long().T) ] return onehot_branches + + @staticmethod + def dynamic_partition( + data: torch.Tensor, partitions: torch.Tensor, num_partitions: int + ) -> List[torch.Tensor]: + """ + Torch implementation of dynamic_partition : + https://www.tensorflow.org/api_docs/python/tf/dynamic_partition + Splits the data Tensor input into num_partitions Tensors according to the indices in + partitions. 
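The dynamic_partition helper described above reduces to boolean-mask indexing in torch. A quick self-contained sketch of the same behaviour on toy data (values made up for illustration):

    import torch

    data = torch.tensor([10.0, 20.0, 30.0, 40.0])
    partitions = torch.tensor([0, 1, 0, 1])

    parts = [data[(partitions == i).nonzero().squeeze(1)] for i in range(2)]
    # parts[0] -> tensor([10., 30.])    parts[1] -> tensor([20., 40.])
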
+ :param data: The Tensor data that will be split into partitions. + :param partitions: An indices tensor that determines in which partition each element + of data will be in. + :param num_partitions: The number of partitions to output. Corresponds to the + maximum possible index in the partitions argument. + :return: A list of Tensor partitions (Their indices correspond to their partition index). + """ + res: List[torch.Tensor] = [] + for i in range(num_partitions): + res += [data[(partitions == i).nonzero().squeeze(1)]] + return res From bf6f5877f94002e61004a576b4ee8f4eb349260e Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Wed, 29 Jul 2020 16:09:44 -0700 Subject: [PATCH 03/10] Integrating the reward prodiders with ppo and torch --- .../trainers/optimizer/torch_optimizer.py | 26 +++++++------------ .../mlagents/trainers/ppo/optimizer_torch.py | 3 +++ ml-agents/mlagents/trainers/ppo/trainer.py | 24 ++++++++++++----- .../curiosity_reward_provider.py | 1 + .../reward_providers/gail_reward_provider.py | 1 + .../mlagents/trainers/trainer/rl_trainer.py | 13 +++++++--- 6 files changed, 43 insertions(+), 25 deletions(-) diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py index e76edb97b5..7a2b87f6e5 100644 --- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py +++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py @@ -5,12 +5,11 @@ from mlagents.trainers.buffer import AgentBuffer from mlagents.trainers.components.bc.module import BCModule -from mlagents.trainers.components.reward_signals.extrinsic.signal import ( - ExtrinsicRewardSignal, -) +from mlagents.trainers.reward_providers import create_reward_provider + from mlagents.trainers.policy.torch_policy import TorchPolicy from mlagents.trainers.optimizer import Optimizer -from mlagents.trainers.settings import TrainerSettings, RewardSignalType +from mlagents.trainers.settings import TrainerSettings from mlagents.trainers.trajectory import SplitObservations from mlagents.trainers.torch.utils import ModelUtils @@ -37,16 +36,11 @@ def create_reward_signals(self, reward_signal_configs): Create reward signals :param reward_signal_configs: Reward signal config. """ - extrinsic_signal = ExtrinsicRewardSignal( - self.policy, reward_signal_configs[RewardSignalType.EXTRINSIC] - ) - self.reward_signals = {RewardSignalType.EXTRINSIC.value: extrinsic_signal} - # Create reward signals - # for reward_signal, config in reward_signal_configs.items(): - # self.reward_signals[reward_signal] = create_reward_signal( - # self.policy, reward_signal, config - # ) - # self.update_dict.update(self.reward_signals[reward_signal].update_dict) + for reward_signal, settings in reward_signal_configs.items(): + # Name reward signals by string in case we have duplicates later + self.reward_signals[reward_signal.value] = create_reward_provider( + reward_signal, self.policy.behavior_spec, settings + ) def get_value_estimates( self, decision_requests: DecisionSteps, idx: int, done: bool @@ -72,7 +66,7 @@ def get_value_estimates( # If we're done, reassign all of the value estimates that need terminal states. 
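The terminal-state handling switches from use_terminal_states to the inverted ignore_done flag: at episode end, value estimates are zeroed only for providers that do not ignore dones. A compact sketch of that logic with hypothetical stand-in providers:

    # Hypothetical stand-ins for the optimizer's reward_signals mapping.
    class FakeProvider:
        def __init__(self, ignore_done: bool):
            self.ignore_done = ignore_done

    reward_signals = {"extrinsic": FakeProvider(False), "gail": FakeProvider(True)}
    value_estimates = {"extrinsic": 1.3, "gail": 0.7}

    done = True
    if done:
        for k in value_estimates:
            if not reward_signals[k].ignore_done:
                value_estimates[k] = 0.0
    # -> {"extrinsic": 0.0, "gail": 0.7}
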
if done: for k in value_estimates: - if self.reward_signals[k].use_terminal_states: + if not self.reward_signals[k].ignore_done: value_estimates[k] = 0.0 return value_estimates @@ -111,7 +105,7 @@ def get_trajectory_value_estimates( if done: for k in next_value_estimate: - if self.reward_signals[k].use_terminal_states: + if not self.reward_signals[k].ignore_done: next_value_estimate[k] = 0.0 return value_estimates, next_value_estimate diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 0c92f35d8b..17d58fdb71 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -149,4 +149,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: "Losses/Value Loss": value_loss.detach().cpu().numpy(), } + for reward_provider in self.reward_signals.values(): + reward_provider.update(batch) + return update_stats diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index f0f115497b..bb79e82477 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -86,18 +86,30 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: for name, v in value_estimates.items(): agent_buffer_trajectory[f"{name}_value_estimates"].extend(v) - self._stats_reporter.add_stat( - self.optimizer.reward_signals[name].value_name, np.mean(v) - ) + if hasattr(self.optimizer.reward_signals[name], "value_name"): + self._stats_reporter.add_stat( + self.optimizer.reward_signals[name].value_name, np.mean(v) + ) + else: + self._stats_reporter.add_stat( + self.optimizer.reward_signals[name].name + "Value Estimate", + np.mean(v), + ) # Evaluate all reward functions self.collected_rewards["environment"][agent_id] += np.sum( agent_buffer_trajectory["environment_rewards"] ) for name, reward_signal in self.optimizer.reward_signals.items(): - evaluate_result = reward_signal.evaluate_batch( - agent_buffer_trajectory - ).scaled_reward + if hasattr(reward_signal, "evaluate_batch"): + evaluate_result = reward_signal.evaluate_batch( + agent_buffer_trajectory + ).scaled_reward + else: + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) + * reward_signal.strength + ) agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result) # Report the reward signals self.collected_rewards[name][agent_id] += np.sum(evaluate_result) diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py index 245769fb63..2b9424740c 100644 --- a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -14,6 +14,7 @@ class CuriosityRewardProvider(BaseRewardProvider): def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: super().__init__(specs, settings) + self._ignore_done = True self._network = CuriosityNetwork(specs, settings) self.optimizer = torch.optim.Adam( self._network.parameters(), lr=settings.learning_rate diff --git a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py index a079c38c95..85911e70a6 100644 --- a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py @@ -15,6 +15,7 @@ class 
GAILRewardProvider(BaseRewardProvider): def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: super().__init__(specs, settings) + self._ignore_done = True self._discriminator_network = DiscriminatorNetwork(specs, settings) _, self._demo_buffer = demo_to_buffer( settings.demo_path, 1, specs diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index fdd90e11b3..03da6bb964 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -77,9 +77,16 @@ def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None self.reward_buffer.appendleft(rewards.get(agent_id, 0)) rewards[agent_id] = 0 else: - self.stats_reporter.add_stat( - optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0) - ) + if hasattr(optimizer.reward_signals[name], "stat_name"): + self.stats_reporter.add_stat( + optimizer.reward_signals[name].stat_name, + rewards.get(agent_id, 0), + ) + else: + self.stats_reporter.add_stat( + optimizer.reward_signals[name].name + "Reward", + rewards.get(agent_id, 0), + ) rewards[agent_id] = 0 def _clear_update_buffer(self) -> None: From 29f5d4a2f1b608fb0332a7a9cdd9d369ad773b7b Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Fri, 31 Jul 2020 11:11:41 -0700 Subject: [PATCH 04/10] work in progress, integration with PPO. Not training properly Pyramids at the moment --- .../curiosity_reward_provider.py | 46 +++++++++++++----- .../reward_providers/gail_reward_provider.py | 9 +++- .../mlagents/trainers/sac/optimizer_torch.py | 5 +- ml-agents/mlagents/trainers/sac/trainer.py | 47 +++++++++++++------ .../tests/test_reward_providers/utils.py | 1 + .../trainers/tests/test_reward_signals.py | 4 +- 6 files changed, 81 insertions(+), 31 deletions(-) diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py index 2b9424740c..ca9cad6070 100644 --- a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -53,13 +53,18 @@ def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: self._action_flattener = ModelUtils.ActionFlattener(specs) - self.inverse_model_action_predition = torch.nn.Linear( - 2 * settings.encoding_size, self._action_flattener.flattened_size + self.inverse_model_action_predition = torch.nn.Sequential( + torch.nn.Linear(2 * settings.encoding_size, 256), + ModelUtils.SwishLayer(), + torch.nn.Linear(256, self._action_flattener.flattened_size), ) - self.forward_model_next_state_prediction = torch.nn.Linear( - settings.encoding_size + self._action_flattener.flattened_size, - settings.encoding_size, + self.forward_model_next_state_prediction = torch.nn.Sequential( + torch.nn.Linear( + settings.encoding_size + self._action_flattener.flattened_size, 256 + ), + ModelUtils.SwishLayer(), + torch.nn.Linear(256, settings.encoding_size), ) def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -100,7 +105,6 @@ def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: inverse_model_input = torch.cat( (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1 ) - inverse_model_input = ModelUtils.swish(inverse_model_input) hidden = self.inverse_model_action_predition(inverse_model_input) if self._policy_specs.is_action_continuous(): return hidden @@ -129,7 +133,7 @@ def 
predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: forward_model_input = torch.cat( (self.get_current_state(mini_batch), action), dim=1 ) - forward_model_input = ModelUtils.swish(forward_model_input) + return self.forward_model_next_state_prediction(forward_model_input) def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -144,7 +148,13 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: - predicted_action ) ** 2 sq_difference = torch.sum(sq_difference, dim=1) - return torch.mean(sq_difference) + return torch.mean( + ModelUtils.dynamic_partition( + sq_difference, + torch.as_tensor(mini_batch["masks"], dtype=torch.float), + 2, + )[1] + ) else: true_action = torch.cat( ModelUtils.actions_to_onehot( @@ -159,10 +169,13 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: return torch.mean( ModelUtils.dynamic_partition( cross_entropy, - torch.as_tensor(mini_batch["action_mask"], dtype=torch.float), + torch.as_tensor( + mini_batch["masks"], dtype=torch.float + ), # use masks not action_masks 2, )[1] ) + # return torch.mean(cross_entropy) def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: """ @@ -170,7 +183,9 @@ def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: between the predicted and actual next state. """ predicted_next_state = self.predict_next_state(mini_batch) - sq_difference = (self.get_next_state(mini_batch) - predicted_next_state) ** 2 + sq_difference = ( + 0.5 * (self.get_next_state(mini_batch) - predicted_next_state) ** 2 + ) sq_difference = torch.sum(sq_difference, dim=1) return sq_difference @@ -178,12 +193,21 @@ def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Computes the loss for the next state prediction """ - return torch.mean(self.compute_reward(mini_batch)) + return torch.mean( + ModelUtils.dynamic_partition( + self.compute_reward(mini_batch), + torch.as_tensor(mini_batch["masks"], dtype=torch.float), + 2, + )[1] + ) def compute_losses(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Computes the weighted sum of inverse and forward loss. 
""" + print( + self.compute_forward_loss(mini_batch), self.compute_inverse_loss(mini_batch) + ) return self.forward_loss_weight * self.compute_forward_loss( mini_batch ) + self.inverse_loss_weight * self.compute_inverse_loss(mini_batch) diff --git a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py index 85911e70a6..6dba81bdd4 100644 --- a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py @@ -28,8 +28,13 @@ def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: estimates, _ = self._discriminator_network.compute_estimate( mini_batch, use_vail_noise=False ) - return -torch.log( - 1.0 - estimates * (1.0 - self._discriminator_network.EPSILON) + return ( + -torch.log( + 1.0 - estimates * (1.0 - self._discriminator_network.EPSILON) + ) + .detach() + .cpu() + .numpy() ) def update(self, mini_batch: AgentBuffer) -> None: diff --git a/ml-agents/mlagents/trainers/sac/optimizer_torch.py b/ml-agents/mlagents/trainers/sac/optimizer_torch.py index b5653a9f65..40ee1eb250 100644 --- a/ml-agents/mlagents/trainers/sac/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/sac/optimizer_torch.py @@ -89,7 +89,7 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings): # Use to reduce "survivor bonus" when using Curiosity or GAIL. self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()] self.use_dones_in_backup = { - name: int(self.reward_signals[name].use_terminal_states) + name: int(not self.reward_signals[name].ignore_done) for name in self.stream_names } @@ -461,6 +461,9 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: .numpy(), } + for signal in self.reward_signals.values(): + signal.update(batch) + return update_stats def update_reward_signals( diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index fea60e143d..d02ebddd13 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -139,9 +139,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory["environment_rewards"] ) for name, reward_signal in self.optimizer.reward_signals.items(): - evaluate_result = reward_signal.evaluate_batch( - agent_buffer_trajectory - ).scaled_reward + if hasattr(reward_signal, "evaluate_batch"): + evaluate_result = reward_signal.evaluate_batch( + agent_buffer_trajectory + ).scaled_reward + else: + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) + * reward_signal.strength + ) # Report the reward signals self.collected_rewards[name][agent_id] += np.sum(evaluate_result) @@ -150,9 +156,14 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached ) for name, v in value_estimates.items(): - self._stats_reporter.add_stat( - self.optimizer.reward_signals[name].value_name, np.mean(v) - ) + if hasattr(self.optimizer.reward_signals[name], "value_name"): + self._stats_reporter.add_stat( + self.optimizer.reward_signals[name].value_name, np.mean(v) + ) + else: + self._stats_reporter.add_stat( + self.optimizer.reward_signals[name].name + "Value", np.mean(v) + ) # Bootstrap using the last step rather than the bootstrap step if max step is reached. # Set last element to duplicate obs and remove dones. 
@@ -273,9 +284,14 @@ def _update_sac_policy(self) -> bool: ) # Get rewards for each reward for name, signal in self.optimizer.reward_signals.items(): - sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch( - sampled_minibatch - ).scaled_reward + if hasattr(signal, "evaluate_batch"): + sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch( + sampled_minibatch + ).scaled_reward + else: + sampled_minibatch[f"{name}_rewards"] = ( + signal.evaluate(sampled_minibatch) * signal.strength + ) update_stats = self.optimizer.update(sampled_minibatch, n_sequences) for stat_name, value in update_stats.items(): @@ -322,12 +338,13 @@ def _update_reward_signals(self) -> None: reward_signal_minibatches = {} for name, signal in self.optimizer.reward_signals.items(): logger.debug(f"Updating {name} at step {self.step}") - # Some signals don't need a minibatch to be sampled - so we don't! - if signal.update_dict: - reward_signal_minibatches[name] = buffer.sample_mini_batch( - self.hyperparameters.batch_size, - sequence_length=self.policy.sequence_length, - ) + if hasattr(signal, "update_dict"): + # Some signals don't need a minibatch to be sampled - so we don't! + if signal.update_dict: + reward_signal_minibatches[name] = buffer.sample_mini_batch( + self.hyperparameters.batch_size, + sequence_length=self.policy.sequence_length, + ) update_stats = self.optimizer.update_reward_signals( reward_signal_minibatches, n_sequences ) diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py index 704225950d..c50ab68a6a 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/utils.py @@ -28,4 +28,5 @@ def create_agent_buffer( buffer["actions"].append(action) buffer["done"].append(np.zeros(1)) buffer["reward"].append(np.ones(1) * reward) + buffer["masks"].append(np.ones(1)) return buffer diff --git a/ml-agents/mlagents/trainers/tests/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/test_reward_signals.py index d13cb2674b..5ccbfe8836 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py @@ -4,7 +4,7 @@ import mlagents.trainers.tests.mock_brain as mb from mlagents.trainers.policy.tf_policy import TFPolicy from mlagents.trainers.sac.optimizer import SACOptimizer -from mlagents.trainers.ppo.optimizer import PPOOptimizer +from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG from mlagents.trainers.settings import ( GAILSettings, @@ -75,7 +75,7 @@ def create_optimizer_mock( if trainer_settings.trainer_type == TrainerType.SAC: optimizer = SACOptimizer(policy, trainer_settings) else: - optimizer = PPOOptimizer(policy, trainer_settings) + optimizer = TFPPOOptimizer(policy, trainer_settings) return optimizer From 0a5e4c9d1751edb2fabd45c7d3555e84bd43fbfe Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Mon, 3 Aug 2020 10:44:54 -0700 Subject: [PATCH 05/10] Integration in PPO --- .../mlagents/trainers/ppo/optimizer_torch.py | 2 +- ml-agents/mlagents/trainers/ppo/trainer.py | 3 +- .../base_reward_provider copy.py | 72 ++++++++ .../reward_providers/base_reward_provider.py | 4 +- .../curiosity_reward_provider.py | 155 ++++++++++-------- .../extrinsic_reward_provider.py | 5 +- .../reward_providers/gail_reward_provider.py | 5 +- ml-agents/mlagents/trainers/sac/trainer.py | 2 +- 
.../test_reward_providers/test_curiosity.py | 1 + ml-agents/mlagents/trainers/torch/encoders.py | 9 +- .../mlagents/trainers/trainer/rl_trainer.py | 3 +- 11 files changed, 182 insertions(+), 79 deletions(-) create mode 100644 ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 17d58fdb71..f8aa8d9c7a 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -150,6 +150,6 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: } for reward_provider in self.reward_signals.values(): - reward_provider.update(batch) + update_stats.update(reward_provider.update(batch)) return update_stats diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index bb79e82477..a3142b14f8 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -77,6 +77,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: if self.is_training: self.policy.update_normalization(agent_buffer_trajectory["vector_obs"]) + # Get all value estimates value_estimates, value_next = self.optimizer.get_trajectory_value_estimates( agent_buffer_trajectory, @@ -92,7 +93,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: ) else: self._stats_reporter.add_stat( - self.optimizer.reward_signals[name].name + "Value Estimate", + "Policy/"+self.optimizer.reward_signals[name].name + " Value Estimate", np.mean(v), ) diff --git a/ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py new file mode 100644 index 0000000000..77b7fa7a30 --- /dev/null +++ b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py @@ -0,0 +1,72 @@ +import numpy as np +from abc import ABC, abstractmethod +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import RewardSignalSettings +from mlagents_envs.base_env import BehaviorSpec + + +class BaseRewardProvider(ABC): + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + self._policy_specs = specs + self._gamma = settings.gamma + self._strength = settings.strength + self._ignore_done = False + + @property + def gamma(self) -> float: + """ + The discount factor for the reward signal + """ + return self._gamma + + @property + def strength(self) -> float: + """ + The strength multiplier of the reward provider + """ + return self._strength + + @property + def name(self) -> str: + """ + The name of the reward provider. Is used for reporting and identification + """ + class_name = self.__class__.__name__ + return class_name.replace("RewardProvider", "") + + @property + def ignore_done(self) -> bool: + """ + If true, when the agent is done, the rewards of the next episode must be + used to calculate the return of the current episode. + Is used to mitigate the positive bias in rewards with no natural end. + """ + return self._ignore_done + + @abstractmethod + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + """ + Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. 
+ :return: a np.ndarray of rewards generated by the reward provider + """ + raise NotImplementedError( + "The reward provider's evaluate method has not been implemented " + ) + + @abstractmethod + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + """ + Update the reward for the data present in the Dict mini_batch. Use this when updating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: A dictionary from string to stats values + """ + raise NotImplementedError( + "The reward provider's update method has not been implemented " + ) diff --git a/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py index 83e3afb0c1..77b7fa7a30 100644 --- a/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider.py @@ -1,5 +1,6 @@ import numpy as np from abc import ABC, abstractmethod +from typing import Dict from mlagents.trainers.buffer import AgentBuffer from mlagents.trainers.settings import RewardSignalSettings @@ -58,12 +59,13 @@ def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: ) @abstractmethod - def update(self, mini_batch: AgentBuffer) -> None: + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: """ Update the reward for the data present in the Dict mini_batch. Use this when updating a reward function drawn straight from a Buffer. :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) when drawing from the update buffer. + :return: A dictionary from string to stats values """ raise NotImplementedError( "The reward provider's update method has not been implemented " diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py index ca9cad6070..a68ea3ff2b 100644 --- a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -1,4 +1,5 @@ import numpy as np +from typing import Dict import torch from mlagents.trainers.buffer import AgentBuffer @@ -12,6 +13,9 @@ class CuriosityRewardProvider(BaseRewardProvider): + beta = 0.2 # Forward loss weight + loss_multiplier = 10.0 # Loss multiplier + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: super().__init__(specs, settings) self._ignore_done = True @@ -19,82 +23,111 @@ def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: self.optimizer = torch.optim.Adam( self._network.parameters(), lr=settings.learning_rate ) + self._has_updated_once = False def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: with torch.no_grad(): - rewards = self._network.compute_reward(mini_batch) - return rewards.detach().cpu().numpy() + rewards = self._network.compute_reward(mini_batch).detach().cpu().numpy() + rewards = np.minimum(rewards, 1.0 / self.strength) + return rewards * self._has_updated_once + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + self._has_updated_once = True + forward_loss = self._network.compute_forward_loss(mini_batch) + inverse_loss = self._network.compute_inverse_loss(mini_batch) - def update(self, mini_batch: AgentBuffer) -> None: - loss = self._network.compute_losses(mini_batch) + loss = self.loss_multiplier * (self.beta 
* forward_loss + (1.0 - self.beta) * inverse_loss) self.optimizer.zero_grad() - loss.backward() + loss.backward() #retain_graph=True) self.optimizer.step() + return {"Losses/Curiosity Forward Loss": forward_loss.detach().cpu().numpy(), + "Losses/Curiosity Inverse Loss": inverse_loss.detach().cpu().numpy()} class CuriosityNetwork(torch.nn.Module): EPSILON = 1e-10 - forward_loss_weight = 2.0 - inverse_loss_weight = 8.0 def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: super().__init__() self._policy_specs = specs - state_encoder_settings = NetworkSettings( - normalize=False, - hidden_units=settings.encoding_size, - num_layers=2, - vis_encode_type=EncoderType.SIMPLE, - memory=None, - ) - self._state_encoder = NetworkBody( - specs.observation_shapes, state_encoder_settings + # state_encoder_settings = NetworkSettings( + # normalize=False, + # hidden_units=settings.encoding_size, + # num_layers=2, + # vis_encode_type=EncoderType.SIMPLE, + # memory=None, + # ) + # self._state_encoder = NetworkBody( + # specs.observation_shapes, state_encoder_settings + # ) + self._state_encoder = torch.nn.Sequential( + torch.nn.Linear(172, settings.encoding_size), + ModelUtils.SwishLayer(), + torch.nn.Linear(settings.encoding_size, settings.encoding_size), + ModelUtils.SwishLayer(), + # torch.nn.Linear(256, self._action_flattener.flattened_size) ) + torch.nn.init.xavier_uniform_(self._state_encoder[0].weight.data) + torch.nn.init.xavier_uniform_(self._state_encoder[2].weight.data) + self._state_encoder[0].bias.data.zero_() + self._state_encoder[2].bias.data.zero_() self._action_flattener = ModelUtils.ActionFlattener(specs) self.inverse_model_action_predition = torch.nn.Sequential( torch.nn.Linear(2 * settings.encoding_size, 256), ModelUtils.SwishLayer(), - torch.nn.Linear(256, self._action_flattener.flattened_size), + # torch.nn.Linear(256, 256), + # ModelUtils.SwishLayer(), + torch.nn.Linear(256, self._action_flattener.flattened_size) ) + torch.nn.init.xavier_normal_(self.inverse_model_action_predition[0].weight.data) + torch.nn.init.xavier_normal_(self.inverse_model_action_predition[2].weight.data) + self.inverse_model_action_predition[0].bias.data.zero_() + self.inverse_model_action_predition[2].bias.data.zero_() self.forward_model_next_state_prediction = torch.nn.Sequential( - torch.nn.Linear( - settings.encoding_size + self._action_flattener.flattened_size, 256 - ), + torch.nn.Linear(settings.encoding_size + self._action_flattener.flattened_size, 256), ModelUtils.SwishLayer(), - torch.nn.Linear(256, settings.encoding_size), + # torch.nn.Linear(256, 256), + # ModelUtils.SwishLayer(), + torch.nn.Linear(256, settings.encoding_size) ) + torch.nn.init.xavier_normal_(self.forward_model_next_state_prediction[0].weight.data) + torch.nn.init.xavier_normal_(self.forward_model_next_state_prediction[2].weight.data) + self.forward_model_next_state_prediction[0].bias.data.zero_() + self.forward_model_next_state_prediction[2].bias.data.zero_() def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Extracts the current state embedding from a mini_batch. 
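The inverse and forward model heads above follow the same pattern: a two-layer Sequential with a Swish nonlinearity, whose Linear layers (indices 0 and 2) get Xavier-initialized weights and zeroed biases. A standalone sketch of that construction (layer sizes are made up):

    import torch

    class Swish(torch.nn.Module):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x * torch.sigmoid(x)

    head = torch.nn.Sequential(
        torch.nn.Linear(64, 256),
        Swish(),
        torch.nn.Linear(256, 32),
    )
    for layer in (head[0], head[2]):     # only the Linear layers carry weights
        torch.nn.init.xavier_normal_(layer.weight.data)
        layer.bias.data.zero_()
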
""" - n_vis = len(self._state_encoder.visual_encoders) - hidden, _ = self._state_encoder.forward( - vec_inputs=[torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float)], - vis_inputs=[ - torch.as_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float) - for i in range(n_vis) - ], - ) + n_vis = 0#len(self._state_encoder.visual_encoders) + # hidden, _ = self._state_encoder.forward( + # vec_inputs=[ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)], + # vis_inputs=[ + # ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float) + # for i in range(n_vis) + # ], + # ) + hidden = self._state_encoder.forward(ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)) return hidden def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Extracts the next state embedding from a mini_batch. """ - n_vis = len(self._state_encoder.visual_encoders) - hidden, _ = self._state_encoder.forward( - vec_inputs=[ - torch.as_tensor(mini_batch["next_vector_in"], dtype=torch.float) - ], - vis_inputs=[ - torch.as_tensor(mini_batch["next_visual_obs%d" % i], dtype=torch.float) - for i in range(n_vis) - ], - ) + # n_vis = 0#len(self._state_encoder.visual_encoders) + # hidden, _ = self._state_encoder.forward( + # vec_inputs=[ + # ModelUtils.list_to_tensor(mini_batch["next_vector_in"], dtype=torch.float) + # ], + # vis_inputs=[ + # ModelUtils.list_to_tensor(mini_batch["next_visual_obs%d" % i], dtype=torch.float) + # for i in range(n_vis) + # ], + # ) + hidden = self._state_encoder.forward(ModelUtils.list_to_tensor(mini_batch["next_vector_in"], dtype=torch.float)) return hidden def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -121,18 +154,18 @@ def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: the next state embedding. 
""" if self._policy_specs.is_action_continuous(): - action = torch.as_tensor(mini_batch["actions"], dtype=torch.float) + action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float) else: action = torch.cat( ModelUtils.actions_to_onehot( - torch.as_tensor(mini_batch["actions"], dtype=torch.long), + ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long), self._policy_specs.discrete_action_branches, ), dim=1, ) forward_model_input = torch.cat( - (self.get_current_state(mini_batch), action), dim=1 - ) + (self.get_current_state(mini_batch), action), dim=1 + ) return self.forward_model_next_state_prediction(forward_model_input) @@ -144,21 +177,20 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: predicted_action = self.predict_action(mini_batch) if self._policy_specs.is_action_continuous(): sq_difference = ( - torch.as_tensor(mini_batch["actions"], dtype=torch.float) + ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float) - predicted_action ) ** 2 sq_difference = torch.sum(sq_difference, dim=1) return torch.mean( ModelUtils.dynamic_partition( sq_difference, - torch.as_tensor(mini_batch["masks"], dtype=torch.float), + ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), 2, - )[1] - ) + )[1]) else: true_action = torch.cat( ModelUtils.actions_to_onehot( - torch.as_tensor(mini_batch["actions"], dtype=torch.long), + ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long), self._policy_specs.discrete_action_branches, ), dim=1, @@ -169,9 +201,7 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: return torch.mean( ModelUtils.dynamic_partition( cross_entropy, - torch.as_tensor( - mini_batch["masks"], dtype=torch.float - ), # use masks not action_masks + ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), # use masks not action_masks 2, )[1] ) @@ -183,9 +213,9 @@ def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: between the predicted and actual next state. """ predicted_next_state = self.predict_next_state(mini_batch) - sq_difference = ( - 0.5 * (self.get_next_state(mini_batch) - predicted_next_state) ** 2 - ) + # with torch.no_grad(): + target = self.get_next_state(mini_batch) + sq_difference = 0.5 * (target - predicted_next_state) ** 2 sq_difference = torch.sum(sq_difference, dim=1) return sq_difference @@ -196,18 +226,5 @@ def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: return torch.mean( ModelUtils.dynamic_partition( self.compute_reward(mini_batch), - torch.as_tensor(mini_batch["masks"], dtype=torch.float), - 2, - )[1] - ) - - def compute_losses(self, mini_batch: AgentBuffer) -> torch.Tensor: - """ - Computes the weighted sum of inverse and forward loss. 
- """ - print( - self.compute_forward_loss(mini_batch), self.compute_inverse_loss(mini_batch) - ) - return self.forward_loss_weight * self.compute_forward_loss( - mini_batch - ) + self.inverse_loss_weight * self.compute_inverse_loss(mini_batch) + ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), + 2,)[1]) diff --git a/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py index d57cd2f893..dff522cb0d 100644 --- a/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/extrinsic_reward_provider.py @@ -1,4 +1,5 @@ import numpy as np +from typing import Dict from mlagents.trainers.buffer import AgentBuffer from mlagents.trainers.reward_providers.base_reward_provider import BaseRewardProvider @@ -8,5 +9,5 @@ class ExtrinsicRewardProvider(BaseRewardProvider): def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: return np.array(mini_batch["environment_rewards"], dtype=np.float32) - def update(self, mini_batch: AgentBuffer) -> None: - pass + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + return {} diff --git a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py index 6dba81bdd4..6924590948 100644 --- a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict import numpy as np import torch @@ -37,7 +37,7 @@ def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: .numpy() ) - def update(self, mini_batch: AgentBuffer) -> None: + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: expert_batch = self._demo_buffer.sample_mini_batch( mini_batch.num_experiences, 1 ) @@ -45,6 +45,7 @@ def update(self, mini_batch: AgentBuffer) -> None: self.optimizer.zero_grad() loss.backward() self.optimizer.step() + return {} class DiscriminatorNetwork(torch.nn.Module): diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index d02ebddd13..aa789135e7 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -162,7 +162,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: ) else: self._stats_reporter.add_stat( - self.optimizer.reward_signals[name].name + "Value", np.mean(v) + "Policy/" + self.optimizer.reward_signals[name].name + " Value", np.mean(v) ) # Bootstrap using the last step rather than the bootstrap step if max step is reached. 
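With this change update() returns a dictionary of stats instead of None; providers with nothing to report (extrinsic, and GAIL at this point) return an empty dict, and the PPO optimizer merges whatever comes back into its own update_stats. A small sketch of that flow with hypothetical providers:

    import numpy as np

    class QuietProvider:                 # nothing to report, e.g. extrinsic
        def update(self, mini_batch):
            return {}

    class ChattyProvider:                # reports its own losses, e.g. curiosity
        def update(self, mini_batch):
            return {"Losses/Curiosity Forward Loss": np.float32(0.12)}

    update_stats = {"Losses/Value Loss": np.float32(0.5)}
    for provider in (QuietProvider(), ChattyProvider()):
        update_stats.update(provider.update(mini_batch=None))
    # update_stats now carries provider stats alongside the policy losses
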
diff --git a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py index 45ac0e3d9e..df45846f48 100644 --- a/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py +++ b/ml-agents/mlagents/trainers/tests/test_reward_providers/test_curiosity.py @@ -59,6 +59,7 @@ def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None: curiosity_settings = CuriositySettings(32, 0.01) curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings) buffer = create_agent_buffer(behavior_spec, 5) + curiosity_rp.update(buffer) reward_old = curiosity_rp.evaluate(buffer)[0] for _ in range(10): curiosity_rp.update(buffer) diff --git a/ml-agents/mlagents/trainers/torch/encoders.py b/ml-agents/mlagents/trainers/torch/encoders.py index 0607fbcca5..5be263fbfe 100644 --- a/ml-agents/mlagents/trainers/torch/encoders.py +++ b/ml-agents/mlagents/trainers/torch/encoders.py @@ -63,6 +63,11 @@ def pool_out_shape(h_w: Tuple[int, int], kernel_size: int) -> Tuple[int, int]: return height, width +class SwishLayer(torch.nn.Module): + def forward(self, data: torch.Tensor) -> torch.Tensor: + return torch.mul(data, torch.sigmoid(data)) + + class VectorEncoder(nn.Module): def __init__( self, @@ -74,12 +79,14 @@ def __init__( self.normalizer: Optional[Normalizer] = None super().__init__() self.layers = [nn.Linear(input_size, hidden_size)] + self.layers.append(SwishLayer()) if normalize: self.normalizer = Normalizer(input_size) for _ in range(num_layers - 1): self.layers.append(nn.Linear(hidden_size, hidden_size)) - self.layers.append(nn.ReLU()) + self.layers.append(SwishLayer()) + # self.layers.append(nn.ReLU()) self.seq_layers = nn.Sequential(*self.layers) def forward(self, inputs: torch.Tensor) -> None: diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py index 03da6bb964..b9eddaa55b 100644 --- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py +++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py @@ -51,6 +51,7 @@ def __init__(self, *args, **kwargs): StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() ) self.framework = "torch" if TestingConfiguration.use_torch else "tf" + # self.framework = "tf" if TestingConfiguration.max_steps > 0: self.trainer_settings.max_steps = TestingConfiguration.max_steps self._next_save_step = 0 @@ -84,7 +85,7 @@ def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None ) else: self.stats_reporter.add_stat( - optimizer.reward_signals[name].name + "Reward", + "Policy/"+optimizer.reward_signals[name].name + " Reward", rewards.get(agent_id, 0), ) rewards[agent_id] = 0 From a011362a139a09268ba2d1da68b4d2d112498028 Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Mon, 3 Aug 2020 10:46:52 -0700 Subject: [PATCH 06/10] Removing duplicate file --- .../base_reward_provider copy.py | 72 ------------------- 1 file changed, 72 deletions(-) delete mode 100644 ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py diff --git a/ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py b/ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py deleted file mode 100644 index 77b7fa7a30..0000000000 --- a/ml-agents/mlagents/trainers/reward_providers/base_reward_provider copy.py +++ /dev/null @@ -1,72 +0,0 @@ -import numpy as np -from abc import ABC, abstractmethod -from typing import Dict - -from mlagents.trainers.buffer import 
AgentBuffer -from mlagents.trainers.settings import RewardSignalSettings -from mlagents_envs.base_env import BehaviorSpec - - -class BaseRewardProvider(ABC): - def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: - self._policy_specs = specs - self._gamma = settings.gamma - self._strength = settings.strength - self._ignore_done = False - - @property - def gamma(self) -> float: - """ - The discount factor for the reward signal - """ - return self._gamma - - @property - def strength(self) -> float: - """ - The strength multiplier of the reward provider - """ - return self._strength - - @property - def name(self) -> str: - """ - The name of the reward provider. Is used for reporting and identification - """ - class_name = self.__class__.__name__ - return class_name.replace("RewardProvider", "") - - @property - def ignore_done(self) -> bool: - """ - If true, when the agent is done, the rewards of the next episode must be - used to calculate the return of the current episode. - Is used to mitigate the positive bias in rewards with no natural end. - """ - return self._ignore_done - - @abstractmethod - def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: - """ - Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward - function drawn straight from a Buffer. - :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) - when drawing from the update buffer. - :return: a np.ndarray of rewards generated by the reward provider - """ - raise NotImplementedError( - "The reward provider's evaluate method has not been implemented " - ) - - @abstractmethod - def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: - """ - Update the reward for the data present in the Dict mini_batch. Use this when updating a reward - function drawn straight from a Buffer. - :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) - when drawing from the update buffer. 
- :return: A dictionary from string to stats values - """ - raise NotImplementedError( - "The reward provider's update method has not been implemented " - ) From 8f9f3488e88f6041aff4aa13f187059e51dd9323 Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Tue, 4 Aug 2020 10:17:37 -0700 Subject: [PATCH 07/10] Gail and Curiosity working --- .../curiosity_reward_provider.py | 126 +++++++++--------- .../reward_providers/gail_reward_provider.py | 49 +++++-- .../mlagents/trainers/torch/distributions.py | 2 +- 3 files changed, 105 insertions(+), 72 deletions(-) diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py index a68ea3ff2b..754b0fe1ba 100644 --- a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -13,7 +13,7 @@ class CuriosityRewardProvider(BaseRewardProvider): - beta = 0.2 # Forward loss weight + beta = 0.2 # Forward vs Inverse loss weight loss_multiplier = 10.0 # Loss multiplier def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: @@ -36,12 +36,16 @@ def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: forward_loss = self._network.compute_forward_loss(mini_batch) inverse_loss = self._network.compute_inverse_loss(mini_batch) - loss = self.loss_multiplier * (self.beta * forward_loss + (1.0 - self.beta) * inverse_loss) + loss = self.loss_multiplier * ( + self.beta * forward_loss + (1.0 - self.beta) * inverse_loss + ) self.optimizer.zero_grad() - loss.backward() #retain_graph=True) + loss.backward() self.optimizer.step() - return {"Losses/Curiosity Forward Loss": forward_loss.detach().cpu().numpy(), - "Losses/Curiosity Inverse Loss": inverse_loss.detach().cpu().numpy()} + return { + "Losses/Curiosity Forward Loss": forward_loss.detach().cpu().numpy(), + "Losses/Curiosity Inverse Loss": inverse_loss.detach().cpu().numpy(), + } class CuriosityNetwork(torch.nn.Module): @@ -50,36 +54,23 @@ class CuriosityNetwork(torch.nn.Module): def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: super().__init__() self._policy_specs = specs - # state_encoder_settings = NetworkSettings( - # normalize=False, - # hidden_units=settings.encoding_size, - # num_layers=2, - # vis_encode_type=EncoderType.SIMPLE, - # memory=None, - # ) - # self._state_encoder = NetworkBody( - # specs.observation_shapes, state_encoder_settings - # ) - self._state_encoder = torch.nn.Sequential( - torch.nn.Linear(172, settings.encoding_size), - ModelUtils.SwishLayer(), - torch.nn.Linear(settings.encoding_size, settings.encoding_size), - ModelUtils.SwishLayer(), - # torch.nn.Linear(256, self._action_flattener.flattened_size) + state_encoder_settings = NetworkSettings( + normalize=False, + hidden_units=settings.encoding_size, + num_layers=2, + vis_encode_type=EncoderType.SIMPLE, + memory=None, + ) + self._state_encoder = NetworkBody( + specs.observation_shapes, state_encoder_settings ) - torch.nn.init.xavier_uniform_(self._state_encoder[0].weight.data) - torch.nn.init.xavier_uniform_(self._state_encoder[2].weight.data) - self._state_encoder[0].bias.data.zero_() - self._state_encoder[2].bias.data.zero_() self._action_flattener = ModelUtils.ActionFlattener(specs) self.inverse_model_action_predition = torch.nn.Sequential( torch.nn.Linear(2 * settings.encoding_size, 256), ModelUtils.SwishLayer(), - # torch.nn.Linear(256, 256), - # ModelUtils.SwishLayer(), - 
torch.nn.Linear(256, self._action_flattener.flattened_size) + torch.nn.Linear(256, self._action_flattener.flattened_size), ) torch.nn.init.xavier_normal_(self.inverse_model_action_predition[0].weight.data) torch.nn.init.xavier_normal_(self.inverse_model_action_predition[2].weight.data) @@ -87,14 +78,18 @@ def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: self.inverse_model_action_predition[2].bias.data.zero_() self.forward_model_next_state_prediction = torch.nn.Sequential( - torch.nn.Linear(settings.encoding_size + self._action_flattener.flattened_size, 256), + torch.nn.Linear( + settings.encoding_size + self._action_flattener.flattened_size, 256 + ), ModelUtils.SwishLayer(), - # torch.nn.Linear(256, 256), - # ModelUtils.SwishLayer(), - torch.nn.Linear(256, settings.encoding_size) + torch.nn.Linear(256, settings.encoding_size), + ) + torch.nn.init.xavier_normal_( + self.forward_model_next_state_prediction[0].weight.data + ) + torch.nn.init.xavier_normal_( + self.forward_model_next_state_prediction[2].weight.data ) - torch.nn.init.xavier_normal_(self.forward_model_next_state_prediction[0].weight.data) - torch.nn.init.xavier_normal_(self.forward_model_next_state_prediction[2].weight.data) self.forward_model_next_state_prediction[0].bias.data.zero_() self.forward_model_next_state_prediction[2].bias.data.zero_() @@ -102,32 +97,38 @@ def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Extracts the current state embedding from a mini_batch. """ - n_vis = 0#len(self._state_encoder.visual_encoders) - # hidden, _ = self._state_encoder.forward( - # vec_inputs=[ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)], - # vis_inputs=[ - # ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float) - # for i in range(n_vis) - # ], - # ) - hidden = self._state_encoder.forward(ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)) + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[ + ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float) + ], + vis_inputs=[ + ModelUtils.list_to_tensor( + mini_batch["visual_obs%d" % i], dtype=torch.float + ) + for i in range(n_vis) + ], + ) return hidden def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: """ Extracts the next state embedding from a mini_batch. 
""" - # n_vis = 0#len(self._state_encoder.visual_encoders) - # hidden, _ = self._state_encoder.forward( - # vec_inputs=[ - # ModelUtils.list_to_tensor(mini_batch["next_vector_in"], dtype=torch.float) - # ], - # vis_inputs=[ - # ModelUtils.list_to_tensor(mini_batch["next_visual_obs%d" % i], dtype=torch.float) - # for i in range(n_vis) - # ], - # ) - hidden = self._state_encoder.forward(ModelUtils.list_to_tensor(mini_batch["next_vector_in"], dtype=torch.float)) + n_vis = len(self._state_encoder.visual_encoders) + hidden, _ = self._state_encoder.forward( + vec_inputs=[ + ModelUtils.list_to_tensor( + mini_batch["next_vector_in"], dtype=torch.float + ) + ], + vis_inputs=[ + ModelUtils.list_to_tensor( + mini_batch["next_visual_obs%d" % i], dtype=torch.float + ) + for i in range(n_vis) + ], + ) return hidden def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor: @@ -164,8 +165,8 @@ def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: dim=1, ) forward_model_input = torch.cat( - (self.get_current_state(mini_batch), action), dim=1 - ) + (self.get_current_state(mini_batch), action), dim=1 + ) return self.forward_model_next_state_prediction(forward_model_input) @@ -186,7 +187,8 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: sq_difference, ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), 2, - )[1]) + )[1] + ) else: true_action = torch.cat( ModelUtils.actions_to_onehot( @@ -201,11 +203,12 @@ def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: return torch.mean( ModelUtils.dynamic_partition( cross_entropy, - ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), # use masks not action_masks + ModelUtils.list_to_tensor( + mini_batch["masks"], dtype=torch.float + ), # use masks not action_masks 2, )[1] ) - # return torch.mean(cross_entropy) def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: """ @@ -213,7 +216,6 @@ def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: between the predicted and actual next state. 
""" predicted_next_state = self.predict_next_state(mini_batch) - # with torch.no_grad(): target = self.get_next_state(mini_batch) sq_difference = 0.5 * (target - predicted_next_state) ** 2 sq_difference = torch.sum(sq_difference, dim=1) @@ -227,4 +229,6 @@ def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: ModelUtils.dynamic_partition( self.compute_reward(mini_batch), ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float), - 2,)[1]) + 2, + )[1] + ) diff --git a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py index 6924590948..750965de88 100644 --- a/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/gail_reward_provider.py @@ -30,7 +30,9 @@ def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: ) return ( -torch.log( - 1.0 - estimates * (1.0 - self._discriminator_network.EPSILON) + 1.0 + - estimates.squeeze(dim=1) + * (1.0 - self._discriminator_network.EPSILON) ) .detach() .cpu() @@ -41,11 +43,23 @@ def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: expert_batch = self._demo_buffer.sample_mini_batch( mini_batch.num_experiences, 1 ) - loss = self._discriminator_network.compute_loss(mini_batch, expert_batch) + loss, policy_mean_estimate, expert_mean_estimate, kl_loss = self._discriminator_network.compute_loss( + mini_batch, expert_batch + ) self.optimizer.zero_grad() loss.backward() self.optimizer.step() - return {} + stats_dict = { + "Losses/GAIL Discriminator Loss": loss.detach().cpu().numpy(), + "Policy/GAIL Policy Estimate": policy_mean_estimate.detach().cpu().numpy(), + "Policy/GAIL Expert Estimate": expert_mean_estimate.detach().cpu().numpy(), + } + if self._discriminator_network.use_vail: + stats_dict["Policy/GAIL Beta"] = ( + self._discriminator_network.beta.detach().cpu().numpy() + ) + stats_dict["Losses/GAIL KL Loss"] = kl_loss.detach().cpu().numpy() + return stats_dict class DiscriminatorNetwork(torch.nn.Module): @@ -54,11 +68,12 @@ class DiscriminatorNetwork(torch.nn.Module): alpha = 0.0005 mutual_information = 0.5 EPSILON = 1e-7 - initial_beta = 1.0 + initial_beta = 0.0 def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: super().__init__() self._policy_specs = specs + self.use_vail = settings.use_vail self._settings = settings state_encoder_settings = NetworkSettings( @@ -84,7 +99,13 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: torch.nn.Linear(encoder_input_size, settings.encoding_size), ModelUtils.SwishLayer(), torch.nn.Linear(settings.encoding_size, settings.encoding_size), + ModelUtils.SwishLayer(), ) + torch.nn.init.xavier_normal_(self.encoder[0].weight.data) + torch.nn.init.xavier_normal_(self.encoder[2].weight.data) + self.encoder[0].bias.data.zero_() + self.encoder[2].bias.data.zero_() + estimator_input_size = settings.encoding_size if settings.use_vail: estimator_input_size = self.z_size @@ -92,11 +113,18 @@ def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: torch.ones((self.z_size), dtype=torch.float), requires_grad=True ) self.z_mu_layer = torch.nn.Linear(settings.encoding_size, self.z_size) - # self.mu_layer.weight.data Needs a variance scale initializer - self.beta = torch.tensor(self.initial_beta) + # self.z_mu_layer.weight.data Needs a variance scale initializer + torch.nn.init.xavier_normal_(self.z_mu_layer.weight.data) + self.z_mu_layer.bias.data.zero_() + self.beta = 
torch.nn.Parameter( + torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False + ) + self.estimator = torch.nn.Sequential( torch.nn.Linear(estimator_input_size, 1), torch.nn.Sigmoid() ) + torch.nn.init.xavier_normal_(self.estimator[0].weight.data) + self.estimator[0].bias.data.zero_() def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: """ @@ -157,9 +185,10 @@ def compute_loss( expert_batch, use_vail_noise=True ) loss = -( - (expert_estimate * (1 - self.EPSILON)).log() - + (1.0 - policy_estimate * (1 - self.EPSILON)).log() + torch.log(expert_estimate * (1 - self.EPSILON)) + + torch.log(1.0 - policy_estimate * (1 - self.EPSILON)) ).mean() + kl_loss: Optional[torch.Tensor] = None if self._settings.use_vail: # KL divergence loss (encourage latent representation to be normal) kl_loss = torch.mean( @@ -174,7 +203,7 @@ def compute_loss( ) vail_loss = self.beta * (kl_loss - self.mutual_information) with torch.no_grad(): - self.beta = torch.max( + self.beta.data = torch.max( self.beta + self.alpha * (kl_loss - self.mutual_information), torch.tensor(0.0), ) @@ -183,7 +212,7 @@ def compute_loss( loss += self.gradient_penalty_weight * self.compute_gradient_magnitude( policy_batch, expert_batch ) - return loss + return loss, torch.mean(policy_estimate), torch.mean(expert_estimate), kl_loss def compute_gradient_magnitude( self, policy_batch: AgentBuffer, expert_batch: AgentBuffer diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index b7138a8547..174307936f 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -73,7 +73,7 @@ def all_log_prob(self): return torch.log(self.probs) def entropy(self): - return torch.sum(self.probs * torch.log(self.probs), dim=-1) + return -torch.sum(self.probs * torch.log(self.probs), dim=-1) class GaussianDistribution(nn.Module): From 5c73799ea3d835897e210f503b7d9422df3e1bd7 Mon Sep 17 00:00:00 2001 From: vincentpierre Date: Tue, 4 Aug 2020 10:34:08 -0700 Subject: [PATCH 08/10] addressing comments --- ml-agents/mlagents/trainers/ppo/trainer.py | 8 ++++---- .../reward_providers/curiosity_reward_provider.py | 8 -------- ml-agents/mlagents/trainers/sac/trainer.py | 13 +++++++------ ml-agents/mlagents/trainers/trainer/rl_trainer.py | 6 +++--- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py index a3142b14f8..40dba72d76 100644 --- a/ml-agents/mlagents/trainers/ppo/trainer.py +++ b/ml-agents/mlagents/trainers/ppo/trainer.py @@ -22,6 +22,7 @@ PPOSettings, TestingConfiguration, ) +from mlagents.trainers.components.reward_signals import RewardSignal logger = get_logger(__name__) @@ -77,7 +78,6 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: if self.is_training: self.policy.update_normalization(agent_buffer_trajectory["vector_obs"]) - # Get all value estimates value_estimates, value_next = self.optimizer.get_trajectory_value_estimates( agent_buffer_trajectory, @@ -87,13 +87,13 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: for name, v in value_estimates.items(): agent_buffer_trajectory[f"{name}_value_estimates"].extend(v) - if hasattr(self.optimizer.reward_signals[name], "value_name"): + if isinstance(self.optimizer.reward_signals[name], RewardSignal): self._stats_reporter.add_stat( self.optimizer.reward_signals[name].value_name, np.mean(v) ) else: self._stats_reporter.add_stat( - 
"Policy/"+self.optimizer.reward_signals[name].name + " Value Estimate", + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", np.mean(v), ) @@ -102,7 +102,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory["environment_rewards"] ) for name, reward_signal in self.optimizer.reward_signals.items(): - if hasattr(reward_signal, "evaluate_batch"): + if isinstance(reward_signal, RewardSignal): evaluate_result = reward_signal.evaluate_batch( agent_buffer_trajectory ).scaled_reward diff --git a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py index 754b0fe1ba..df630b5b9c 100644 --- a/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py +++ b/ml-agents/mlagents/trainers/reward_providers/curiosity_reward_provider.py @@ -72,8 +72,6 @@ def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: ModelUtils.SwishLayer(), torch.nn.Linear(256, self._action_flattener.flattened_size), ) - torch.nn.init.xavier_normal_(self.inverse_model_action_predition[0].weight.data) - torch.nn.init.xavier_normal_(self.inverse_model_action_predition[2].weight.data) self.inverse_model_action_predition[0].bias.data.zero_() self.inverse_model_action_predition[2].bias.data.zero_() @@ -84,12 +82,6 @@ def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: ModelUtils.SwishLayer(), torch.nn.Linear(256, settings.encoding_size), ) - torch.nn.init.xavier_normal_( - self.forward_model_next_state_prediction[0].weight.data - ) - torch.nn.init.xavier_normal_( - self.forward_model_next_state_prediction[2].weight.data - ) self.forward_model_next_state_prediction[0].bias.data.zero_() self.forward_model_next_state_prediction[2].bias.data.zero_() diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py index aa789135e7..552857423d 100644 --- a/ml-agents/mlagents/trainers/sac/trainer.py +++ b/ml-agents/mlagents/trainers/sac/trainer.py @@ -21,7 +21,7 @@ from mlagents.trainers.policy.torch_policy import TorchPolicy from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer from mlagents.trainers.settings import TrainerSettings, SACSettings - +from mlagents.trainers.components.reward_signals import RewardSignal logger = get_logger(__name__) @@ -139,7 +139,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory["environment_rewards"] ) for name, reward_signal in self.optimizer.reward_signals.items(): - if hasattr(reward_signal, "evaluate_batch"): + if isinstance(reward_signal, RewardSignal): evaluate_result = reward_signal.evaluate_batch( agent_buffer_trajectory ).scaled_reward @@ -156,13 +156,14 @@ def _process_trajectory(self, trajectory: Trajectory) -> None: agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached ) for name, v in value_estimates.items(): - if hasattr(self.optimizer.reward_signals[name], "value_name"): + if isinstance(self.optimizer.reward_signals[name], RewardSignal): self._stats_reporter.add_stat( self.optimizer.reward_signals[name].value_name, np.mean(v) ) else: self._stats_reporter.add_stat( - "Policy/" + self.optimizer.reward_signals[name].name + " Value", np.mean(v) + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value", + np.mean(v), ) # Bootstrap using the last step rather than the bootstrap step if max step is reached. 
@@ -284,7 +285,7 @@ def _update_sac_policy(self) -> bool:
         )
         # Get rewards for each reward
         for name, signal in self.optimizer.reward_signals.items():
-            if hasattr(signal, "evaluate_batch"):
+            if isinstance(signal, RewardSignal):
                 sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
                     sampled_minibatch
                 ).scaled_reward
@@ -338,7 +339,7 @@ def _update_reward_signals(self) -> None:
         reward_signal_minibatches = {}
         for name, signal in self.optimizer.reward_signals.items():
             logger.debug(f"Updating {name} at step {self.step}")
-            if hasattr(signal, "update_dict"):
+            if isinstance(signal, RewardSignal):
                 # Some signals don't need a minibatch to be sampled - so we don't!
                 if signal.update_dict:
                     reward_signal_minibatches[name] = buffer.sample_mini_batch(
diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
index b9eddaa55b..39d69dcef8 100644
--- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
@@ -15,7 +15,7 @@
 from mlagents.trainers.optimizer import Optimizer
 from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.trainer import Trainer
-from mlagents.trainers.components.reward_signals import RewardSignalResult
+from mlagents.trainers.components.reward_signals import RewardSignalResult, RewardSignal
 from mlagents_envs.timers import hierarchical_timer
 from mlagents_envs.base_env import BehaviorSpec
 from mlagents.trainers.policy.policy import Policy
@@ -78,14 +78,14 @@ def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None
                 self.reward_buffer.appendleft(rewards.get(agent_id, 0))
                 rewards[agent_id] = 0
             else:
-                if hasattr(optimizer.reward_signals[name], "stat_name"):
+                if isinstance(optimizer.reward_signals[name], RewardSignal):
                     self.stats_reporter.add_stat(
                         optimizer.reward_signals[name].stat_name,
                         rewards.get(agent_id, 0),
                     )
                 else:
                     self.stats_reporter.add_stat(
-                        "Policy/"+optimizer.reward_signals[name].name + " Reward",
+                        f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward",
                         rewards.get(agent_id, 0),
                     )
                     rewards[agent_id] = 0

From 5d0bb3272666c9bf13c8fce838457edc11fc20e5 Mon Sep 17 00:00:00 2001
From: vincentpierre
Date: Fri, 7 Aug 2020 14:02:17 -0700
Subject: [PATCH 09/10] Enforce float32 for tests

---
 .../trainers/tests/torch/test_reward_providers/utils.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
index c50ab68a6a..1f50f06a11 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
@@ -26,7 +26,7 @@ def create_agent_buffer(
     buffer["vector_obs"].append(curr_split_obs.vector_observations)
     buffer["next_vector_in"].append(next_split_obs.vector_observations)
     buffer["actions"].append(action)
-    buffer["done"].append(np.zeros(1))
-    buffer["reward"].append(np.ones(1) * reward)
-    buffer["masks"].append(np.ones(1))
+    buffer["done"].append(np.zeros(1, dtype=np.float32))
+    buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
+    buffer["masks"].append(np.ones(1, dtype=np.float32))
     return buffer

From fff745c3059e9c3ffe3ba4d0d3025a3164ec59a3 Mon Sep 17 00:00:00 2001
From: vincentpierre
Date: Fri, 7 Aug 2020 14:29:14 -0700
Subject: [PATCH 10/10] enforce np.float32 in buffer

---
 ml-agents/mlagents/trainers/buffer.py | 2 +-
 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/ml-agents/mlagents/trainers/buffer.py b/ml-agents/mlagents/trainers/buffer.py index 87fd160d8f..9b0cf48aaa 100644 --- a/ml-agents/mlagents/trainers/buffer.py +++ b/ml-agents/mlagents/trainers/buffer.py @@ -48,7 +48,7 @@ def extend(self, data: np.ndarray) -> None: Adds a list of np.arrays to the end of the list of np.arrays. :param data: The np.array list to append. """ - self += list(np.array(data)) + self += list(np.array(data, dtype=np.float32)) def set(self, data): """