diff --git a/ml-agents/mlagents/trainers/models_torch.py b/ml-agents/mlagents/trainers/models_torch.py
index 2832da06a9..434634fbce 100644
--- a/ml-agents/mlagents/trainers/models_torch.py
+++ b/ml-agents/mlagents/trainers/models_torch.py
@@ -1,8 +1,9 @@
 from enum import Enum
-from typing import Callable, NamedTuple
+from typing import Callable, NamedTuple, List, Optional
 
 import torch
 from torch import nn
+import numpy as np
 
 from mlagents.trainers.distributions_torch import (
     GaussianDistribution,
@@ -19,6 +20,16 @@
 EPSILON = 1e-7
 
 
+def list_to_tensor(
+    ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = None
+) -> torch.Tensor:
+    """
+    Converts a list of numpy arrays into a tensor. MUCH faster than
+    calling as_tensor on the list directly.
+    """
+    return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)
+
+
 class ActionType(Enum):
     DISCRETE = "discrete"
     CONTINUOUS = "continuous"
diff --git a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
index a5aad77c94..762ad00357 100644
--- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
+++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
@@ -12,6 +12,7 @@
 from mlagents.trainers.optimizer import Optimizer
 from mlagents.trainers.settings import TrainerSettings, RewardSignalType
 from mlagents.trainers.trajectory import SplitObservations
+from mlagents.trainers.models_torch import list_to_tensor
 
 
 class TorchOptimizer(Optimizer):  # pylint: disable=W0223
@@ -79,13 +80,13 @@ def get_value_estimates(
     def get_trajectory_value_estimates(
         self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
     ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
-        vector_obs = [torch.as_tensor(batch["vector_obs"])]
+        vector_obs = [list_to_tensor(batch["vector_obs"])]
         if self.policy.use_vis_obs:
             visual_obs = []
             for idx, _ in enumerate(
                 self.policy.actor_critic.network_body.visual_encoders
             ):
-                visual_ob = torch.as_tensor(batch["visual_obs%d" % idx])
+                visual_ob = list_to_tensor(batch["visual_obs%d" % idx])
                 visual_obs.append(visual_ob)
         else:
             visual_obs = []
@@ -93,7 +94,7 @@ def get_trajectory_value_estimates(
             memory = torch.zeros([1, len(vector_obs[0]), self.policy.m_size])
 
         next_obs = np.concatenate(next_obs, axis=-1)
-        next_obs = [torch.as_tensor(next_obs).unsqueeze(0)]
+        next_obs = [list_to_tensor(next_obs).unsqueeze(0)]
         next_memory = torch.zeros([1, 1, self.policy.m_size])
 
         value_estimates, mean_value = self.policy.actor_critic.critic_pass(
diff --git a/ml-agents/mlagents/trainers/policy/nn_policy.py b/ml-agents/mlagents/trainers/policy/nn_policy.py
index ce08f06f0a..ea550cb652 100644
--- a/ml-agents/mlagents/trainers/policy/nn_policy.py
+++ b/ml-agents/mlagents/trainers/policy/nn_policy.py
@@ -21,7 +21,6 @@ def __init__(
         seed: int,
         brain: BrainParameters,
         trainer_settings: TrainerSettings,
-        is_training: bool,
         model_path: str,
         load: bool,
         tanh_squash: bool = False,
diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
index 28fc2e8598..2e8ece92dc 100644
--- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
@@ -7,6 +7,7 @@
 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
 from mlagents.trainers.settings import TrainerSettings, PPOSettings
+from mlagents.trainers.models_torch import list_to_tensor
 
 
 class TorchPPOOptimizer(TorchOptimizer):
@@ -91,18 +92,18 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         returns = {}
         old_values = {}
         for name in self.reward_signals:
-            old_values[name] = torch.as_tensor(batch["{}_value_estimates".format(name)])
-            returns[name] = torch.as_tensor(batch["{}_returns".format(name)])
+            old_values[name] = list_to_tensor(batch["{}_value_estimates".format(name)])
+            returns[name] = list_to_tensor(batch["{}_returns".format(name)])
 
-        vec_obs = [torch.as_tensor(batch["vector_obs"])]
-        act_masks = torch.as_tensor(batch["action_mask"])
+        vec_obs = [list_to_tensor(batch["vector_obs"])]
+        act_masks = list_to_tensor(batch["action_mask"])
         if self.policy.use_continuous_act:
-            actions = torch.as_tensor(batch["actions"]).unsqueeze(-1)
+            actions = list_to_tensor(batch["actions"]).unsqueeze(-1)
         else:
-            actions = torch.as_tensor(batch["actions"], dtype=torch.long)
+            actions = list_to_tensor(batch["actions"], dtype=torch.long)
 
         memories = [
-            torch.as_tensor(batch["memory"][i])
+            list_to_tensor(batch["memory"][i])
             for i in range(0, len(batch["memory"]), self.policy.sequence_length)
         ]
         if len(memories) > 0:
@@ -113,7 +114,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
             for idx, _ in enumerate(
                 self.policy.actor_critic.network_body.visual_encoders
             ):
-                vis_ob = torch.as_tensor(batch["visual_obs%d" % idx])
+                vis_ob = list_to_tensor(batch["visual_obs%d" % idx])
                 vis_obs.append(vis_ob)
         else:
             vis_obs = []
@@ -127,10 +128,10 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         )
         value_loss = self.ppo_value_loss(values, old_values, returns)
         policy_loss = self.ppo_policy_loss(
-            torch.as_tensor(batch["advantages"]),
+            list_to_tensor(batch["advantages"]),
             log_probs,
-            torch.as_tensor(batch["action_probs"]),
-            torch.as_tensor(batch["masks"], dtype=torch.int32),
+            list_to_tensor(batch["action_probs"]),
+            list_to_tensor(batch["masks"], dtype=torch.int32),
         )
         loss = (
            policy_loss
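
The docstring's performance claim (numpy conversion first is much faster than calling torch.as_tensor on a Python list of arrays) can be sanity-checked with a small standalone sketch like the one below. The observation shape and list length are made up for illustration; absolute timings will vary by machine and PyTorch version.

# Standalone sketch: compare torch.as_tensor on a Python list of ndarrays
# against converting through a single numpy array first, as list_to_tensor does.
# The list size and shape below are arbitrary, for illustration only.
import timeit

import numpy as np
import torch


def list_to_tensor(ndarray_list, dtype=None):
    # Mirrors the helper added in models_torch.py.
    return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)


obs = [np.random.rand(84).astype(np.float32) for _ in range(2048)]

slow = timeit.timeit(lambda: torch.as_tensor(obs), number=20)
fast = timeit.timeit(lambda: list_to_tensor(obs), number=20)
print(f"as_tensor(list): {slow:.4f}s   via numpy: {fast:.4f}s")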