diff --git a/ml-agents/mlagents/trainers/distributions_torch.py b/ml-agents/mlagents/trainers/distributions_torch.py index a8616f65d6..29c683df63 100644 --- a/ml-agents/mlagents/trainers/distributions_torch.py +++ b/ml-agents/mlagents/trainers/distributions_torch.py @@ -42,7 +42,7 @@ def sample(self): return torch.multinomial(self.probs, 1) def pdf(self, value): - return torch.diag(self.probs.T[value.flatten()]) + return torch.diag(self.probs.T[value.flatten().long()]) def log_prob(self, value): return torch.log(self.pdf(value)) diff --git a/ml-agents/mlagents/trainers/policy/policy.py b/ml-agents/mlagents/trainers/policy/policy.py index b1a9d460b2..e3830d5472 100644 --- a/ml-agents/mlagents/trainers/policy/policy.py +++ b/ml-agents/mlagents/trainers/policy/policy.py @@ -32,7 +32,7 @@ def __init__( self.num_branches = len(self.brain.vector_action_space_size) self.previous_action_dict: Dict[str, np.array] = {} self.memory_dict: Dict[str, np.ndarray] = {} - self.normalize = trainer_settings + self.normalize = trainer_settings.network_settings.normalize self.use_recurrent = trainer_settings.network_settings.memory is not None self.model_path = trainer_settings.init_path diff --git a/ml-agents/mlagents/trainers/policy/torch_policy.py b/ml-agents/mlagents/trainers/policy/torch_policy.py index 22c9c7cd86..f16ada01fa 100644 --- a/ml-agents/mlagents/trainers/policy/torch_policy.py +++ b/ml-agents/mlagents/trainers/policy/torch_policy.py @@ -150,8 +150,7 @@ def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1) actions = self.actor_critic.sample_action(dists) log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists) - if self.act_type == "continuous": - actions.squeeze_(-1) + actions = torch.squeeze(actions) return actions, log_probs, entropies, value_heads, memories @@ -250,7 +249,7 @@ def export_model(self, step=0): fake_vec_obs = [torch.zeros([1] + [self.brain.vector_observation_space_size])] fake_vis_obs = [torch.zeros([1] + [84, 84, 3])] fake_masks = torch.ones([1] + self.actor_critic.act_size) - fake_memories = torch.zeros([1] + [self.m_size]) + # fake_memories = torch.zeros([1] + [self.m_size]) export_path = "./model-" + str(step) + ".onnx" output_names = ["action", "action_probs"] input_names = ["vector_observation", "action_mask"] diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py index 171facc52a..28fc2e8598 100644 --- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py +++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py @@ -99,7 +99,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: if self.policy.use_continuous_act: actions = torch.as_tensor(batch["actions"]).unsqueeze(-1) else: - actions = torch.as_tensor(batch["actions"]) + actions = torch.as_tensor(batch["actions"], dtype=torch.long) memories = [ torch.as_tensor(batch["memory"][i])