From 6d729a0a2b2ba1fc946720cdb7871c9be3e38d45 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Mon, 16 Nov 2020 17:16:43 -0800 Subject: [PATCH 1/7] match3 settings --- .../Examples/Match3/Scenes/Match3.unity | 12 ++++++++- config/ppo/Match3.yaml | 26 ++++++++++--------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity index 8383c776d2..767db001c3 100644 --- a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity +++ b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.43632728, g: 0.4747097, b: 0.51471573, a: 1} + m_IndirectSpecularColor: {r: 0.43632758, g: 0.47471005, b: 0.5147158, a: 1} m_UseRadianceAmbientProbe: 0 --- !u!157 &3 LightmapSettings: @@ -690,6 +690,11 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: + - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, + type: 3} + propertyPath: m_BehaviorName + value: Match3GreedyHeuristic + objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing @@ -1385,6 +1390,11 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: + - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, + type: 3} + propertyPath: m_BehaviorName + value: Match3GreedyHeuristic + objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing diff --git a/config/ppo/Match3.yaml b/config/ppo/Match3.yaml index e60a138cfa..f7e847d9f6 100644 --- a/config/ppo/Match3.yaml +++ b/config/ppo/Match3.yaml @@ -2,18 +2,18 @@ behaviors: Match3VectorObs: trainer_type: ppo hyperparameters: - 
batch_size: 64 - buffer_size: 12000 + batch_size: 16 + buffer_size: 120 learning_rate: 0.0003 - beta: 0.001 + beta: 0.005 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 128 - num_layers: 2 + hidden_units: 256 + num_layers: 4 vis_encode_type: match3 reward_signals: extrinsic: @@ -21,24 +21,25 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 1000 + time_horizon: 128 summary_freq: 10000 threaded: true + checkpoint_interval: 100000 Match3VisualObs: trainer_type: ppo hyperparameters: - batch_size: 64 - buffer_size: 12000 + batch_size: 16 + buffer_size: 120 learning_rate: 0.0003 - beta: 0.001 + beta: 0.005 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 128 - num_layers: 2 + hidden_units: 256 + num_layers: 4 vis_encode_type: match3 reward_signals: extrinsic: @@ -46,9 +47,10 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 1000 + time_horizon: 128 summary_freq: 10000 threaded: true + checkpoint_interval: 100000 Match3SimpleHeuristic: # Settings can be very simple since we don't care about actually training the model trainer_type: ppo From 190126923c6cd8d013cee0de6ce20cf5224b2e05 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 16 Nov 2020 17:19:14 -0800 Subject: [PATCH 2/7] Add epsilon to log --- ml-agents/mlagents/trainers/torch/distributions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 8540909f6d..68bbd1371b 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -108,13 +108,13 @@ def pdf(self, value): ).squeeze(-1) def log_prob(self, value): - return torch.log(self.pdf(value)) + return torch.log(self.pdf(value) + EPSILON) def all_log_prob(self): - 
return torch.log(self.probs) + return torch.log(self.probs + EPSILON) def entropy(self): - return -torch.sum(self.probs * torch.log(self.probs), dim=-1) + return -torch.sum(self.probs * torch.log(self.probs + EPSILON), dim=-1) class GaussianDistribution(nn.Module): From 6e29be1810157767dd389f68655a4348d0fe0fa8 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 17 Nov 2020 11:00:32 -0800 Subject: [PATCH 3/7] Add another epsilon --- ml-agents/mlagents/trainers/torch/distributions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 68bbd1371b..36e3deddbd 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -184,7 +184,9 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask - normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1).unsqueeze(-1) + normalized_probs = raw_probs / ( + torch.sum(raw_probs, dim=-1).unsqueeze(-1) + EPSILON + ) normalized_logits = torch.log(normalized_probs + EPSILON) return normalized_logits From 7b7dbf9693b3ea6995c73c1d8d3f73d5d4d3e732 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 17 Nov 2020 16:09:12 -0800 Subject: [PATCH 4/7] Revert match3 configs --- .../Examples/Match3/Scenes/Match3.unity | 12 +-------- config/ppo/Match3.yaml | 26 +++++++++---------- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity index 767db001c3..8383c776d2 100644 --- a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity +++ b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 
m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.43632758, g: 0.47471005, b: 0.5147158, a: 1} + m_IndirectSpecularColor: {r: 0.43632728, g: 0.4747097, b: 0.51471573, a: 1} m_UseRadianceAmbientProbe: 0 --- !u!157 &3 LightmapSettings: @@ -690,11 +690,6 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: - - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, - type: 3} - propertyPath: m_BehaviorName - value: Match3GreedyHeuristic - objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing @@ -1390,11 +1385,6 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: - - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, - type: 3} - propertyPath: m_BehaviorName - value: Match3GreedyHeuristic - objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing diff --git a/config/ppo/Match3.yaml b/config/ppo/Match3.yaml index f7e847d9f6..e60a138cfa 100644 --- a/config/ppo/Match3.yaml +++ b/config/ppo/Match3.yaml @@ -2,18 +2,18 @@ behaviors: Match3VectorObs: trainer_type: ppo hyperparameters: - batch_size: 16 - buffer_size: 120 + batch_size: 64 + buffer_size: 12000 learning_rate: 0.0003 - beta: 0.005 + beta: 0.001 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 256 - num_layers: 4 + hidden_units: 128 + num_layers: 2 vis_encode_type: match3 reward_signals: extrinsic: @@ -21,25 +21,24 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 128 + time_horizon: 1000 summary_freq: 10000 threaded: true - checkpoint_interval: 100000 Match3VisualObs: trainer_type: ppo hyperparameters: - batch_size: 16 - buffer_size: 120 + batch_size: 64 + buffer_size: 12000 learning_rate: 0.0003 
- beta: 0.005 + beta: 0.001 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 256 - num_layers: 4 + hidden_units: 128 + num_layers: 2 vis_encode_type: match3 reward_signals: extrinsic: @@ -47,10 +46,9 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 128 + time_horizon: 1000 summary_freq: 10000 threaded: true - checkpoint_interval: 100000 Match3SimpleHeuristic: # Settings can be very simple since we don't care about actually training the model trainer_type: ppo From 22800d7688afcc07a616459ec5df99958ff55ee7 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 17 Nov 2020 19:00:55 -0800 Subject: [PATCH 5/7] NaN-free masking method --- ml-agents/mlagents/trainers/torch/distributions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 36e3deddbd..07aea5870c 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -183,12 +183,12 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: return nn.ModuleList(branches) def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: - raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask - normalized_probs = raw_probs / ( - torch.sum(raw_probs, dim=-1).unsqueeze(-1) + EPSILON - ) - normalized_logits = torch.log(normalized_probs + EPSILON) - return normalized_logits + # Zero out masked logits, then subtract a large value + flipped_mask = 1.0 - mask + adj_logits = logits * mask - 1e8 * flipped_mask + probs = torch.nn.functional.softmax(adj_logits, dim=-1) + log_probs = torch.log(probs + EPSILON) + return log_probs def _split_masks(self, masks: torch.Tensor) -> List[torch.Tensor]: split_masks = [] From eafb0e8ccc6f38e705e0e1fe719ebbe5b1cf9131 Mon Sep 17 00:00:00 2001 From: Ervin 
Teng Date: Wed, 18 Nov 2020 11:49:27 -0800 Subject: [PATCH 6/7] Add comment for paper --- ml-agents/mlagents/trainers/torch/distributions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 07aea5870c..673e7e340f 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -183,7 +183,8 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: return nn.ModuleList(branches) def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: - # Zero out masked logits, then subtract a large value + # Zero out masked logits, then subtract a large value. Technique mentioend here: + # https://arxiv.org/abs/2006.14171 flipped_mask = 1.0 - mask adj_logits = logits * mask - 1e8 * flipped_mask probs = torch.nn.functional.softmax(adj_logits, dim=-1) From 9de1525ea1adf19ee6f09097bda4a74a6b9d56d0 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 18 Nov 2020 11:50:04 -0800 Subject: [PATCH 7/7] Add comment for paper --- ml-agents/mlagents/trainers/torch/distributions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 673e7e340f..5778549061 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -183,8 +183,8 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: return nn.ModuleList(branches) def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: - # Zero out masked logits, then subtract a large value. Technique mentioend here: - # https://arxiv.org/abs/2006.14171 + # Zero out masked logits, then subtract a large value. Technique mentioned here: + # https://arxiv.org/abs/2006.14171.
Our implementation is ONNX and Barracuda-friendly. flipped_mask = 1.0 - mask adj_logits = logits * mask - 1e8 * flipped_mask probs = torch.nn.functional.softmax(adj_logits, dim=-1)