From 6d729a0a2b2ba1fc946720cdb7871c9be3e38d45 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Mon, 16 Nov 2020 17:16:43 -0800 Subject: [PATCH 1/7] match3 settings --- .../Examples/Match3/Scenes/Match3.unity | 12 ++++++++- config/ppo/Match3.yaml | 26 ++++++++++--------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity index 8383c776d2..767db001c3 100644 --- a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity +++ b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.43632728, g: 0.4747097, b: 0.51471573, a: 1} + m_IndirectSpecularColor: {r: 0.43632758, g: 0.47471005, b: 0.5147158, a: 1} m_UseRadianceAmbientProbe: 0 --- !u!157 &3 LightmapSettings: @@ -690,6 +690,11 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: + - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, + type: 3} + propertyPath: m_BehaviorName + value: Match3GreedyHeuristic + objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing @@ -1385,6 +1390,11 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: + - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, + type: 3} + propertyPath: m_BehaviorName + value: Match3GreedyHeuristic + objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing diff --git a/config/ppo/Match3.yaml b/config/ppo/Match3.yaml index e60a138cfa..f7e847d9f6 100644 --- a/config/ppo/Match3.yaml +++ b/config/ppo/Match3.yaml @@ -2,18 +2,18 @@ behaviors: Match3VectorObs: trainer_type: ppo hyperparameters: - 
batch_size: 64 - buffer_size: 12000 + batch_size: 16 + buffer_size: 120 learning_rate: 0.0003 - beta: 0.001 + beta: 0.005 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 128 - num_layers: 2 + hidden_units: 256 + num_layers: 4 vis_encode_type: match3 reward_signals: extrinsic: @@ -21,24 +21,25 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 1000 + time_horizon: 128 summary_freq: 10000 threaded: true + checkpoint_interval: 100000 Match3VisualObs: trainer_type: ppo hyperparameters: - batch_size: 64 - buffer_size: 12000 + batch_size: 16 + buffer_size: 120 learning_rate: 0.0003 - beta: 0.001 + beta: 0.005 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 128 - num_layers: 2 + hidden_units: 256 + num_layers: 4 vis_encode_type: match3 reward_signals: extrinsic: @@ -46,9 +47,10 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 1000 + time_horizon: 128 summary_freq: 10000 threaded: true + checkpoint_interval: 100000 Match3SimpleHeuristic: # Settings can be very simple since we don't care about actually training the model trainer_type: ppo From 190126923c6cd8d013cee0de6ce20cf5224b2e05 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Mon, 16 Nov 2020 17:19:14 -0800 Subject: [PATCH 2/7] Add epsilon to log --- ml-agents/mlagents/trainers/torch/distributions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 8540909f6d..68bbd1371b 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -108,13 +108,13 @@ def pdf(self, value): ).squeeze(-1) def log_prob(self, value): - return torch.log(self.pdf(value)) + return torch.log(self.pdf(value) + EPSILON) def all_log_prob(self): - 
return torch.log(self.probs) + return torch.log(self.probs + EPSILON) def entropy(self): - return -torch.sum(self.probs * torch.log(self.probs), dim=-1) + return -torch.sum(self.probs * torch.log(self.probs + EPSILON), dim=-1) class GaussianDistribution(nn.Module): From 6e29be1810157767dd389f68655a4348d0fe0fa8 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 17 Nov 2020 11:00:32 -0800 Subject: [PATCH 3/7] Add another epsilon --- ml-agents/mlagents/trainers/torch/distributions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 68bbd1371b..36e3deddbd 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -184,7 +184,9 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask - normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1).unsqueeze(-1) + normalized_probs = raw_probs / ( + torch.sum(raw_probs, dim=-1).unsqueeze(-1) + EPSILON + ) normalized_logits = torch.log(normalized_probs + EPSILON) return normalized_logits From 7b7dbf9693b3ea6995c73c1d8d3f73d5d4d3e732 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 17 Nov 2020 16:09:12 -0800 Subject: [PATCH 4/7] Revert match3 configs --- .../Examples/Match3/Scenes/Match3.unity | 12 +-------- config/ppo/Match3.yaml | 26 +++++++++---------- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity index 767db001c3..8383c776d2 100644 --- a/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity +++ b/Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 
m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.43632758, g: 0.47471005, b: 0.5147158, a: 1} + m_IndirectSpecularColor: {r: 0.43632728, g: 0.4747097, b: 0.51471573, a: 1} m_UseRadianceAmbientProbe: 0 --- !u!157 &3 LightmapSettings: @@ -690,11 +690,6 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: - - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, - type: 3} - propertyPath: m_BehaviorName - value: Match3GreedyHeuristic - objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing @@ -1390,11 +1385,6 @@ PrefabInstance: m_Modification: m_TransformParent: {fileID: 0} m_Modifications: - - target: {fileID: 3508723250470608010, guid: 2fafdcd0587684641b03b11f04454f1b, - type: 3} - propertyPath: m_BehaviorName - value: Match3GreedyHeuristic - objectReference: {fileID: 0} - target: {fileID: 3508723250470608011, guid: 2fafdcd0587684641b03b11f04454f1b, type: 3} propertyPath: cubeSpacing diff --git a/config/ppo/Match3.yaml b/config/ppo/Match3.yaml index f7e847d9f6..e60a138cfa 100644 --- a/config/ppo/Match3.yaml +++ b/config/ppo/Match3.yaml @@ -2,18 +2,18 @@ behaviors: Match3VectorObs: trainer_type: ppo hyperparameters: - batch_size: 16 - buffer_size: 120 + batch_size: 64 + buffer_size: 12000 learning_rate: 0.0003 - beta: 0.005 + beta: 0.001 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 256 - num_layers: 4 + hidden_units: 128 + num_layers: 2 vis_encode_type: match3 reward_signals: extrinsic: @@ -21,25 +21,24 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 128 + time_horizon: 1000 summary_freq: 10000 threaded: true - checkpoint_interval: 100000 Match3VisualObs: trainer_type: ppo hyperparameters: - batch_size: 16 - buffer_size: 120 + batch_size: 64 + buffer_size: 12000 learning_rate: 0.0003 
- beta: 0.005 + beta: 0.001 epsilon: 0.2 lambd: 0.99 num_epoch: 3 learning_rate_schedule: constant network_settings: normalize: true - hidden_units: 256 - num_layers: 4 + hidden_units: 128 + num_layers: 2 vis_encode_type: match3 reward_signals: extrinsic: @@ -47,10 +46,9 @@ behaviors: strength: 1.0 keep_checkpoints: 5 max_steps: 5000000 - time_horizon: 128 + time_horizon: 1000 summary_freq: 10000 threaded: true - checkpoint_interval: 100000 Match3SimpleHeuristic: # Settings can be very simple since we don't care about actually training the model trainer_type: ppo From 22800d7688afcc07a616459ec5df99958ff55ee7 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Tue, 17 Nov 2020 19:00:55 -0800 Subject: [PATCH 5/7] NaN-free masking method --- ml-agents/mlagents/trainers/torch/distributions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 36e3deddbd..07aea5870c 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -183,12 +183,12 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: return nn.ModuleList(branches) def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: - raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask - normalized_probs = raw_probs / ( - torch.sum(raw_probs, dim=-1).unsqueeze(-1) + EPSILON - ) - normalized_logits = torch.log(normalized_probs + EPSILON) - return normalized_logits + # Zero out masked logits, then subtract a large value + flipped_mask = 1.0 - mask + adj_logits = logits * mask - 1e8 * flipped_mask + probs = torch.nn.functional.softmax(adj_logits, dim=-1) + log_probs = torch.log(probs + EPSILON) + return log_probs def _split_masks(self, masks: torch.Tensor) -> List[torch.Tensor]: split_masks = [] From eafb0e8ccc6f38e705e0e1fe719ebbe5b1cf9131 Mon Sep 17 00:00:00 2001 From: Ervin 
Teng Date: Wed, 18 Nov 2020 11:49:27 -0800 Subject: [PATCH 6/7] Add comment for paper --- ml-agents/mlagents/trainers/torch/distributions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 07aea5870c..673e7e340f 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -183,7 +183,8 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: return nn.ModuleList(branches) def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: - # Zero out masked logits, then subtract a large value + # Zero out masked logits, then subtract a large value. Technique mentioend here: + # https://arxiv.org/abs/2006.14171 flipped_mask = 1.0 - mask adj_logits = logits * mask - 1e8 * flipped_mask probs = torch.nn.functional.softmax(adj_logits, dim=-1) From 9de1525ea1adf19ee6f09097bda4a74a6b9d56d0 Mon Sep 17 00:00:00 2001 From: Ervin Teng Date: Wed, 18 Nov 2020 11:50:04 -0800 Subject: [PATCH 7/7] Add comment for paper --- ml-agents/mlagents/trainers/torch/distributions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-agents/mlagents/trainers/torch/distributions.py b/ml-agents/mlagents/trainers/torch/distributions.py index 673e7e340f..5778549061 100644 --- a/ml-agents/mlagents/trainers/torch/distributions.py +++ b/ml-agents/mlagents/trainers/torch/distributions.py @@ -183,8 +183,8 @@ def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: return nn.ModuleList(branches) def _mask_branch(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: - # Zero out masked logits, then subtract a large value. Technique mentioend here: - # https://arxiv.org/abs/2006.14171 + # Zero out masked logits, then subtract a large value. Technique mentioned here: + # https://arxiv.org/abs/2006.14171.
Our implementation is ONNX and Barracuda-friendly. flipped_mask = 1.0 - mask adj_logits = logits * mask - 1e8 * flipped_mask probs = torch.nn.functional.softmax(adj_logits, dim=-1)