Commit

fix: fix the distribution of discrete actions (#34)
StepNeverStop committed Jan 12, 2021
1 parent b928cee commit 3614ad7
Showing 11 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion rls/algos/hierarchical/hiro.py
@@ -282,7 +282,7 @@ def train_low(self, BATCH: Low_BatchExperiences):
action_target = target_output
else:
target_logits = target_output
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
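The same one-line change recurs in the target-policy branches below (curl.py, ddpg.py, dpg.py, pd_ddpg.py, sac.py, tac.py, td3.py): the Categorical distribution is now built directly from the network's raw logits rather than from tf.nn.log_softmax(logits). A minimal, self-contained sketch of the resulting sample → log-prob → one-hot pattern; the logits values, batch size, and a_dim are illustrative, not taken from the repository:

```python
import tensorflow as tf
import tensorflow_probability as tfp

a_dim = 4  # illustrative number of discrete actions
target_logits = tf.constant([[1.2, -0.3, 0.5, 0.0],
                             [0.1, 0.7, -1.0, 2.3]])  # [B, a_dim] raw network output

# Categorical treats `logits` as unnormalized log-probabilities and
# normalizes internally, so no explicit log_softmax is applied first.
target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()                            # [B] integer actions
target_log_pi = target_cate_dist.log_prob(target_pi)             # [B] log-prob of each sample
action_target = tf.one_hot(target_pi, a_dim, dtype=tf.float32)   # [B, a_dim] one-hot actions
```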
4 changes: 2 additions & 2 deletions rls/algos/hierarchical/ioc.py
@@ -145,11 +145,11 @@ def _get_action(self, obs, cell_state, options):
a, _ = gaussian_clip_rsample(mu, log_std)
else:
pi = pi / self.boltzmann_temperature
- dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(pi)) # [B, ]
+ dist = tfp.distributions.Categorical(logits=pi) # [B, ]
a = dist.sample()
interests = self.interest_net.value_net(feat) # [B, P]
op_logits = interests * q # [B, P] or tf.nn.softmax(q)
- new_options = tfp.distributions.Categorical(logits=tf.nn.log_softmax(op_logits)).sample()
+ new_options = tfp.distributions.Categorical(logits=op_logits).sample()
return a, new_options, cell_state

def _target_params_update(self):
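In ioc.py both samplers now receive raw logits: the intra-option action is drawn from temperature-scaled policy logits, and the next option is drawn from interest-weighted option values. A hedged sketch of that two-level sampling; the batch size, action/option counts, temperature, and random tensors are stand-ins for the actual network outputs:

```python
import tensorflow as tf
import tensorflow_probability as tfp

boltzmann_temperature = 1.0              # illustrative temperature
pi = tf.random.normal([3, 4])            # [B, A] action logits of the active option
interests = tf.random.uniform([3, 2])    # [B, P] interest-function output
q = tf.random.normal([3, 2])             # [B, P] option values

# Intra-option action: temperature-scaled logits, normalized inside Categorical.
dist = tfp.distributions.Categorical(logits=pi / boltzmann_temperature)  # [B]
a = dist.sample()

# Option selection: interest-weighted option values used directly as logits.
op_logits = interests * q                                                # [B, P]
new_options = tfp.distributions.Categorical(logits=op_logits).sample()   # [B]
```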
2 changes: 1 addition & 1 deletion rls/algos/hierarchical/oc.py
@@ -147,7 +147,7 @@ def _get_action(self, obs, cell_state, options):
a, _ = gaussian_clip_rsample(mu, log_std)
else:
pi = pi / self.boltzmann_temperature
- dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(pi)) # [B, ]
+ dist = tfp.distributions.Categorical(logits=pi) # [B, ]
a = dist.sample()
max_options = tf.cast(tf.argmax(q, axis=-1), dtype=tf.int32) # [B, P] => [B, ]
if self.use_eps_greedy:
2 changes: 1 addition & 1 deletion rls/algos/single/curl.py
@@ -256,7 +256,7 @@ def train(self, BATCH, isw, cell_state, visual, visual_, pos):
target_pi, target_log_pi = squash_rsample(target_mu, target_log_std)
else:
target_logits = self.actor_net.value_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/ddpg.py
@@ -139,7 +139,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.ac_net.policy_net(feat)
else:
target_logits = self.ac_target_net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/dpg.py
@@ -125,7 +125,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.net.policy_net(feat)
else:
target_logits = self.net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/maxsqn.py
@@ -83,7 +83,7 @@ def choose_action(self, obs, evaluation=False):
def _get_action(self, obs, cell_state):
with tf.device(self.device):
q, _, cell_state = self.critic_net(obs, cell_state=cell_state)
- cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(q / self.alpha))
+ cate_dist = tfp.distributions.Categorical(logits=(q / self.alpha))
pi = cate_dist.sample()
return tf.argmax(q, axis=1), pi, cell_state

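maxsqn.py applies the same idea with a maximum-entropy flavor: the critic's Q-values divided by the entropy temperature alpha act as the unnormalized logits of the soft policy. A small sketch under assumed shapes and an assumed alpha; the random Q tensor stands in for the critic output:

```python
import tensorflow as tf
import tensorflow_probability as tfp

alpha = 0.2                      # illustrative entropy temperature
q = tf.random.normal([5, 3])     # [B, A] Q-values standing in for the critic output

# Soft policy: Q / alpha used directly as unnormalized logits.
cate_dist = tfp.distributions.Categorical(logits=q / alpha)
pi = cate_dist.sample()          # [B] stochastic (exploratory) action
greedy = tf.argmax(q, axis=1)    # [B] greedy action, e.g. for evaluation
```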
2 changes: 1 addition & 1 deletion rls/algos/single/pd_ddpg.py
@@ -155,7 +155,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.ac_net.policy_net(feat)
else:
target_logits = self.ac_target_net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/sac.py
@@ -188,7 +188,7 @@ def train_continuous(self, BATCH, isw, cell_state):
entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

target_logits = self.actor_net.value_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
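The surrounding context in sac.py (and tac.py below) also computes the discrete policy's entropy bonus from the full log-probability table logp_all. As a side note, a Categorical built on the same raw logits yields an equivalent batch-mean entropy; a small sketch with made-up logits:

```python
import tensorflow as tf
import tensorflow_probability as tfp

logits = tf.random.normal([4, 3])      # [B, A] illustrative policy logits
logp_all = tf.nn.log_softmax(logits)   # [B, A] log-probabilities of every action

# Entropy as written in the diff context: -E[ sum_a p(a) * log p(a) ]
entropy_manual = -tf.reduce_mean(
    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

# The same quantity from the distribution object built on the raw logits.
entropy_dist = tf.reduce_mean(
    tfp.distributions.Categorical(logits=logits).entropy())
```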
2 changes: 1 addition & 1 deletion rls/algos/single/tac.py
@@ -174,7 +174,7 @@ def train(self, BATCH, isw, cell_state):
entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

target_logits = self.actor_net.value_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/td3.py
@@ -128,7 +128,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.ac_net.policy_net(feat)
else:
target_logits = self.ac_target_net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
