Commit

fix: fix the distribution of discrete actions (#34)
StepNeverStop committed Jan 12, 2021
1 parent b928cee commit 3614ad7
Showing 11 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion rls/algos/hierarchical/hiro.py
@@ -282,7 +282,7 @@ def train_low(self, BATCH: Low_BatchExperiences):
action_target = target_output
else:
target_logits = target_output
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
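The same one-line change recurs in the target-policy branches below (curl.py, ddpg.py, dpg.py, pd_ddpg.py, sac.py, tac.py, td3.py): the Categorical distribution is now built directly from the network's raw logits rather than from tf.nn.log_softmax(logits). A minimal, self-contained sketch of the resulting sample → log-prob → one-hot pattern; the logits values, batch size, and a_dim are illustrative, not taken from the repository:

```python
import tensorflow as tf
import tensorflow_probability as tfp

a_dim = 4  # illustrative number of discrete actions
target_logits = tf.constant([[1.2, -0.3, 0.5, 0.0],
                             [0.1, 0.7, -1.0, 2.3]])  # [B, a_dim] raw network output

# Categorical treats `logits` as unnormalized log-probabilities and
# normalizes internally, so no explicit log_softmax is applied first.
target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()                            # [B] integer actions
target_log_pi = target_cate_dist.log_prob(target_pi)             # [B] log-prob of each sample
action_target = tf.one_hot(target_pi, a_dim, dtype=tf.float32)   # [B, a_dim] one-hot actions
```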
4 changes: 2 additions & 2 deletions rls/algos/hierarchical/ioc.py
@@ -145,11 +145,11 @@ def _get_action(self, obs, cell_state, options):
a, _ = gaussian_clip_rsample(mu, log_std)
else:
pi = pi / self.boltzmann_temperature
- dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(pi)) # [B, ]
+ dist = tfp.distributions.Categorical(logits=pi) # [B, ]
a = dist.sample()
interests = self.interest_net.value_net(feat) # [B, P]
op_logits = interests * q # [B, P] or tf.nn.softmax(q)
- new_options = tfp.distributions.Categorical(logits=tf.nn.log_softmax(op_logits)).sample()
+ new_options = tfp.distributions.Categorical(logits=op_logits).sample()
return a, new_options, cell_state

def _target_params_update(self):
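In ioc.py both samplers now receive raw logits: the intra-option action is drawn from temperature-scaled policy logits, and the next option is drawn from interest-weighted option values. A hedged sketch of that two-level sampling; the batch size, action/option counts, temperature, and random tensors are stand-ins for the actual network outputs:

```python
import tensorflow as tf
import tensorflow_probability as tfp

boltzmann_temperature = 1.0              # illustrative temperature
pi = tf.random.normal([3, 4])            # [B, A] action logits of the active option
interests = tf.random.uniform([3, 2])    # [B, P] interest-function output
q = tf.random.normal([3, 2])             # [B, P] option values

# Intra-option action: temperature-scaled logits, normalized inside Categorical.
dist = tfp.distributions.Categorical(logits=pi / boltzmann_temperature)  # [B]
a = dist.sample()

# Option selection: interest-weighted option values used directly as logits.
op_logits = interests * q                                                # [B, P]
new_options = tfp.distributions.Categorical(logits=op_logits).sample()   # [B]
```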
2 changes: 1 addition & 1 deletion rls/algos/hierarchical/oc.py
@@ -147,7 +147,7 @@ def _get_action(self, obs, cell_state, options):
a, _ = gaussian_clip_rsample(mu, log_std)
else:
pi = pi / self.boltzmann_temperature
- dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(pi)) # [B, ]
+ dist = tfp.distributions.Categorical(logits=pi) # [B, ]
a = dist.sample()
max_options = tf.cast(tf.argmax(q, axis=-1), dtype=tf.int32) # [B, P] => [B, ]
if self.use_eps_greedy:
2 changes: 1 addition & 1 deletion rls/algos/single/curl.py
@@ -256,7 +256,7 @@ def train(self, BATCH, isw, cell_state, visual, visual_, pos):
target_pi, target_log_pi = squash_rsample(target_mu, target_log_std)
else:
target_logits = self.actor_net.value_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/ddpg.py
@@ -139,7 +139,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.ac_net.policy_net(feat)
else:
target_logits = self.ac_target_net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/dpg.py
@@ -125,7 +125,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.net.policy_net(feat)
else:
target_logits = self.net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/maxsqn.py
@@ -83,7 +83,7 @@ def choose_action(self, obs, evaluation=False):
def _get_action(self, obs, cell_state):
with tf.device(self.device):
q, _, cell_state = self.critic_net(obs, cell_state=cell_state)
- cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(q / self.alpha))
+ cate_dist = tfp.distributions.Categorical(logits=(q / self.alpha))
pi = cate_dist.sample()
return tf.argmax(q, axis=1), pi, cell_state

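maxsqn.py applies the same idea with a maximum-entropy flavor: the critic's Q-values divided by the entropy temperature alpha act as the unnormalized logits of the soft policy. A small sketch under assumed shapes and an assumed alpha; the random Q tensor stands in for the critic output:

```python
import tensorflow as tf
import tensorflow_probability as tfp

alpha = 0.2                      # illustrative entropy temperature
q = tf.random.normal([5, 3])     # [B, A] Q-values standing in for the critic output

# Soft policy: Q / alpha used directly as unnormalized logits.
cate_dist = tfp.distributions.Categorical(logits=q / alpha)
pi = cate_dist.sample()          # [B] stochastic (exploratory) action
greedy = tf.argmax(q, axis=1)    # [B] greedy action, e.g. for evaluation
```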
2 changes: 1 addition & 1 deletion rls/algos/single/pd_ddpg.py
@@ -155,7 +155,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.ac_net.policy_net(feat)
else:
target_logits = self.ac_target_net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/sac.py
@@ -188,7 +188,7 @@ def train_continuous(self, BATCH, isw, cell_state):
entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

target_logits = self.actor_net.value_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
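The surrounding context in sac.py (and tac.py below) also computes the discrete policy's entropy bonus from the full log-probability table logp_all. As a side note, a Categorical built on the same raw logits yields an equivalent batch-mean entropy; a small sketch with made-up logits:

```python
import tensorflow as tf
import tensorflow_probability as tfp

logits = tf.random.normal([4, 3])      # [B, A] illustrative policy logits
logp_all = tf.nn.log_softmax(logits)   # [B, A] log-probabilities of every action

# Entropy as written in the diff context: -E[ sum_a p(a) * log p(a) ]
entropy_manual = -tf.reduce_mean(
    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

# The same quantity from the distribution object built on the raw logits.
entropy_dist = tf.reduce_mean(
    tfp.distributions.Categorical(logits=logits).entropy())
```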
2 changes: 1 addition & 1 deletion rls/algos/single/tac.py
@@ -174,7 +174,7 @@ def train(self, BATCH, isw, cell_state):
entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))

target_logits = self.actor_net.value_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
2 changes: 1 addition & 1 deletion rls/algos/single/td3.py
@@ -128,7 +128,7 @@ def _train(self, BATCH, isw, cell_state):
mu = self.ac_net.policy_net(feat)
else:
target_logits = self.ac_target_net.policy_net(feat_)
- target_cate_dist = tfp.distributions.Categorical(logits=tf.nn.log_softmax(target_logits))
+ target_cate_dist = tfp.distributions.Categorical(logits=target_logits)
target_pi = target_cate_dist.sample()
target_log_pi = target_cate_dist.log_prob(target_pi)
action_target = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
