Remove unused "last reward" logic, TF nodes
At each step, an unused `last_reward` variable in the TF graph is
updated in our PPO trainer.  There are also related unused methods
in various places in the codebase.  This change removes them.
Jonathan Harper committed Jul 3, 2019
1 parent f4bc8ef commit f6f967a
Showing 7 changed files with 9 additions and 62 deletions.
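
For context, the dead pattern being deleted looks roughly like the sketch below. This is a minimal illustration assuming TensorFlow 1.x (the graph API ml-agents used here), not the trainer code itself; the op names mirror create_reward_encoder in the diff, while the session code is made up for the example. The variable is written on every step but never read by any loss, summary, or other consumer, so it only adds graph nodes and an extra sess.run call.

import numpy as np
import tensorflow as tf

# The three ops create_reward_encoder() returned: a non-trainable variable,
# a scalar feed placeholder, and an assign op connecting them.
last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward")
update_reward = tf.assign(last_reward, new_reward)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Roughly what increment_step_and_update_last_reward did on every step:
    sess.run(update_reward, feed_dict={new_reward: np.mean([1.0, 2.0, 3.0])})
    # The stored value is never consumed anywhere else, so this work is wasted.
    print(sess.run(last_reward))  # 2.0
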
15 changes: 2 additions & 13 deletions ml-agents/mlagents/trainers/bc/trainer.py
@@ -67,20 +67,9 @@ def get_step(self):
         """
         return self.policy.get_current_step()

-    @property
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        if len(self.stats["Environment/Cumulative Reward"]) > 0:
-            return np.mean(self.stats["Environment/Cumulative Reward"])
-        else:
-            return 0
-
-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and Updates the last reward
+        Increment the step count of the trainer
         """
         self.policy.increment_step()
         return
13 changes: 0 additions & 13 deletions ml-agents/mlagents/trainers/ppo/models.py
@@ -45,9 +45,6 @@ def __init__(
         )
         if num_layers < 1:
             num_layers = 1
-        self.last_reward, self.new_reward, self.update_reward = (
-            self.create_reward_encoder()
-        )
         if brain.vector_action_space_type == "continuous":
             self.create_cc_actor_critic(h_size, num_layers)
             self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
@@ -64,16 +61,6 @@ def __init__(
             max_step,
         )

-    @staticmethod
-    def create_reward_encoder():
-        """Creates TF ops to track and increment recent average cumulative reward."""
-        last_reward = tf.Variable(
-            0, name="last_reward", trainable=False, dtype=tf.float32
-        )
-        new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward")
-        update_reward = tf.assign(last_reward, new_reward)
-        return last_reward, new_reward, update_reward
-
     def create_losses(
         self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
     ):
16 changes: 0 additions & 16 deletions ml-agents/mlagents/trainers/ppo/policy.py
@@ -221,19 +221,3 @@ def get_action(self, brain_info: BrainInfo) -> ActionInfo:
             value=mean_values,
             outputs=run_out,
         )
-
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        return self.sess.run(self.model.last_reward)
-
-    def update_reward(self, new_reward):
-        """
-        Updates reward value for policy.
-        :param new_reward: New reward to save.
-        """
-        self.sess.run(
-            self.model.update_reward, feed_dict={self.model.new_reward: new_reward}
-        )
7 changes: 2 additions & 5 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -122,13 +122,10 @@ def reward_buffer(self):
         """
         return self._reward_buffer

-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and Updates the last reward
+        Increment the step count of the trainer
         """
-        if self.stats["Environment/Cumulative Reward"]:
-            mean_reward = np.mean(self.stats["Environment/Cumulative Reward"])
-            self.policy.update_reward(mean_reward)
         self.policy.increment_step()
         self.step = self.policy.get_current_step()

2 changes: 1 addition & 1 deletion
@@ -425,4 +425,4 @@ def test_take_step_adds_experiences_to_trainer_and_trains():
     )
     trainer_mock.update_policy.assert_called_once()
     trainer_mock.write_summary.assert_called_once()
-    trainer_mock.increment_step_and_update_last_reward.assert_called_once()
+    trainer_mock.increment_step.assert_called_once()
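
The updated assertion relies on the call tracking that unittest.mock provides out of the box; a tiny standalone illustration of that pattern is below (the controller wiring from the real test is omitted, and the mock setup here is only for the example):

from unittest import mock

trainer_mock = mock.MagicMock()

# The code under test is expected to call the renamed method exactly once:
trainer_mock.increment_step()

# MagicMock records every call, so the test can assert on it afterwards:
trainer_mock.increment_step.assert_called_once()
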
16 changes: 3 additions & 13 deletions ml-agents/mlagents/trainers/trainer.py
@@ -109,21 +109,11 @@ def get_step(self):
         """
         raise UnityTrainerException("The get_step property was not implemented.")

-    @property
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        raise UnityTrainerException("The get_last_reward property was not implemented.")
-
-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and updates the last reward
+        Increment the step count of the trainer
         """
-        raise UnityTrainerException(
-            "The increment_step_and_update_last_reward method was not implemented."
-        )
+        raise UnityTrainerException("The increment_step method was not implemented.")

     def get_action(self, curr_info: BrainInfo) -> ActionInfo:
         """
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/trainer_controller.py
@@ -339,5 +339,5 @@ def take_step(
         else:
             trainer.write_summary(self.global_step, delta_train_start)
         if self.train_model and trainer.get_step <= trainer.get_max_steps:
-            trainer.increment_step_and_update_last_reward()
+            trainer.increment_step()
         return new_info
