v4.1.3 fix: fixed off-policy RNN training. (#43, #34)
1. optimized `iTensor_oNumpy`
2. renamed `train_time_step` to `rnn_time_steps` and `burn_in_time_step` to `burn_in_time_steps`
3. optimized `on_policy_buffer.py`
4. optimized `EpisodeExperienceReplay`
5. fixed off-policy RNN training
6. optimized and fixed `to_numpy` and `to_tensor`
7. reimplemented `call` and invoked it from `__call__` (see the sketch below the file summary)
StepNeverStop committed Jul 29, 2021
1 parent f95a510 commit 964f78b
Showing 53 changed files with 422 additions and 334 deletions.
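The recurring pattern in the diffs below (commit items 6 and 7) is that each algorithm's `__call__` now keeps only the numpy-side bookkeeping and delegates the tensor computation to a `call` method wrapped by `iTensor_oNumpy`; in `off_policy.py` the same wrapper is also applied inline to `self.rep_net`. The sketch below is a hypothetical stand-in for that pattern only: the decorator body and the `TinyPolicy` class are illustrative assumptions, not the repo's actual `rls/common/decorator.py`, which (per the hunks below) additionally accepts a `device` argument and understands the library's batch dataclasses.

```python
import numpy as np
import torch as t


def iTensor_oNumpy(func):
    # Hypothetical re-implementation, for illustration only:
    # ndarray inputs -> tensors, run the wrapped callable, tensor outputs -> ndarrays.
    def wrapper(*args, **kwargs):
        args = [t.as_tensor(a) if isinstance(a, np.ndarray) else a for a in args]
        kwargs = {k: t.as_tensor(v) if isinstance(v, np.ndarray) else v
                  for k, v in kwargs.items()}
        rets = func(*args, **kwargs)
        if isinstance(rets, tuple):
            return tuple(r.detach().cpu().numpy() if t.is_tensor(r) else r for r in rets)
        return rets.detach().cpu().numpy() if t.is_tensor(rets) else rets
    return wrapper


class TinyPolicy:
    # __call__ keeps numpy-side state; the decorated call() is tensor-only.
    def __call__(self, obs):
        return self.call(obs)        # ndarray in, ndarray out

    @iTensor_oNumpy
    def call(self, obs):
        return (obs * 2.0).tanh()    # pure tensor computation


print(TinyPolicy()(np.ones((2, 3), dtype=np.float32)))
```

Keeping the numpy/tensor boundary inside one decorator is what lets `__call__` store plain numpy state such as `self.options` and `self.oc_mask` while `call` stays a pure tensor function.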
rls/_metadata.py (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@
# We follow Semantic Versioning (https://semver.org/)
_MAJOR_VERSION = '4'
_MINOR_VERSION = '1'
-_PATCH_VERSION = '2'
+_PATCH_VERSION = '3'

# Example: '0.4.2'
__version__ = '.'.join([_MAJOR_VERSION, _MINOR_VERSION, _PATCH_VERSION])
rls/algorithms/base/base.py (2 changes: 1 addition & 1 deletion)
@@ -112,7 +112,7 @@ def write_summaries(self,
summaries: Dict = {},
writer: Optional[SummaryWriter] = None) -> NoReturn:
'''
-write tf summaries showing in tensorboard.
+write summaries showing in tensorboard.
'''
if not self.no_save:
writer = writer or self.writer
rls/algorithms/base/ma_off_policy.py (12 changes: 6 additions & 6 deletions)
@@ -22,8 +22,8 @@ def __init__(self,
envspecs,
buffer_size=10000,
n_step=1,
-burn_in_time_step=10,
-train_time_step=10,
+burn_in_time_steps=10,
+rnn_time_steps=10,
episode_batch_size=32,
episode_buffer_size=10000,
train_times_per_step=1,
@@ -32,8 +32,8 @@

self.buffer_size = buffer_size
self.n_step = n_step
-self.burn_in_time_step = burn_in_time_step
-self.train_time_step = train_time_step
+self.burn_in_time_steps = burn_in_time_steps
+self.rnn_time_steps = rnn_time_steps
self.episode_batch_size = episode_batch_size
self.episode_buffer_size = episode_buffer_size
self.train_times_per_step = train_times_per_step
@@ -48,8 +48,8 @@ def initialize_data_buffer(self) -> NoReturn:
# _buffer_args.update(
# batch_size=self.episode_batch_size,
# capacity=self.episode_buffer_size,
-# burn_in_time_step=self.burn_in_time_step,
-# train_time_step=self.train_time_step,
+# burn_in_time_steps=self.burn_in_time_steps,
+# rnn_time_steps=self.rnn_time_steps,
# n_copys=self.n_copys
# )
# else:
rls/algorithms/base/ma_policy.py (2 changes: 1 addition & 1 deletion)
@@ -100,7 +100,7 @@ def write_summaries(self,
summaries: Dict,
writer=None) -> NoReturn:
'''
-write tf summaries showing in tensorboard.
+write summaries showing in tensorboard.
'''
if 'model' in summaries.keys():
super().write_summaries(global_step, summaries=summaries.pop('model'), writer=self.writer)
rls/algorithms/base/off_policy.py (30 changes: 16 additions & 14 deletions)
@@ -15,6 +15,7 @@
from rls.algorithms.base.policy import Policy
from rls.common.yaml_ops import load_config
from rls.common.specs import BatchExperiences
+from rls.common.decorator import iTensor_oNumpy


class Off_Policy(Policy):
@@ -24,8 +25,8 @@ def __init__(self,
n_step=1,
use_priority=False,
use_isw=False,
-burn_in_time_step=10,
-train_time_step=10,
+burn_in_time_steps=10,
+rnn_time_steps=10,
episode_batch_size=32,
episode_buffer_size=10000,
train_times_per_step=1,
@@ -35,8 +36,8 @@
self.n_step = n_step
self.use_priority = use_priority
self.use_isw = use_isw
-self.burn_in_time_step = burn_in_time_step
-self.train_time_step = train_time_step
+self.burn_in_time_steps = burn_in_time_steps
+self.rnn_time_steps = rnn_time_steps
self.episode_batch_size = episode_batch_size
self.episode_buffer_size = episode_buffer_size
self.train_times_per_step = train_times_per_step
@@ -51,8 +52,8 @@ def initialize_data_buffer(self) -> NoReturn:
_buffer_args.update(
batch_size=self.episode_batch_size,
capacity=self.episode_buffer_size,
-burn_in_time_step=self.burn_in_time_step,
-train_time_step=self.train_time_step,
+burn_in_time_steps=self.burn_in_time_steps,
+rnn_time_steps=self.rnn_time_steps,
n_copys=self.n_copys
)
else:
@@ -134,17 +135,18 @@ def _learn(self, function_dict: Dict = {}) -> NoReturn:
if self.data.can_sample:
self.intermediate_variable_reset()
data = self.get_transitions()
-cell_states = {}
+cell_states = {'obs': self.initial_cell_state(batch=self.episode_batch_size),
+'obs_': self.initial_cell_state(batch=self.episode_batch_size)}

# -------------------------------------- burn-in hidden state section
-cell_states['obs'] = self.initial_cell_state(batch=self.episode_batch_size)
-cell_states['obs_'] = self.initial_cell_state(batch=self.episode_batch_size)
-if self.use_rnn and self.burn_in_time_step > 0:
+if self.use_rnn and self.burn_in_time_steps > 0:
_burn_in_data = self.get_burn_in_transitions()
-_, cell_states['obs'] = self.rep_net(obs=_burn_in_data.obs,
-cell_state=cell_states['obs'])
-_, cell_states['obs_'] = self.rep_net(obs=_burn_in_data.obs_,
-cell_state=cell_states['obs_'])
+_, cell_states['obs'] = iTensor_oNumpy(self.rep_net)(obs=_burn_in_data.obs,
+cell_state=cell_states['obs'],
+device=self.device)
+_, cell_states['obs_'] = iTensor_oNumpy(self.rep_net)(obs=_burn_in_data.obs_,
+cell_state=cell_states['obs_'],
+device=self.device)
# --------------------------------------

# -------------------------------------- curiosity section
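For context on the `_learn` hunk above: with an RNN, each sampled window of length `burn_in_time_steps + rnn_time_steps` is split so that the leading steps only rebuild the recurrent state and the remaining steps are what the loss actually sees, in the spirit of R2D2-style burn-in. Below is a minimal, self-contained sketch of that idea with made-up shapes and a plain GRU standing in for the repo's `rep_net`; whether gradients flow through the burn-in segment is a repo detail not shown on this page, so the sketch simply blocks them with `no_grad`.

```python
import torch as t

# Hypothetical shapes: a batch of sampled sub-sequences of length
# burn_in_time_steps + rnn_time_steps.
burn_in_time_steps, rnn_time_steps = 10, 10
batch_size, obs_dim, hidden_dim = 32, 8, 64

rnn = t.nn.GRU(obs_dim, hidden_dim, batch_first=True)
obs = t.randn(batch_size, burn_in_time_steps + rnn_time_steps, obs_dim)

burn_in_obs = obs[:, :burn_in_time_steps]   # only warms up the hidden state
train_obs = obs[:, burn_in_time_steps:]     # only this part feeds the loss

with t.no_grad():                           # burn-in: recover the state, no gradient
    _, h = rnn(burn_in_obs, t.zeros(1, batch_size, hidden_dim))

feat, _ = rnn(train_obs, h)                 # train on the remaining rnn_time_steps
loss = feat.pow(2).mean()                   # placeholder loss
loss.backward()
```

In the hunk itself the burn-in forward pass goes through `iTensor_oNumpy(self.rep_net)` because `get_burn_in_transitions()` returns numpy batches, and the resulting cell states seed both the `obs` and `obs_` branches.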
rls/algorithms/base/on_policy.py (11 changes: 6 additions & 5 deletions)
@@ -18,14 +18,14 @@
class On_Policy(Policy):
def __init__(self,
envspec,
-rnn_time_step=8,
+rnn_time_steps=8,
**kwargs):
super().__init__(envspec=envspec, **kwargs)
-self.rnn_time_step = rnn_time_step
+self.rnn_time_steps = rnn_time_steps

def initialize_data_buffer(self, store_data_type=BatchExperiences, sample_data_type=BatchExperiences) -> NoReturn:
-self.data = DataBuffer(n_copys=self.n_copys, rnn_cell_nums=self.rnn_cell_nums,
-batch_size=self.batch_size, rnn_time_step=self.rnn_time_step,
+self.data = DataBuffer(n_copys=self.n_copys,
+batch_size=self.batch_size, rnn_time_steps=self.rnn_time_steps,
store_data_type=store_data_type, sample_data_type=sample_data_type)

def store_data(self, exps: BatchExperiences) -> NoReturn:
@@ -58,7 +58,8 @@ def _learn(self, function_dict: Dict) -> NoReturn:

if self.use_curiosity and not self.use_rnn:
curiosity_data = self.data.get_curiosity_data()
-cell_states['obs'] = cell_states['obs_'] = self.initial_cell_state(batch=self.n_copys)
+cell_states = {'obs': self.initial_cell_state(batch=self.n_copys),
+'obs_': self.initial_cell_state(batch=self.n_copys)}
crsty_r, crsty_summaries = self.curiosity_model(curiosity_data, cell_states)
self.data.update_reward(crsty_r.numpy())
# self.data.r += crsty_r.numpy().reshape([self.data.eps_len, -1])
rls/algorithms/base/policy.py (7 changes: 0 additions & 7 deletions)
@@ -95,13 +95,6 @@ def reset(self) -> NoReturn:
'''reset model for each new episode.'''
self.cell_state = self.next_cell_state = self.initial_cell_state(batch=self.n_copys)

-@property
-def rnn_cell_nums(self):
-if self.use_rnn:
-return self.rep_net.memory_net.cell_nums
-else:
-return 0
-
def initial_cell_state(self, batch: int) -> Tuple[t.Tensor]:
if self.use_rnn:
return self.rep_net.memory_net.initial_cell_state(batch=batch)
rls/algorithms/hierarchical/aoc.py (39 changes: 18 additions & 21 deletions)
@@ -13,7 +13,6 @@
BatchExperiences)
from rls.nn.models import AocShare
from rls.nn.utils import OPLR
-from rls.utils.converter import to_numpy
from rls.common.decorator import iTensor_oNumpy


@@ -81,7 +80,7 @@ def __init__(self,
self.kl_reverse = kl_reverse
self.kl_target = kl_target
self.kl_alpha = kl_alpha
-self.kl_coef = t.tensor(kl_coef).float()
+self.kl_coef = kl_coef

self.kl_cutoff = kl_target * kl_target_cutoff
self.kl_stop = kl_target * kl_target_earlystop
@@ -110,8 +109,8 @@ def __init__(self,

self.initialize_data_buffer(store_data_type=AOC_Store_BatchExperiences,
sample_data_type=AOC_Train_BatchExperiences)
-self.oc_mask = t.tensor(np.zeros(self.n_copys)).int()
-self.options = t.tensor(np.random.randint(0, self.options_num, self.n_copys)).int()
+self.oc_mask = np.zeros(self.n_copys)
+self.options = np.random.randint(0, self.options_num, self.n_copys)

def reset(self):
super().reset()
@@ -121,18 +120,23 @@ def partial_reset(self, done):
super().partial_reset(done)
self._done_mask = done

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self.last_options = self.options
+rets = self.call(obs, cell_state=self.cell_state, options=self.options, _done_mask=self._done_mask)
+action, self.next_cell_state, self._value, self._log_prob, self._beta_adv, self.oc_mask, self.options = rets
+self._done_mask = np.full(self.n_copys, False)
+return action

-feat, self.next_cell_state = self.rep_net(obs, cell_state=self.cell_state) # [B, P], [B, P, A], [B, P], [B, P]
+@iTensor_oNumpy
+def call(self, obs, cell_state, options, _done_mask):
+feat, cell_state = self.rep_net(obs, cell_state=cell_state) # [B, P], [B, P, A], [B, P], [B, P]
(q, pi, beta) = self.net(feat)
-options_onehot = t.nn.functional.one_hot(self.options, self.options_num).float() # [B, P]
+options_onehot = t.nn.functional.one_hot(options, self.options_num).float() # [B, P]
options_onehot_expanded = options_onehot.unsqueeze(-1) # [B, P, 1]
pi = (pi * options_onehot_expanded).sum(1) # [B, A]
if self.is_continuous:
mu = pi
-log_std = self.log_std[self.options]
+log_std = self.log_std[options]
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
sample_op = dist.sample().clamp(-1, 1)
log_prob = dist.log_prob(sample_op).unsqueeze(-1)
@@ -146,16 +150,9 @@ def __call__(self, obs, evaluation=False):
max_options = q.argmax(-1) # [B, P] => [B, ]
beta_probs = (beta * options_onehot).sum(1) # [B, P] => [B,]
beta_dist = td.Bernoulli(probs=beta_probs)
-new_options = t.where(beta_dist.sample() < 1, self.options, max_options) # <1: keep the current option, =1: switch the option
-
-new_options = t.where(self._done_mask, max_options, new_options)
-self._done_mask = np.full(self.n_copys, False)
-self._value = to_numpy(value)
-self._log_prob = to_numpy(log_prob) + np.finfo(np.float32).eps
-self._beta_adv = to_numpy(beta_adv) + self.dc
-self.oc_mask = to_numpy(new_options == self.options) # equal means no change
-self.options = to_numpy(new_options)
-return sample_op
+new_options = t.where(beta_dist.sample() < 1, options, max_options) # <1: keep the current option, =1: switch the option
+new_options = t.where(_done_mask, max_options, new_options)
+return sample_op, cell_state, value, log_prob+t.finfo().eps, beta_adv+self.dc, new_options == options, new_options

def store_data(self, exps: BatchExperiences):
# self._running_average()
@@ -188,7 +185,7 @@ def learn(self, **kwargs):
def _train(data, cell_state):
early_step = 0
for i in range(self.epoch):
-loss, pi_loss, q_loss, beta_loss, entropy, kl = self.train(data, cell_state, self.kl_coef)
+loss, pi_loss, q_loss, beta_loss, entropy, kl = self.train(data, cell_state)
if kl > self.kl_stop:
early_step = i
break
@@ -219,7 +216,7 @@ def _train(data, cell_state):
})

@iTensor_oNumpy
-def train(self, BATCH, cell_states, kl_coef):
+def train(self, BATCH, cell_states):
last_options = BATCH.last_options # [B,]
options = BATCH.options
feat, _ = self.rep_net(BATCH.obs, cell_state=cell_states['obs']) # [B, P], [B, P, A], [B, P], [B, P]
@@ -259,7 +256,7 @@ def train(self, BATCH, cell_states, kl_coef):
surrogate,
ratio.clamp(1.0 - self.epsilon, 1.0 + self.epsilon) * BATCH.gae_adv
).mean()
-kl_loss = kl_coef * kl
+kl_loss = self.kl_coef * kl
extra_loss = 1000.0 * t.maximum(t.zeros_like(kl), kl - self.kl_cutoff).square()
pi_loss = pi_loss + kl_loss + extra_loss
q_loss = 0.5 * td_square.mean()
rls/algorithms/hierarchical/hiro.py (8 changes: 6 additions & 2 deletions)
@@ -233,9 +233,13 @@ def partial_reset(self, done):
self._done[i] = []
self._subgoals[i] = []

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self._subgoal = np.where(self._c == self.sub_goal_steps, self.get_subgoal(obs.flatten_vector()).numpy(), self._new_subgoal)
+mu, pi = self.call(obs)
+return mu if evaluation else pi
+
+@iTensor_oNumpy
+def call(self, obs):
feat = t.cat([obs.flatten_vector(), self._subgoal], -1)
output = self.low_actor(feat)
if self.is_continuous:
@@ -246,7 +250,7 @@ def __call__(self, obs, evaluation=False):
mu = logits.argmax(1)
cate_dist = td.Categorical(logits=logits)
pi = cate_dist.sample()
-return mu if evaluation else pi
+return mu, pi

def get_subgoal(self, s):
'''
rls/algorithms/hierarchical/ioc.py (17 changes: 10 additions & 7 deletions)
@@ -106,21 +106,24 @@ def __init__(self,
interest_oplr=self.interest_oplr)
self.initialize_data_buffer()

-self.options = t.tensor(np.random.randint(0, self.options_num, self.n_copys)).int()
+self.options = np.random.randint(0, self.options_num, self.n_copys)

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self.last_options = self.options
+action, self.cell_state, self.options = self.call(obs, cell_state=self.cell_state, options=self.options)
+return action

-feat, self.cell_state = self.rep_net(obs, cell_state=self.cell_state)
+@iTensor_oNumpy
+def call(self, obs, cell_state, options):
+feat, cell_state = self.rep_net(obs, cell_state=cell_state)
q = self.q_net(feat) # [B, P]
pi = self.intra_option_net(feat) # [B, P, A]
-options_onehot = t.nn.functional.one_hot(self.options, self.options_num).float() # [B, P]
+options_onehot = t.nn.functional.one_hot(options, self.options_num).float() # [B, P]
options_onehot_expanded = options_onehot.unsqueeze(-1) # [B, P, 1]
pi = (pi * options_onehot_expanded).sum(1) # [B, A]
if self.is_continuous:
mu = pi.tanh()
-log_std = self.log_std[self.options]
+log_std = self.log_std[options]
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
a = dist.sample().clamp(-1, 1)
else:
@@ -129,8 +132,8 @@
a = dist.sample()
interests = self.interest_net(feat) # [B, P]
op_logits = interests * q # [B, P] or q.softmax(-1)
-self.options = td.Categorical(logits=op_logits).sample()
-return a
+new_options = td.Categorical(logits=op_logits).sample()
+return a, cell_state, new_options

def _target_params_update(self):
if self.global_step % self.assign_interval == 0:
rls/algorithms/hierarchical/oc.py (28 changes: 15 additions & 13 deletions)
@@ -16,7 +16,6 @@
from rls.nn.models import (OcIntraOption,
CriticQvalueAll)
from rls.nn.utils import OPLR
-from rls.utils.converter import to_numpy
from rls.common.decorator import iTensor_oNumpy


@@ -114,22 +113,30 @@ def __init__(self,
self.options = self._generate_random_options()

def _generate_random_options(self):
-return t.tensor(np.random.randint(0, self.options_num, self.n_copys)).int()
+return np.random.randint(0, self.options_num, self.n_copys)

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self.last_options = self.options

-feat, self.cell_state = self.rep_net(obs, cell_state=self.cell_state)
+action, self.cell_state, self.options = self.call(obs, cell_state=self.cell_state, options=self.options)
+
+if self.use_eps_greedy:
+if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): # epsilon greedy
+self.options = self._generate_random_options()
+return action
+
+@iTensor_oNumpy
+def call(self, obs, cell_state, options):
+feat, cell_state = self.rep_net(obs, cell_state=cell_state)
q = self.q_net(feat) # [B, P]
pi = self.intra_option_net(feat) # [B, P, A]
beta = self.termination_net(feat) # [B, P]
-options_onehot = t.nn.functional.one_hot(self.options, self.options_num).float() # [B, P]
+options_onehot = t.nn.functional.one_hot(options, self.options_num).float() # [B, P]
options_onehot_expanded = options_onehot.unsqueeze(-1) # [B, P, 1]
pi = (pi * options_onehot_expanded).sum(1) # [B, A]
if self.is_continuous:
mu = pi.tanh()
-log_std = self.log_std[self.options]
+log_std = self.log_std[options]
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
a = dist.sample().clamp(-1, 1)
else:
@@ -142,13 +149,8 @@ def __call__(self, obs, evaluation=False):
else:
beta_probs = (beta * options_onehot).sum(1) # [B, P] => [B,]
beta_dist = td.Bernoulli(probs=beta_probs)
-new_options = t.where(beta_dist.sample() < 1, self.options, max_options)
-self.options = to_numpy(new_options)
-
-if self.use_eps_greedy:
-if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): # epsilon greedy
-self.options = self._generate_random_options()
-return a
+new_options = t.where(beta_dist.sample() < 1, options, max_options)
+return a, cell_state, new_options

def _target_params_update(self):
if self.global_step % self.assign_interval == 0:
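Since `aoc.py` and `oc.py` now return `new_options` from `call` instead of mutating `self.options` inside it, here is a small standalone illustration (random values, made-up shapes) of the option-switching rule those two files share: the current option's termination probability is sampled as a Bernoulli, and only when it fires does the agent jump to the greedy option under `q`.

```python
import torch as t
import torch.distributions as td

batch, options_num = 4, 3
q = t.randn(batch, options_num)                # option values        [B, P]
beta = t.rand(batch, options_num)              # termination probs    [B, P]
options = t.randint(0, options_num, (batch,))  # currently active ops [B,]

options_onehot = t.nn.functional.one_hot(options, options_num).float()
beta_probs = (beta * options_onehot).sum(-1)   # beta of the active option [B,]
max_options = q.argmax(-1)                     # greedy option             [B,]

terminate = td.Bernoulli(probs=beta_probs).sample()         # 1 => switch, 0 => keep
new_options = t.where(terminate < 1, options, max_options)
print(options.tolist(), '->', new_options.tolist())
```

`oc.py` then layers epsilon-greedy exploration on top of this in `__call__`, resampling `self.options` at random after `call` returns, while `ioc.py` picks options from `td.Categorical(logits=interests * q)` instead.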
(Remaining 42 file diffs not loaded on this page.)
