v4.1.3 fix: fixed off-policy RNN training. (#43, #34)
1. optimized `iTensor_oNumpy`
2. renamed `train_time_step` to `rnn_time_steps` and `burn_in_time_step` to `burn_in_time_steps`
3. optimized `on_policy_buffer.py`
4. optimized `EpisodeExperienceReplay`
5. fixed off-policy RNN training
6. optimized and fixed `to_numpy` and `to_tensor`
7. reimplemented `call` and invoked it from `__call__` (see the sketch below the file summary)
StepNeverStop committed Jul 29, 2021
1 parent f95a510 commit 964f78b
Showing 53 changed files with 422 additions and 334 deletions.
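The recurring pattern in the diffs below (commit items 6 and 7) is that each algorithm's `__call__` now keeps only the numpy-side bookkeeping and delegates the tensor computation to a `call` method wrapped by `iTensor_oNumpy`; in `off_policy.py` the same wrapper is also applied inline to `self.rep_net`. The sketch below is a hypothetical stand-in for that pattern only: the decorator body and the `TinyPolicy` class are illustrative assumptions, not the repo's actual `rls/common/decorator.py`, which (per the hunks below) additionally accepts a `device` argument and understands the library's batch dataclasses.

```python
import numpy as np
import torch as t


def iTensor_oNumpy(func):
    # Hypothetical re-implementation, for illustration only:
    # ndarray inputs -> tensors, run the wrapped callable, tensor outputs -> ndarrays.
    def wrapper(*args, **kwargs):
        args = [t.as_tensor(a) if isinstance(a, np.ndarray) else a for a in args]
        kwargs = {k: t.as_tensor(v) if isinstance(v, np.ndarray) else v
                  for k, v in kwargs.items()}
        rets = func(*args, **kwargs)
        if isinstance(rets, tuple):
            return tuple(r.detach().cpu().numpy() if t.is_tensor(r) else r for r in rets)
        return rets.detach().cpu().numpy() if t.is_tensor(rets) else rets
    return wrapper


class TinyPolicy:
    # __call__ keeps numpy-side state; the decorated call() is tensor-only.
    def __call__(self, obs):
        return self.call(obs)        # ndarray in, ndarray out

    @iTensor_oNumpy
    def call(self, obs):
        return (obs * 2.0).tanh()    # pure tensor computation


print(TinyPolicy()(np.ones((2, 3), dtype=np.float32)))
```

Keeping the numpy/tensor boundary inside one decorator is what lets `__call__` store plain numpy state such as `self.options` and `self.oc_mask` while `call` stays a pure tensor function.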
rls/_metadata.py (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@
# We follow Semantic Versioning (https://semver.org/)
_MAJOR_VERSION = '4'
_MINOR_VERSION = '1'
-_PATCH_VERSION = '2'
+_PATCH_VERSION = '3'

# Example: '0.4.2'
__version__ = '.'.join([_MAJOR_VERSION, _MINOR_VERSION, _PATCH_VERSION])
rls/algorithms/base/base.py (2 changes: 1 addition & 1 deletion)
@@ -112,7 +112,7 @@ def write_summaries(self,
summaries: Dict = {},
writer: Optional[SummaryWriter] = None) -> NoReturn:
'''
-write tf summaries showing in tensorboard.
+write summaries showing in tensorboard.
'''
if not self.no_save:
writer = writer or self.writer
rls/algorithms/base/ma_off_policy.py (12 changes: 6 additions & 6 deletions)
@@ -22,8 +22,8 @@ def __init__(self,
envspecs,
buffer_size=10000,
n_step=1,
-burn_in_time_step=10,
-train_time_step=10,
+burn_in_time_steps=10,
+rnn_time_steps=10,
episode_batch_size=32,
episode_buffer_size=10000,
train_times_per_step=1,
@@ -32,8 +32,8 @@

self.buffer_size = buffer_size
self.n_step = n_step
-self.burn_in_time_step = burn_in_time_step
-self.train_time_step = train_time_step
+self.burn_in_time_steps = burn_in_time_steps
+self.rnn_time_steps = rnn_time_steps
self.episode_batch_size = episode_batch_size
self.episode_buffer_size = episode_buffer_size
self.train_times_per_step = train_times_per_step
@@ -48,8 +48,8 @@ def initialize_data_buffer(self) -> NoReturn:
# _buffer_args.update(
# batch_size=self.episode_batch_size,
# capacity=self.episode_buffer_size,
-# burn_in_time_step=self.burn_in_time_step,
-# train_time_step=self.train_time_step,
+# burn_in_time_steps=self.burn_in_time_steps,
+# rnn_time_steps=self.rnn_time_steps,
# n_copys=self.n_copys
# )
# else:
rls/algorithms/base/ma_policy.py (2 changes: 1 addition & 1 deletion)
@@ -100,7 +100,7 @@ def write_summaries(self,
summaries: Dict,
writer=None) -> NoReturn:
'''
-write tf summaries showing in tensorboard.
+write summaries showing in tensorboard.
'''
if 'model' in summaries.keys():
super().write_summaries(global_step, summaries=summaries.pop('model'), writer=self.writer)
rls/algorithms/base/off_policy.py (30 changes: 16 additions & 14 deletions)
@@ -15,6 +15,7 @@
from rls.algorithms.base.policy import Policy
from rls.common.yaml_ops import load_config
from rls.common.specs import BatchExperiences
+from rls.common.decorator import iTensor_oNumpy


class Off_Policy(Policy):
@@ -24,8 +25,8 @@ def __init__(self,
n_step=1,
use_priority=False,
use_isw=False,
-burn_in_time_step=10,
-train_time_step=10,
+burn_in_time_steps=10,
+rnn_time_steps=10,
episode_batch_size=32,
episode_buffer_size=10000,
train_times_per_step=1,
@@ -35,8 +36,8 @@
self.n_step = n_step
self.use_priority = use_priority
self.use_isw = use_isw
-self.burn_in_time_step = burn_in_time_step
-self.train_time_step = train_time_step
+self.burn_in_time_steps = burn_in_time_steps
+self.rnn_time_steps = rnn_time_steps
self.episode_batch_size = episode_batch_size
self.episode_buffer_size = episode_buffer_size
self.train_times_per_step = train_times_per_step
@@ -51,8 +52,8 @@ def initialize_data_buffer(self) -> NoReturn:
_buffer_args.update(
batch_size=self.episode_batch_size,
capacity=self.episode_buffer_size,
-burn_in_time_step=self.burn_in_time_step,
-train_time_step=self.train_time_step,
+burn_in_time_steps=self.burn_in_time_steps,
+rnn_time_steps=self.rnn_time_steps,
n_copys=self.n_copys
)
else:
@@ -134,17 +135,18 @@ def _learn(self, function_dict: Dict = {}) -> NoReturn:
if self.data.can_sample:
self.intermediate_variable_reset()
data = self.get_transitions()
-cell_states = {}
+cell_states = {'obs': self.initial_cell_state(batch=self.episode_batch_size),
+'obs_': self.initial_cell_state(batch=self.episode_batch_size)}

# -------------------------------------- burn-in hidden state section
-cell_states['obs'] = self.initial_cell_state(batch=self.episode_batch_size)
-cell_states['obs_'] = self.initial_cell_state(batch=self.episode_batch_size)
-if self.use_rnn and self.burn_in_time_step > 0:
+if self.use_rnn and self.burn_in_time_steps > 0:
_burn_in_data = self.get_burn_in_transitions()
-_, cell_states['obs'] = self.rep_net(obs=_burn_in_data.obs,
-cell_state=cell_states['obs'])
-_, cell_states['obs_'] = self.rep_net(obs=_burn_in_data.obs_,
-cell_state=cell_states['obs_'])
+_, cell_states['obs'] = iTensor_oNumpy(self.rep_net)(obs=_burn_in_data.obs,
+cell_state=cell_states['obs'],
+device=self.device)
+_, cell_states['obs_'] = iTensor_oNumpy(self.rep_net)(obs=_burn_in_data.obs_,
+cell_state=cell_states['obs_'],
+device=self.device)
# --------------------------------------

# -------------------------------------- curiosity section
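For context on the `_learn` hunk above: with an RNN, each sampled window of length `burn_in_time_steps + rnn_time_steps` is split so that the leading steps only rebuild the recurrent state and the remaining steps are what the loss actually sees, in the spirit of R2D2-style burn-in. Below is a minimal, self-contained sketch of that idea with made-up shapes and a plain GRU standing in for the repo's `rep_net`; whether gradients flow through the burn-in segment is a repo detail not shown on this page, so the sketch simply blocks them with `no_grad`.

```python
import torch as t

# Hypothetical shapes: a batch of sampled sub-sequences of length
# burn_in_time_steps + rnn_time_steps.
burn_in_time_steps, rnn_time_steps = 10, 10
batch_size, obs_dim, hidden_dim = 32, 8, 64

rnn = t.nn.GRU(obs_dim, hidden_dim, batch_first=True)
obs = t.randn(batch_size, burn_in_time_steps + rnn_time_steps, obs_dim)

burn_in_obs = obs[:, :burn_in_time_steps]   # only warms up the hidden state
train_obs = obs[:, burn_in_time_steps:]     # only this part feeds the loss

with t.no_grad():                           # burn-in: recover the state, no gradient
    _, h = rnn(burn_in_obs, t.zeros(1, batch_size, hidden_dim))

feat, _ = rnn(train_obs, h)                 # train on the remaining rnn_time_steps
loss = feat.pow(2).mean()                   # placeholder loss
loss.backward()
```

In the hunk itself the burn-in forward pass goes through `iTensor_oNumpy(self.rep_net)` because `get_burn_in_transitions()` returns numpy batches, and the resulting cell states seed both the `obs` and `obs_` branches.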
rls/algorithms/base/on_policy.py (11 changes: 6 additions & 5 deletions)
@@ -18,14 +18,14 @@
class On_Policy(Policy):
def __init__(self,
envspec,
-rnn_time_step=8,
+rnn_time_steps=8,
**kwargs):
super().__init__(envspec=envspec, **kwargs)
-self.rnn_time_step = rnn_time_step
+self.rnn_time_steps = rnn_time_steps

def initialize_data_buffer(self, store_data_type=BatchExperiences, sample_data_type=BatchExperiences) -> NoReturn:
-self.data = DataBuffer(n_copys=self.n_copys, rnn_cell_nums=self.rnn_cell_nums,
-batch_size=self.batch_size, rnn_time_step=self.rnn_time_step,
+self.data = DataBuffer(n_copys=self.n_copys,
+batch_size=self.batch_size, rnn_time_steps=self.rnn_time_steps,
store_data_type=store_data_type, sample_data_type=sample_data_type)

def store_data(self, exps: BatchExperiences) -> NoReturn:
@@ -58,7 +58,8 @@ def _learn(self, function_dict: Dict) -> NoReturn:

if self.use_curiosity and not self.use_rnn:
curiosity_data = self.data.get_curiosity_data()
-cell_states['obs'] = cell_states['obs_'] = self.initial_cell_state(batch=self.n_copys)
+cell_states = {'obs': self.initial_cell_state(batch=self.n_copys),
+'obs_': self.initial_cell_state(batch=self.n_copys)}
crsty_r, crsty_summaries = self.curiosity_model(curiosity_data, cell_states)
self.data.update_reward(crsty_r.numpy())
# self.data.r += crsty_r.numpy().reshape([self.data.eps_len, -1])
rls/algorithms/base/policy.py (7 changes: 0 additions & 7 deletions)
@@ -95,13 +95,6 @@ def reset(self) -> NoReturn:
'''reset model for each new episode.'''
self.cell_state = self.next_cell_state = self.initial_cell_state(batch=self.n_copys)

-@property
-def rnn_cell_nums(self):
-if self.use_rnn:
-return self.rep_net.memory_net.cell_nums
-else:
-return 0
-
def initial_cell_state(self, batch: int) -> Tuple[t.Tensor]:
if self.use_rnn:
return self.rep_net.memory_net.initial_cell_state(batch=batch)
rls/algorithms/hierarchical/aoc.py (39 changes: 18 additions & 21 deletions)
@@ -13,7 +13,6 @@
BatchExperiences)
from rls.nn.models import AocShare
from rls.nn.utils import OPLR
-from rls.utils.converter import to_numpy
from rls.common.decorator import iTensor_oNumpy


@@ -81,7 +80,7 @@ def __init__(self,
self.kl_reverse = kl_reverse
self.kl_target = kl_target
self.kl_alpha = kl_alpha
-self.kl_coef = t.tensor(kl_coef).float()
+self.kl_coef = kl_coef

self.kl_cutoff = kl_target * kl_target_cutoff
self.kl_stop = kl_target * kl_target_earlystop
@@ -110,8 +109,8 @@ def __init__(self,

self.initialize_data_buffer(store_data_type=AOC_Store_BatchExperiences,
sample_data_type=AOC_Train_BatchExperiences)
-self.oc_mask = t.tensor(np.zeros(self.n_copys)).int()
-self.options = t.tensor(np.random.randint(0, self.options_num, self.n_copys)).int()
+self.oc_mask = np.zeros(self.n_copys)
+self.options = np.random.randint(0, self.options_num, self.n_copys)

def reset(self):
super().reset()
@@ -121,18 +120,23 @@ def partial_reset(self, done):
super().partial_reset(done)
self._done_mask = done

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self.last_options = self.options
+rets = self.call(obs, cell_state=self.cell_state, options=self.options, _done_mask=self._done_mask)
+action, self.next_cell_state, self._value, self._log_prob, self._beta_adv, self.oc_mask, self.options = rets
+self._done_mask = np.full(self.n_copys, False)
+return action

-feat, self.next_cell_state = self.rep_net(obs, cell_state=self.cell_state) # [B, P], [B, P, A], [B, P], [B, P]
+@iTensor_oNumpy
+def call(self, obs, cell_state, options, _done_mask):
+feat, cell_state = self.rep_net(obs, cell_state=cell_state) # [B, P], [B, P, A], [B, P], [B, P]
(q, pi, beta) = self.net(feat)
-options_onehot = t.nn.functional.one_hot(self.options, self.options_num).float() # [B, P]
+options_onehot = t.nn.functional.one_hot(options, self.options_num).float() # [B, P]
options_onehot_expanded = options_onehot.unsqueeze(-1) # [B, P, 1]
pi = (pi * options_onehot_expanded).sum(1) # [B, A]
if self.is_continuous:
mu = pi
-log_std = self.log_std[self.options]
+log_std = self.log_std[options]
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
sample_op = dist.sample().clamp(-1, 1)
log_prob = dist.log_prob(sample_op).unsqueeze(-1)
@@ -146,16 +150,9 @@ def __call__(self, obs, evaluation=False):
max_options = q.argmax(-1) # [B, P] => [B, ]
beta_probs = (beta * options_onehot).sum(1) # [B, P] => [B,]
beta_dist = td.Bernoulli(probs=beta_probs)
-new_options = t.where(beta_dist.sample() < 1, self.options, max_options) # <1: keep the current option, =1: switch the option
-
-new_options = t.where(self._done_mask, max_options, new_options)
-self._done_mask = np.full(self.n_copys, False)
-self._value = to_numpy(value)
-self._log_prob = to_numpy(log_prob) + np.finfo(np.float32).eps
-self._beta_adv = to_numpy(beta_adv) + self.dc
-self.oc_mask = to_numpy(new_options == self.options) # equal means no change
-self.options = to_numpy(new_options)
-return sample_op
+new_options = t.where(beta_dist.sample() < 1, options, max_options) # <1: keep the current option, =1: switch the option
+new_options = t.where(_done_mask, max_options, new_options)
+return sample_op, cell_state, value, log_prob+t.finfo().eps, beta_adv+self.dc, new_options == options, new_options

def store_data(self, exps: BatchExperiences):
# self._running_average()
@@ -188,7 +185,7 @@ def learn(self, **kwargs):
def _train(data, cell_state):
early_step = 0
for i in range(self.epoch):
-loss, pi_loss, q_loss, beta_loss, entropy, kl = self.train(data, cell_state, self.kl_coef)
+loss, pi_loss, q_loss, beta_loss, entropy, kl = self.train(data, cell_state)
if kl > self.kl_stop:
early_step = i
break
@@ -219,7 +216,7 @@ def _train(data, cell_state):
})

@iTensor_oNumpy
-def train(self, BATCH, cell_states, kl_coef):
+def train(self, BATCH, cell_states):
last_options = BATCH.last_options # [B,]
options = BATCH.options
feat, _ = self.rep_net(BATCH.obs, cell_state=cell_states['obs']) # [B, P], [B, P, A], [B, P], [B, P]
@@ -259,7 +256,7 @@ def train(self, BATCH, cell_states, kl_coef):
surrogate,
ratio.clamp(1.0 - self.epsilon, 1.0 + self.epsilon) * BATCH.gae_adv
).mean()
-kl_loss = kl_coef * kl
+kl_loss = self.kl_coef * kl
extra_loss = 1000.0 * t.maximum(t.zeros_like(kl), kl - self.kl_cutoff).square()
pi_loss = pi_loss + kl_loss + extra_loss
q_loss = 0.5 * td_square.mean()
rls/algorithms/hierarchical/hiro.py (8 changes: 6 additions & 2 deletions)
@@ -233,9 +233,13 @@ def partial_reset(self, done):
self._done[i] = []
self._subgoals[i] = []

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self._subgoal = np.where(self._c == self.sub_goal_steps, self.get_subgoal(obs.flatten_vector()).numpy(), self._new_subgoal)
+mu, pi = self.call(obs)
+return mu if evaluation else pi
+
+@iTensor_oNumpy
+def call(self, obs):
feat = t.cat([obs.flatten_vector(), self._subgoal], -1)
output = self.low_actor(feat)
if self.is_continuous:
@@ -246,7 +250,7 @@ def __call__(self, obs, evaluation=False):
mu = logits.argmax(1)
cate_dist = td.Categorical(logits=logits)
pi = cate_dist.sample()
-return mu if evaluation else pi
+return mu, pi

def get_subgoal(self, s):
'''
rls/algorithms/hierarchical/ioc.py (17 changes: 10 additions & 7 deletions)
@@ -106,21 +106,24 @@ def __init__(self,
interest_oplr=self.interest_oplr)
self.initialize_data_buffer()

-self.options = t.tensor(np.random.randint(0, self.options_num, self.n_copys)).int()
+self.options = np.random.randint(0, self.options_num, self.n_copys)

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self.last_options = self.options
+action, self.cell_state, self.options = self.call(obs, cell_state=self.cell_state, options=self.options)
+return action

-feat, self.cell_state = self.rep_net(obs, cell_state=self.cell_state)
+@iTensor_oNumpy
+def call(self, obs, cell_state, options):
+feat, cell_state = self.rep_net(obs, cell_state=cell_state)
q = self.q_net(feat) # [B, P]
pi = self.intra_option_net(feat) # [B, P, A]
-options_onehot = t.nn.functional.one_hot(self.options, self.options_num).float() # [B, P]
+options_onehot = t.nn.functional.one_hot(options, self.options_num).float() # [B, P]
options_onehot_expanded = options_onehot.unsqueeze(-1) # [B, P, 1]
pi = (pi * options_onehot_expanded).sum(1) # [B, A]
if self.is_continuous:
mu = pi.tanh()
-log_std = self.log_std[self.options]
+log_std = self.log_std[options]
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
a = dist.sample().clamp(-1, 1)
else:
@@ -129,8 +132,8 @@
a = dist.sample()
interests = self.interest_net(feat) # [B, P]
op_logits = interests * q # [B, P] or q.softmax(-1)
-self.options = td.Categorical(logits=op_logits).sample()
-return a
+new_options = td.Categorical(logits=op_logits).sample()
+return a, cell_state, new_options

def _target_params_update(self):
if self.global_step % self.assign_interval == 0:
rls/algorithms/hierarchical/oc.py (28 changes: 15 additions & 13 deletions)
@@ -16,7 +16,6 @@
from rls.nn.models import (OcIntraOption,
CriticQvalueAll)
from rls.nn.utils import OPLR
-from rls.utils.converter import to_numpy
from rls.common.decorator import iTensor_oNumpy


@@ -114,22 +113,30 @@ def __init__(self,
self.options = self._generate_random_options()

def _generate_random_options(self):
-return t.tensor(np.random.randint(0, self.options_num, self.n_copys)).int()
+return np.random.randint(0, self.options_num, self.n_copys)

-@iTensor_oNumpy
def __call__(self, obs, evaluation=False):
self.last_options = self.options

-feat, self.cell_state = self.rep_net(obs, cell_state=self.cell_state)
+action, self.cell_state, self.options = self.call(obs, cell_state=self.cell_state, options=self.options)
+
+if self.use_eps_greedy:
+if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): # epsilon greedy
+self.options = self._generate_random_options()
+return action
+
+@iTensor_oNumpy
+def call(self, obs, cell_state, options):
+feat, cell_state = self.rep_net(obs, cell_state=cell_state)
q = self.q_net(feat) # [B, P]
pi = self.intra_option_net(feat) # [B, P, A]
beta = self.termination_net(feat) # [B, P]
-options_onehot = t.nn.functional.one_hot(self.options, self.options_num).float() # [B, P]
+options_onehot = t.nn.functional.one_hot(options, self.options_num).float() # [B, P]
options_onehot_expanded = options_onehot.unsqueeze(-1) # [B, P, 1]
pi = (pi * options_onehot_expanded).sum(1) # [B, A]
if self.is_continuous:
mu = pi.tanh()
-log_std = self.log_std[self.options]
+log_std = self.log_std[options]
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
a = dist.sample().clamp(-1, 1)
else:
@@ -142,13 +149,8 @@ def __call__(self, obs, evaluation=False):
else:
beta_probs = (beta * options_onehot).sum(1) # [B, P] => [B,]
beta_dist = td.Bernoulli(probs=beta_probs)
-new_options = t.where(beta_dist.sample() < 1, self.options, max_options)
-self.options = to_numpy(new_options)
-
-if self.use_eps_greedy:
-if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): # epsilon greedy
-self.options = self._generate_random_options()
-return a
+new_options = t.where(beta_dist.sample() < 1, options, max_options)
+return a, cell_state, new_options

def _target_params_update(self):
if self.global_step % self.assign_interval == 0:
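Since `aoc.py` and `oc.py` now return `new_options` from `call` instead of mutating `self.options` inside it, here is a small standalone illustration (random values, made-up shapes) of the option-switching rule those two files share: the current option's termination probability is sampled as a Bernoulli, and only when it fires does the agent jump to the greedy option under `q`.

```python
import torch as t
import torch.distributions as td

batch, options_num = 4, 3
q = t.randn(batch, options_num)                # option values        [B, P]
beta = t.rand(batch, options_num)              # termination probs    [B, P]
options = t.randint(0, options_num, (batch,))  # currently active ops [B,]

options_onehot = t.nn.functional.one_hot(options, options_num).float()
beta_probs = (beta * options_onehot).sum(-1)   # beta of the active option [B,]
max_options = q.argmax(-1)                     # greedy option             [B,]

terminate = td.Bernoulli(probs=beta_probs).sample()         # 1 => switch, 0 => keep
new_options = t.where(terminate < 1, options, max_options)
print(options.tolist(), '->', new_options.tolist())
```

`oc.py` then layers epsilon-greedy exploration on top of this in `__call__`, resampling `self.options` at random after `call` returns, while `ioc.py` picks options from `td.Categorical(logits=interests * q)` instead.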
(Remaining 42 file diffs not loaded on this page.)
