
Commit

Merge pull request #22 from hill-a/fixing_prediction
Fixing prediction
araffin committed Sep 17, 2018
2 parents 0a3948f + f1f0971 commit 7d446cf
Showing 30 changed files with 433 additions and 334 deletions.
12 changes: 10 additions & 2 deletions docs/misc/changelog.rst
@@ -5,10 +5,10 @@ Changelog

For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.

Master version 1.0.8.rc1 (TO BE RELEASED SOON)
Master version 2.0.0.rc0 (TO BE RELEASED SOON)
-----------------------------------------------

**Tensorboard and bug fixes**
**Tensorboard, refactoring and bug fixes**

- added more documentation (some modules from common).
- added doc about using custom env
@@ -31,6 +31,14 @@ Master version 1.0.8.rc1 (TO BE RELEASED SOON)
- added pre-built docker images + installation instructions
- added ``deterministic`` argument in the predict function
- added assert in PPO2 for recurrent policies
- fixed predict function to handle both vectorized and unwrapped environment
- added input check to the predict function
- changed DeepQ to DQN **breaking changes**
- changed DeepQPolicy to DQNPolicy **breaking changes**
- refactored ActorCritic models to reduce code duplication
- refactored Off Policy models (to begin HER and replay_buffer refactoring)
- added tests for auto vectorization detection
- fixed render function, to handle positional arguments


Release 1.0.7 (2018-08-29)
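
The changelog entries on the predict function (handling both vectorized and unwrapped environments, plus the new input check) are the core of this PR. A minimal sketch of that idea, not the merged stable-baselines code: step_fn and observation_space are illustrative stand-ins supplied by the caller.

import numpy as np

def predict_with_shape_check(step_fn, observation_space, observation, deterministic=False):
    # Accept either a single observation (unwrapped env) or a batch (vectorized env).
    observation = np.asarray(observation)
    vectorized = observation.shape != observation_space.shape
    if vectorized:
        # Input check: every observation in the batch must match the observation space.
        assert observation.shape[1:] == observation_space.shape, \
            "Error: unexpected observation shape {}".format(observation.shape)
    # The policy always sees a batch, so add the batch dimension when it is missing.
    batch = observation.reshape((-1,) + observation_space.shape)
    actions = step_fn(batch, deterministic=deterministic)
    # Return a single action for a single observation, the full batch otherwise.
    return actions if vectorized else actions[0]
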
16 changes: 8 additions & 8 deletions docs/modules/dqn.rst
@@ -47,12 +47,12 @@ Example
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy
from stable_baselines import DeepQ
from stable_baselines import DQN
env = gym.make('CartPole-v1')
env = DummyVecEnv([lambda: env])
model = DeepQ(MlpPolicy, env, verbose=1)
model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("deepq_cartpole")
@@ -73,17 +73,17 @@ With Atari:
from stable_baselines.common.atari_wrappers import make_atari
from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy
from stable_baselines import DeepQ
from stable_baselines import DQN
env = make_atari('BreakoutNoFrameskip-v4')
model = DeepQ(CnnPolicy, env, verbose=1)
model = DQN(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("deepq_breakout")
del model # remove to demonstrate saving and loading
DeepQ.load("deepq_breakout")
DQN.load("deepq_breakout")
obs = env.reset()
while True:
@@ -94,7 +94,7 @@ With Atari:
Parameters
----------

.. autoclass:: DeepQ
.. autoclass:: DQN
:members:
:inherited-members:

@@ -135,7 +135,7 @@ You can easily define a custom architecture for the policy network:
from stable_baselines.deepq.policies import FeedForwardPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DeepQ
from stable_baselines import DQN
# Custom MLP policy of three layers of size 128 each
class CustomPolicy(FeedForwardPolicy):
@@ -149,6 +149,6 @@ You can easily define a custom architecture for the policy network:
env = gym.make('LunarLander-v2')
env = DummyVecEnv([lambda: env])
model = DeepQ(CustomPolicy, env, verbose=1)
model = DQN(CustomPolicy, env, verbose=1)
# Train the agent
model.learn(total_timesteps=100000)
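
As a short usage sketch tying the renamed DQN class to the prediction changes in this release; it assumes the "deepq_cartpole" model saved in the first example above and is not part of the documentation diff itself.

import gym

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DQN

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = DQN.load("deepq_cartpole")

obs = env.reset()
for _ in range(1000):
    # deterministic=True uses the greedy action (the argument added in this release)
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()
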
4 changes: 2 additions & 2 deletions stable_baselines/__init__.py
@@ -2,10 +2,10 @@
from stable_baselines.acer import ACER
from stable_baselines.acktr import ACKTR
from stable_baselines.ddpg import DDPG
from stable_baselines.deepq import DeepQ
from stable_baselines.deepq import DQN
from stable_baselines.gail import GAIL
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO

__version__ = "1.0.8.rc2"
__version__ = "2.0.0.rc0"
44 changes: 4 additions & 40 deletions stable_baselines/a2c/a2c.py
@@ -4,14 +4,14 @@
import tensorflow as tf

from stable_baselines import logger
from stable_baselines.common import explained_variance, tf_util, BaseRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common import explained_variance, tf_util, ActorCriticRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy
from stable_baselines.common.runners import AbstractEnvRunner
from stable_baselines.a2c.utils import discount_with_dones, Scheduler, find_trainable_variables, mse, \
total_episode_reward_logger


class A2C(BaseRLModel):
class A2C(ActorCriticRLModel):
"""
The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783
@@ -39,8 +39,8 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='linear', verbose=0, tensorboard_log=None,
_init_setup_model=True):

super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, policy_base=ActorCriticPolicy,
requires_vec_env=True)
super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model)

self.n_steps = n_steps
self.gamma = gamma
@@ -240,25 +240,6 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_

return self

def predict(self, observation, state=None, mask=None, deterministic=False):
if state is None:
state = self.initial_state
if mask is None:
mask = [False for _ in range(self.n_envs)]
observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

actions, _, states, _ = self.step(observation, state, mask, deterministic=deterministic)
return actions, states

def action_probability(self, observation, state=None, mask=None):
if state is None:
state = self.initial_state
if mask is None:
mask = [False for _ in range(self.n_envs)]
observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

return self.proba_step(observation, state, mask)

def save(self, save_path):
data = {
"gamma": self.gamma,
@@ -282,23 +263,6 @@ def save(self, save_path):

self._save_to_file(save_path, data=data, params=params)

@classmethod
def load(cls, load_path, env=None, **kwargs):
data, params = cls._load_from_file(load_path)

model = cls(policy=data["policy"], env=None, _init_setup_model=False)
model.__dict__.update(data)
model.__dict__.update(kwargs)
model.set_env(env)
model.setup_model()

restores = []
for param, loaded_p in zip(model.params, params):
restores.append(param.assign(loaded_p))
model.sess.run(restores)

return model


class A2CRunner(AbstractEnvRunner):
def __init__(self, env, model, n_steps=5, gamma=0.99):
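
The predict, action_probability and load methods removed above were not dropped: per the changelog entry "refactored ActorCritic models to reduce code duplication", they now live once in the shared ActorCriticRLModel base class, and the same deletion is repeated in the ACER and ACKTR diffs below. A rough sketch of that pattern, reusing the deleted predict body; the class and attribute names other than those shown in the diff are illustrative, not the actual base_class.py code.

import numpy as np

class SharedActorCriticBase(object):
    # Sketch only: one predict shared by A2C, ACER and ACKTR instead of three copies.
    def __init__(self, observation_space, n_envs, initial_state, step_fn):
        self.observation_space = observation_space
        self.n_envs = n_envs
        self.initial_state = initial_state
        self.step = step_fn  # provided by the concrete algorithm

    def predict(self, observation, state=None, mask=None, deterministic=False):
        if state is None:
            state = self.initial_state
        if mask is None:
            mask = [False for _ in range(self.n_envs)]
        observation = np.array(observation).reshape((-1,) + self.observation_space.shape)
        actions, _, states, _ = self.step(observation, state, mask, deterministic=deterministic)
        return actions, states
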
46 changes: 4 additions & 42 deletions stable_baselines/acer/acer_simple.py
@@ -8,7 +8,7 @@
from stable_baselines.a2c.utils import batch_to_seq, seq_to_batch, Scheduler, find_trainable_variables, EpisodeStats, \
get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance, total_episode_reward_logger
from stable_baselines.acer.buffer import Buffer
from stable_baselines.common import BaseRLModel, tf_util, SetVerbosity, TensorboardWriter
from stable_baselines.common import ActorCriticRLModel, tf_util, SetVerbosity, TensorboardWriter
from stable_baselines.common.runners import AbstractEnvRunner
from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy

@@ -59,7 +59,7 @@ def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
return qret


class ACER(BaseRLModel):
class ACER(ActorCriticRLModel):
"""
The ACER (Actor-Critic with Experience Replay) model class, https://arxiv.org/abs/1611.01224
@@ -97,8 +97,8 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5,
replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
verbose=0, tensorboard_log=None, _init_setup_model=True):

super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, policy_base=ActorCriticPolicy,
requires_vec_env=True)
super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model)

self.n_steps = n_steps
self.replay_ratio = replay_ratio
@@ -518,27 +518,6 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_

return self

def predict(self, observation, state=None, mask=None, deterministic=False):
if state is None:
state = self.initial_state
if mask is None:
mask = [False for _ in range(self.n_envs)]

observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

actions, _, states, _ = self.step(observation, state, mask, deterministic=deterministic)
return actions, states

def action_probability(self, observation, state=None, mask=None):
if state is None:
state = self.initial_state
if mask is None:
mask = [False for _ in range(self.n_envs)]

observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

return self.proba_step(observation, state, mask)

def save(self, save_path):
data = {
"gamma": self.gamma,
@@ -564,23 +543,6 @@ def save(self, save_path):

self._save_to_file(save_path, data=data, params=params)

@classmethod
def load(cls, load_path, env=None, **kwargs):
data, params = cls._load_from_file(load_path)

model = cls(policy=data["policy"], env=env, _init_setup_model=False)
model.__dict__.update(data)
model.__dict__.update(kwargs)
model.set_env(env)
model.setup_model()

restores = []
for param, loaded_p in zip(model.params, params):
restores.append(param.assign(loaded_p))
model.sess.run(restores)

return model


class _Runner(AbstractEnvRunner):
def __init__(self, env, model, n_steps):
44 changes: 4 additions & 40 deletions stable_baselines/acktr/acktr_disc.py
@@ -9,15 +9,15 @@
from gym.spaces import Box

from stable_baselines import logger
from stable_baselines.common import explained_variance, BaseRLModel, tf_util, SetVerbosity, TensorboardWriter
from stable_baselines.common import explained_variance, ActorCriticRLModel, tf_util, SetVerbosity, TensorboardWriter
from stable_baselines.a2c.a2c import A2CRunner
from stable_baselines.a2c.utils import Scheduler, find_trainable_variables, calc_entropy, mse, \
total_episode_reward_logger
from stable_baselines.acktr import kfac
from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy


class ACKTR(BaseRLModel):
class ACKTR(ActorCriticRLModel):
"""
The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144
@@ -43,8 +43,8 @@ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
tensorboard_log=None, _init_setup_model=True):

super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, policy_base=ActorCriticPolicy,
requires_vec_env=True)
super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model)

self.n_steps = n_steps
self.gamma = gamma
@@ -302,25 +302,6 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_

return self

def predict(self, observation, state=None, mask=None, deterministic=False):
if state is None:
state = self.initial_state
if mask is None:
mask = [False for _ in range(self.n_envs)]
observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

actions, _, states, _ = self.step(observation, state, mask, deterministic=deterministic)
return actions, states

def action_probability(self, observation, state=None, mask=None):
if state is None:
state = self.initial_state
if mask is None:
mask = [False for _ in range(self.n_envs)]
observation = np.array(observation).reshape((-1,) + self.observation_space.shape)

return self.proba_step(observation, state, mask)

def save(self, save_path):
data = {
"gamma": self.gamma,
@@ -344,20 +325,3 @@ def save(self, save_path):
params = self.sess.run(self.params)

self._save_to_file(save_path, data=data, params=params)

@classmethod
def load(cls, load_path, env=None, **kwargs):
data, params = cls._load_from_file(load_path)

model = cls(policy=data["policy"], env=env, _init_setup_model=False)
model.__dict__.update(data)
model.__dict__.update(kwargs)
model.set_env(env)
model.setup_model()

restores = []
for param, loaded_p in zip(model.params, params):
restores.append(param.assign(loaded_p))
model.sess.run(restores)

return model
7 changes: 4 additions & 3 deletions stable_baselines/common/__init__.py
@@ -1,8 +1,9 @@
# flake8: noqa F403
from stable_baselines.common.console_util import fmt_row, fmt_item, colorize
from stable_baselines.common.dataset import Dataset
from stable_baselines.common.math_util import discount, discount_with_boundaries, explained_variance, explained_variance_2d,\
flatten_arrays, unflatten_vector
from stable_baselines.common.math_util import discount, discount_with_boundaries, explained_variance, \
explained_variance_2d, flatten_arrays, unflatten_vector
from stable_baselines.common.misc_util import zipsame, unpack, EzPickle, set_global_seeds, pretty_eta, RunningAvg,\
boolean_flag, get_wrapper_by_name, relatively_safe_pickle_dump, pickle_load
from stable_baselines.common.base_class import BaseRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.base_class import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel, SetVerbosity, \
TensorboardWriter
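
A quick check of the class hierarchy these new exports describe; the assertion covers only what the diffs in this commit show (A2C, ACER and ACKTR now extend ActorCriticRLModel), and nothing is assumed about which algorithms use OffPolicyRLModel.

# The import line below demonstrates the new exports from stable_baselines.common.
from stable_baselines.common import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel
from stable_baselines import A2C, ACER, ACKTR

for algo in (A2C, ACER, ACKTR):
    # Each actor-critic algorithm now derives from the shared base class (see diffs above).
    assert issubclass(algo, ActorCriticRLModel)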
