Commit

Merge pull request DLR-RM#6 from Antonin-Raffin/feat/sde-features
Feature Extractor for SDE
araffin authored and GitHub Enterprise committed Jan 20, 2020
2 parents 8874b9d + 0bed698 commit 358b27e
Showing 30 changed files with 1,105 additions and 386 deletions.
21 changes: 3 additions & 18 deletions README.md
@@ -14,24 +14,9 @@ PyTorch version of [Stable Baselines](https://github.com/hill-a/stable-baselines
- SAC
- TD3

- SDE support for A2C, PPO, SAC and TD3.


## Roadmap

TODO:
- save/load
- better predict
- complete logger
- SDE: learn the feature extractor?
- Refactor: buffer with numpy array instead of pytorch
- Refactor: remove duplicated code for evaluation
- plotting? -> zoo

Later:
- get_parameters / set_parameters
- CNN policies + normalization
- tensorboard support
- DQN
- TRPO
- ACER
- DDPG
- HER -> use stable-baselines because it does not depend on tf?
- cf github Roadmap
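
For context, a minimal usage sketch of the SDE support listed above, modelled on the calls in tests/test_sde.py further down in this diff; the environment, timestep budget and the particular sde_net_arch value are illustrative assumptions, not prescribed by the PR:

from torchy_baselines import SAC

# Enable state-dependent exploration (SDE); sde_net_arch requests a separate
# feature extractor for the exploration noise, which appears to be the
# feature added by this PR.
model = SAC('MlpPolicy', 'Pendulum-v0', use_sde=True, verbose=1,
            policy_kwargs=dict(log_std_init=-2, sde_net_arch=[32, 16]))
model.learn(total_timesteps=1000)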
8 changes: 7 additions & 1 deletion setup.py
@@ -23,6 +23,12 @@
'sphinx',
'sphinx-autobuild',
'sphinx-rtd-theme'
],
'extra': [
# For render
'opencv-python',
# For reading logs
'pandas'
]
},
description='Pytorch version of Stable Baselines, implementations of reinforcement learning algorithms.',
@@ -34,7 +40,7 @@
license="MIT",
long_description="",
long_description_content_type='text/markdown',
version="0.0.6a",
version="0.0.8a0",
)

# python setup.py sdist
5 changes: 1 addition & 4 deletions tests/test_custom_policy.py
@@ -1,6 +1,3 @@
import os

import gym
import pytest

from torchy_baselines import PPO
@@ -15,4 +12,4 @@
[12, dict(pi=[8])],
])
def test_flexible_mlp(net_arch):
model = PPO('MlpPolicy', 'CartPole-v1', policy_kwargs=dict(net_arch=net_arch), n_steps=100).learn(1000)
_ = PPO('MlpPolicy', 'CartPole-v1', policy_kwargs=dict(net_arch=net_arch), n_steps=100).learn(1000)
56 changes: 53 additions & 3 deletions tests/test_distributions.py
@@ -1,8 +1,10 @@
import numpy as np
import pytest
import torch as th

from torchy_baselines.common.distributions import DiagGaussianDistribution, SquashedDiagGaussianDistribution,\
CategoricalDistribution, TanhBijector
from torchy_baselines.common.distributions import DiagGaussianDistribution, TanhBijector, \
StateDependentNoiseDistribution
from torchy_baselines.common.utils import set_random_seed


# TODO: more tests for the other distributions
def test_bijector():
@@ -18,3 +20,51 @@ def test_bijector():
assert th.max(th.abs(squashed_actions)) <= 1.0
# Check the inverse method
assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all()


def test_sde_distribution():
n_samples = int(5e6)
n_features = 2
n_actions = 1
deterministic_actions = th.ones(n_samples, n_actions) * 0.1
state = th.ones(n_samples, n_features) * 0.3
dist = StateDependentNoiseDistribution(n_actions, full_std=True, squash_output=False)

set_random_seed(1)
_, log_std = dist.proba_distribution_net(n_features)
dist.sample_weights(log_std, batch_size=n_samples)

actions, _ = dist.proba_distribution(deterministic_actions, log_std, state)

assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=1e-3)
assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=1e-3)


N_ACTIONS = 1


# TODO: fix for num action > 1
# TODO: analytical form for squashed Gaussian?
@pytest.mark.parametrize("dist", [
DiagGaussianDistribution(N_ACTIONS),
StateDependentNoiseDistribution(N_ACTIONS, squash_output=False),
])
def test_entropy(dist):
# The entropy can be approximated by averaging the negative log likelihood
# mean negative log likelihood == differential entropy
n_samples = int(5e6)
n_features = 3
set_random_seed(1)
state = th.rand(n_samples, n_features)
deterministic_actions = th.rand(n_samples, N_ACTIONS)
_, log_std = dist.proba_distribution_net(n_features, log_std_init=th.log(th.tensor(0.2)))

if isinstance(dist, DiagGaussianDistribution):
actions, dist = dist.proba_distribution(deterministic_actions, log_std)
else:
dist.sample_weights(log_std, batch_size=n_samples)
actions, dist = dist.proba_distribution(deterministic_actions, log_std, state)

entropy = dist.entropy()
log_prob = dist.log_prob(actions)
assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
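
The entropy test above relies on the identity that, for a continuous distribution, the differential entropy equals the expected negative log likelihood, H(p) = -E_p[log p(x)], so a large Monte-Carlo average of -log_prob should match entropy(). A standalone sketch of that check with a plain torch.distributions.Normal (the scale 0.2 and sample count simply mirror the test and are not otherwise significant):

import torch as th
from torch.distributions import Normal

th.manual_seed(0)
dist = Normal(loc=th.zeros(1), scale=0.2 * th.ones(1))
samples = dist.sample((5_000_000,))
# Monte-Carlo estimate of the differential entropy: -E[log p(x)]
mc_entropy = -dist.log_prob(samples).mean()
assert th.allclose(dist.entropy().mean(), mc_entropy, rtol=5e-3)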
82 changes: 82 additions & 0 deletions tests/test_logger.py
@@ -0,0 +1,82 @@
import os
import shutil

import pytest
import numpy as np

from torchy_baselines.common.logger import (make_output_format, read_csv, read_json, DEBUG, ScopedConfigure,
info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, error, reset)

KEY_VALUES = {
"test": 1,
"b": -3.14,
"8": 9.9,
"l": [1, 2],
"a": np.array([1, 2, 3]),
"f": np.array(1),
"g": np.array([[[1]]]),
}

LOG_DIR = '/tmp/torchy_baselines/'


def test_main():
"""
tests for the logger module
"""
info("hi")
debug("shouldn't appear")
set_level(DEBUG)
debug("should appear")
folder = "/tmp/testlogging"
if os.path.exists(folder):
shutil.rmtree(folder)
configure(folder=folder)
logkv("a", 3)
logkv("b", 2.5)
dumpkvs()
logkv("b", -2.5)
logkv("a", 5.5)
dumpkvs()
info("^^^ should see a = 5.5")
logkv_mean("b", -22.5)
logkv_mean("b", -44.4)
logkv("a", 5.5)
dumpkvs()
with ScopedConfigure(None, None):
info("^^^ should see b = 33.3")

with ScopedConfigure("/tmp/test-logger/", ["json"]):
logkv("b", -2.5)
dumpkvs()

reset()
logkv("a", "longasslongasslongasslongasslongasslongassvalue")
dumpkvs()
warn("hey")
error("oh")
logkvs({"test": 1})


@pytest.mark.parametrize('_format', ['stdout', 'log', 'json', 'csv'])
def test_make_output(_format):
"""
test make output
:param _format: (str) output format
"""
writer = make_output_format(_format, LOG_DIR)
writer.writekvs(KEY_VALUES)
if _format == "csv":
read_csv(LOG_DIR + 'progress.csv')
elif _format == 'json':
read_json(LOG_DIR + 'progress.json')
writer.close()


def test_make_output_fail():
"""
test value error on logger
"""
with pytest.raises(ValueError):
make_output_format('dummy_format', LOG_DIR)
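
As a companion to the test above, a condensed sketch of the key-value logging round trip it exercises; the folder path and keys are arbitrary examples:

from torchy_baselines.common.logger import configure, logkv, logkv_mean, dumpkvs

configure(folder='/tmp/torchy_baselines_demo')  # hypothetical output folder
logkv('timesteps', 1000)       # plain key/value for the current iteration
logkv_mean('reward', 1.0)      # running mean, aggregated until the next dump
logkv_mean('reward', 3.0)      # -> written out as 2.0
dumpkvs()                      # flush everything accumulated so far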
5 changes: 3 additions & 2 deletions tests/test_run.py
@@ -37,9 +37,10 @@ def test_onpolicy(model_class, env_id):
os.remove("test_save.zip")


def test_sac():
@pytest.mark.parametrize("ent_coef", ['auto', 0.01])
def test_sac(ent_coef):
model = SAC('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto',
learning_starts=100, verbose=1, create_eval_env=True, ent_coef=ent_coef,
action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
model.learn(total_timesteps=1000, eval_freq=500)
model.save("test_save")
10 changes: 5 additions & 5 deletions tests/test_save_load.py
@@ -1,13 +1,12 @@
import numpy as np
import os
import pytest
from copy import deepcopy
import numpy as np

import torch as th
from copy import deepcopy

from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3
from torchy_baselines.common.identity_env import IdentityEnvBox
from torchy_baselines.common.vec_env import DummyVecEnv
from torchy_baselines.common.identity_env import IdentityEnvBox, IdentityEnv

MODEL_LIST = [
CEMRL,
@@ -81,7 +80,8 @@ def test_save_load(model_class):
for optimizer, opt_state in opt_params.items():
for param_group_idx, param_group in enumerate(opt_state['param_groups']):
for param_key, param_value in param_group.items():
if param_key == 'params': # don't know how to handle params correctly, therefore only check if we have the same amount
# don't know how to handle params correctly, therefore only check if we have the same amount
if param_key == 'params':
assert len(param_value) == len(
new_opt_params[optimizer]['param_groups'][param_group_idx][param_key])
else:
66 changes: 31 additions & 35 deletions tests/test_sde.py
@@ -1,69 +1,65 @@
import pytest

import gym
import torch as th
from torch.distributions import Normal

from torchy_baselines import A2C, TD3
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
from torchy_baselines.common.monitor import Monitor
from torchy_baselines import A2C, TD3, SAC


def test_state_dependent_exploration():
def test_state_dependent_exploration_grad():
"""
Check that the gradient corresponds to the expected one
"""
n_states = 2
state_dim = 3
# TODO: fix for action_dim > 1
action_dim = 1
sigma = th.ones(state_dim, 1, requires_grad=True)
action_dim = 10
sigma_hat = th.ones(state_dim, action_dim, requires_grad=True)
# Reduce the number of parameters
# sigma_ = th.ones(state_dim, action_dim) * sigma_

# weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
th.manual_seed(2)
weights_dist = Normal(th.zeros_like(sigma), sigma)

weights_dist = Normal(th.zeros_like(sigma_hat), sigma_hat)
weights = weights_dist.rsample()

state = th.rand(n_states, state_dim)
mu = th.ones(action_dim)
# print(weights.shape, state.shape)
noise = th.mm(state, weights)

variance = th.mm(state ** 2, sigma ** 2)
action = mu + noise

variance = th.mm(state ** 2, sigma_hat ** 2)
action_dist = Normal(mu, th.sqrt(variance))

loss = action_dist.log_prob((mu + noise).detach()).mean()
# Sum over the action dimension because we assume they are independent
loss = action_dist.log_prob(action.detach()).sum(dim=-1).mean()
loss.backward()

# From Rueckstiess paper
grad = th.zeros_like(sigma)
# From Rueckstiess paper: check that the computed gradient
# corresponds to the analytical form
grad = th.zeros_like(sigma_hat)
for j in range(action_dim):
# sigma_hat is the std of the gaussian distribution of the noise matrix weights
# sigma_j ** 2 = sum_i(state_i ** 2 * sigma_hat_ij ** 2)
# sigma_j is the standard deviation of the policy gaussian distribution
sigma_j = th.sqrt(variance[:, j])
for i in range(state_dim):
a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
grad[i, j] = a.mean()
# Derivative of the log probability of the jth component of the action
# w.r.t. the standard deviation sigma_j
d_log_policy_j = (noise[:, j] ** 2 - sigma_j ** 2) / sigma_j ** 3
# Derivative of sigma_j w.r.t. sigma_hat_ij
d_log_sigma_j = (state[:, i] ** 2 * sigma_hat[i, j]) / sigma_j
# Chain rule, average over the minibatch
grad[i, j] = (d_log_policy_j * d_log_sigma_j).mean()

# sigma.grad should be equal to grad
assert sigma.grad.allclose(grad)


@pytest.mark.parametrize("model_class", [A2C])
def test_state_dependent_noise(model_class):
env_id = 'MountainCarContinuous-v0'

env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True)
eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False)

model = model_class('MlpPolicy', env, n_steps=200, use_sde=True, ent_coef=0.00, verbose=1, learning_rate=3e-4,
policy_kwargs=dict(log_std_init=0.0, ortho_init=False), seed=None)
model.learn(total_timesteps=int(1000), log_interval=5, eval_freq=500, eval_env=eval_env)
assert sigma_hat.grad.allclose(grad)


@pytest.mark.parametrize("model_class", [TD3])
def test_state_dependent_offpolicy_noise(model_class):
@pytest.mark.parametrize("model_class", [TD3, SAC, A2C])
@pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []])
def test_state_dependent_offpolicy_noise(model_class, sde_net_arch):
model = model_class('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True,
verbose=1, policy_kwargs=dict(log_std_init=-2))
verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch))
model.learn(total_timesteps=int(1000), eval_freq=500)


@@ -72,6 +68,6 @@ def scheduler(progress):
return -2.0 * progress + 1

model = TD3('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True,
verbose=1, sde_log_std_scheduler=scheduler)
verbose=1, sde_log_std_scheduler=scheduler)
model.learn(total_timesteps=int(1000), eval_freq=500)
assert th.isclose(model.actor.log_std, th.ones_like(model.actor.log_std)).all()
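
For reference, the analytical gradient that the rewritten test checks, following the Rueckstiess et al. state-dependent exploration derivation in the notation of the test (state s, exploration-matrix standard deviations \hat{\sigma}_{ij}, per-action noise \epsilon_j):

\epsilon_j = \sum_i s_i \, w_{ij}, \qquad w_{ij} \sim \mathcal{N}(0, \hat{\sigma}_{ij}^2), \qquad
\sigma_j^2 = \sum_i s_i^2 \, \hat{\sigma}_{ij}^2

\frac{\partial \log \pi}{\partial \hat{\sigma}_{ij}}
  = \frac{\partial \log \pi}{\partial \sigma_j} \cdot \frac{\partial \sigma_j}{\partial \hat{\sigma}_{ij}}
  = \frac{\epsilon_j^2 - \sigma_j^2}{\sigma_j^3} \cdot \frac{s_i^2 \, \hat{\sigma}_{ij}}{\sigma_j}

averaged over the minibatch, which is what grad[i, j] accumulates in the loop above.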
7 changes: 3 additions & 4 deletions tests/test_vec_envs.py
@@ -325,9 +325,8 @@ def make_env():
assert wrapped.name_test() == CustomWrapperBB

double_wrapped = CustomWrapperA(CustomWrapperB(wrapped))
dummy = double_wrapped.var_a # should not raise as it is directly defined here
_ = double_wrapped.var_a # should not raise as it is directly defined here
with pytest.raises(AttributeError): # should raise due to ambiguity
dummy = double_wrapped.var_b
_ = double_wrapped.var_b
with pytest.raises(AttributeError): # should raise as does not exist
dummy = double_wrapped.nonexistent_attribute
del dummy # keep linter happy
_ = double_wrapped.nonexistent_attribute
