Commit

Merge pull request DLR-RM#6 from Antonin-Raffin/feat/sde-features
Feature Extractor for SDE
araffin authored and GitHub Enterprise committed Jan 20, 2020
2 parents 8874b9d + 0bed698 commit 358b27e
Showing 30 changed files with 1,105 additions and 386 deletions.
21 changes: 3 additions & 18 deletions README.md
@@ -14,24 +14,9 @@ PyTorch version of [Stable Baselines](https://github.com/hill-a/stable-baselines
- SAC
- TD3

- SDE support for A2C, PPO, SAC and TD3.


## Roadmap

TODO:
- save/load
- better predict
- complete logger
- SDE: learn the feature extractor?
- Refactor: buffer with numpy array instead of pytorch
- Refactor: remove duplicated code for evaluation
- plotting? -> zoo

Later:
- get_parameters / set_parameters
- CNN policies + normalization
- tensorboard support
- DQN
- TRPO
- ACER
- DDPG
- HER -> use stable-baselines because it does not depend on tf?
- cf github Roadmap
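
For context, a minimal usage sketch of the SDE support listed above, modelled on the calls in tests/test_sde.py further down in this diff; the environment, timestep budget and the particular sde_net_arch value are illustrative assumptions, not prescribed by the PR:

from torchy_baselines import SAC

# Enable state-dependent exploration (SDE); sde_net_arch requests a separate
# feature extractor for the exploration noise, which appears to be the
# feature added by this PR.
model = SAC('MlpPolicy', 'Pendulum-v0', use_sde=True, verbose=1,
            policy_kwargs=dict(log_std_init=-2, sde_net_arch=[32, 16]))
model.learn(total_timesteps=1000)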
8 changes: 7 additions & 1 deletion setup.py
@@ -23,6 +23,12 @@
'sphinx',
'sphinx-autobuild',
'sphinx-rtd-theme'
],
'extra': [
# For render
'opencv-python',
# For reading logs
'pandas'
]
},
description='Pytorch version of Stable Baselines, implementations of reinforcement learning algorithms.',
@@ -34,7 +40,7 @@
license="MIT",
long_description="",
long_description_content_type='text/markdown',
version="0.0.6a",
version="0.0.8a0",
)

# python setup.py sdist
5 changes: 1 addition & 4 deletions tests/test_custom_policy.py
@@ -1,6 +1,3 @@
import os

import gym
import pytest

from torchy_baselines import PPO
@@ -15,4 +12,4 @@
[12, dict(pi=[8])],
])
def test_flexible_mlp(net_arch):
model = PPO('MlpPolicy', 'CartPole-v1', policy_kwargs=dict(net_arch=net_arch), n_steps=100).learn(1000)
_ = PPO('MlpPolicy', 'CartPole-v1', policy_kwargs=dict(net_arch=net_arch), n_steps=100).learn(1000)
56 changes: 53 additions & 3 deletions tests/test_distributions.py
@@ -1,8 +1,10 @@
import numpy as np
import pytest
import torch as th

from torchy_baselines.common.distributions import DiagGaussianDistribution, SquashedDiagGaussianDistribution,\
CategoricalDistribution, TanhBijector
from torchy_baselines.common.distributions import DiagGaussianDistribution, TanhBijector, \
StateDependentNoiseDistribution
from torchy_baselines.common.utils import set_random_seed


# TODO: more tests for the other distributions
def test_bijector():
@@ -18,3 +20,51 @@ def test_bijector():
assert th.max(th.abs(squashed_actions)) <= 1.0
# Check the inverse method
assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all()


def test_sde_distribution():
n_samples = int(5e6)
n_features = 2
n_actions = 1
deterministic_actions = th.ones(n_samples, n_actions) * 0.1
state = th.ones(n_samples, n_features) * 0.3
dist = StateDependentNoiseDistribution(n_actions, full_std=True, squash_output=False)

set_random_seed(1)
_, log_std = dist.proba_distribution_net(n_features)
dist.sample_weights(log_std, batch_size=n_samples)

actions, _ = dist.proba_distribution(deterministic_actions, log_std, state)

assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=1e-3)
assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=1e-3)


N_ACTIONS = 1


# TODO: fix for num action > 1
# TODO: analytical form for squashed Gaussian?
@pytest.mark.parametrize("dist", [
DiagGaussianDistribution(N_ACTIONS),
StateDependentNoiseDistribution(N_ACTIONS, squash_output=False),
])
def test_entropy(dist):
# The entropy can be approximated by averaging the negative log likelihood
# mean negative log likelihood == differential entropy
n_samples = int(5e6)
n_features = 3
set_random_seed(1)
state = th.rand(n_samples, n_features)
deterministic_actions = th.rand(n_samples, N_ACTIONS)
_, log_std = dist.proba_distribution_net(n_features, log_std_init=th.log(th.tensor(0.2)))

if isinstance(dist, DiagGaussianDistribution):
actions, dist = dist.proba_distribution(deterministic_actions, log_std)
else:
dist.sample_weights(log_std, batch_size=n_samples)
actions, dist = dist.proba_distribution(deterministic_actions, log_std, state)

entropy = dist.entropy()
log_prob = dist.log_prob(actions)
assert th.allclose(entropy.mean(), -log_prob.mean(), rtol=5e-3)
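
The entropy test above relies on the identity that, for a continuous distribution, the differential entropy equals the expected negative log likelihood, H(p) = -E_p[log p(x)], so a large Monte-Carlo average of -log_prob should match entropy(). A standalone sketch of that check with a plain torch.distributions.Normal (the scale 0.2 and sample count simply mirror the test and are not otherwise significant):

import torch as th
from torch.distributions import Normal

th.manual_seed(0)
dist = Normal(loc=th.zeros(1), scale=0.2 * th.ones(1))
samples = dist.sample((5_000_000,))
# Monte-Carlo estimate of the differential entropy: -E[log p(x)]
mc_entropy = -dist.log_prob(samples).mean()
assert th.allclose(dist.entropy().mean(), mc_entropy, rtol=5e-3)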
82 changes: 82 additions & 0 deletions tests/test_logger.py
@@ -0,0 +1,82 @@
import os
import shutil

import pytest
import numpy as np

from torchy_baselines.common.logger import (make_output_format, read_csv, read_json, DEBUG, ScopedConfigure,
info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, error, reset)

KEY_VALUES = {
"test": 1,
"b": -3.14,
"8": 9.9,
"l": [1, 2],
"a": np.array([1, 2, 3]),
"f": np.array(1),
"g": np.array([[[1]]]),
}

LOG_DIR = '/tmp/torchy_baselines/'


def test_main():
"""
tests for the logger module
"""
info("hi")
debug("shouldn't appear")
set_level(DEBUG)
debug("should appear")
folder = "/tmp/testlogging"
if os.path.exists(folder):
shutil.rmtree(folder)
configure(folder=folder)
logkv("a", 3)
logkv("b", 2.5)
dumpkvs()
logkv("b", -2.5)
logkv("a", 5.5)
dumpkvs()
info("^^^ should see a = 5.5")
logkv_mean("b", -22.5)
logkv_mean("b", -44.4)
logkv("a", 5.5)
dumpkvs()
with ScopedConfigure(None, None):
info("^^^ should see b = 33.3")

with ScopedConfigure("/tmp/test-logger/", ["json"]):
logkv("b", -2.5)
dumpkvs()

reset()
logkv("a", "longasslongasslongasslongasslongasslongassvalue")
dumpkvs()
warn("hey")
error("oh")
logkvs({"test": 1})


@pytest.mark.parametrize('_format', ['stdout', 'log', 'json', 'csv'])
def test_make_output(_format):
"""
test make output
:param _format: (str) output format
"""
writer = make_output_format(_format, LOG_DIR)
writer.writekvs(KEY_VALUES)
if _format == "csv":
read_csv(LOG_DIR + 'progress.csv')
elif _format == 'json':
read_json(LOG_DIR + 'progress.json')
writer.close()


def test_make_output_fail():
"""
test value error on logger
"""
with pytest.raises(ValueError):
make_output_format('dummy_format', LOG_DIR)
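
As a companion to the test above, a condensed sketch of the key-value logging round trip it exercises; the folder path and keys are arbitrary examples:

from torchy_baselines.common.logger import configure, logkv, logkv_mean, dumpkvs

configure(folder='/tmp/torchy_baselines_demo')  # hypothetical output folder
logkv('timesteps', 1000)       # plain key/value for the current iteration
logkv_mean('reward', 1.0)      # running mean, aggregated until the next dump
logkv_mean('reward', 3.0)      # -> written out as 2.0
dumpkvs()                      # flush everything accumulated so far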
5 changes: 3 additions & 2 deletions tests/test_run.py
@@ -37,9 +37,10 @@ def test_onpolicy(model_class, env_id):
os.remove("test_save.zip")


def test_sac():
@pytest.mark.parametrize("ent_coef", ['auto', 0.01])
def test_sac(ent_coef):
model = SAC('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
learning_starts=100, verbose=1, create_eval_env=True, ent_coef='auto',
learning_starts=100, verbose=1, create_eval_env=True, ent_coef=ent_coef,
action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
model.learn(total_timesteps=1000, eval_freq=500)
model.save("test_save")
10 changes: 5 additions & 5 deletions tests/test_save_load.py
@@ -1,13 +1,12 @@
import numpy as np
import os
import pytest
from copy import deepcopy
import numpy as np

import torch as th
from copy import deepcopy

from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3
from torchy_baselines.common.identity_env import IdentityEnvBox
from torchy_baselines.common.vec_env import DummyVecEnv
from torchy_baselines.common.identity_env import IdentityEnvBox, IdentityEnv

MODEL_LIST = [
CEMRL,
@@ -81,7 +80,8 @@ def test_save_load(model_class):
for optimizer, opt_state in opt_params.items():
for param_group_idx, param_group in enumerate(opt_state['param_groups']):
for param_key, param_value in param_group.items():
if param_key == 'params': # don't know how to handle params correctly, therefore only check if we have the same amount
# don't know how to handle params correctly, therefore only check if we have the same amount
if param_key == 'params':
assert len(param_value) == len(
new_opt_params[optimizer]['param_groups'][param_group_idx][param_key])
else:
66 changes: 31 additions & 35 deletions tests/test_sde.py
@@ -1,69 +1,65 @@
import pytest

import gym
import torch as th
from torch.distributions import Normal

from torchy_baselines import A2C, TD3
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize
from torchy_baselines.common.monitor import Monitor
from torchy_baselines import A2C, TD3, SAC


def test_state_dependent_exploration():
def test_state_dependent_exploration_grad():
"""
Check that the gradient corresponds to the expected one
"""
n_states = 2
state_dim = 3
# TODO: fix for action_dim > 1
action_dim = 1
sigma = th.ones(state_dim, 1, requires_grad=True)
action_dim = 10
sigma_hat = th.ones(state_dim, action_dim, requires_grad=True)
# Reduce the number of parameters
# sigma_ = th.ones(state_dim, action_dim) * sigma_

# weights_dist = Normal(th.zeros_like(log_sigma), th.exp(log_sigma))
th.manual_seed(2)
weights_dist = Normal(th.zeros_like(sigma), sigma)

weights_dist = Normal(th.zeros_like(sigma_hat), sigma_hat)
weights = weights_dist.rsample()

state = th.rand(n_states, state_dim)
mu = th.ones(action_dim)
# print(weights.shape, state.shape)
noise = th.mm(state, weights)

variance = th.mm(state ** 2, sigma ** 2)
action = mu + noise

variance = th.mm(state ** 2, sigma_hat ** 2)
action_dist = Normal(mu, th.sqrt(variance))

loss = action_dist.log_prob((mu + noise).detach()).mean()
# Sum over the action dimension because we assume they are independent
loss = action_dist.log_prob(action.detach()).sum(dim=-1).mean()
loss.backward()

# From Rueckstiess paper
grad = th.zeros_like(sigma)
# From Rueckstiess paper: check that the computed gradient
# corresponds to the analytical form
grad = th.zeros_like(sigma_hat)
for j in range(action_dim):
# sigma_hat is the std of the gaussian distribution of the noise matrix weights
# sigma_j ** 2 = sum_i(state_i ** 2 * sigma_hat_ij ** 2)
# sigma_j is the standard deviation of the policy gaussian distribution
sigma_j = th.sqrt(variance[:, j])
for i in range(state_dim):
a = ((noise[:, j] ** 2 - variance[:, j]) / (variance[:, j] ** 2)) * (state[:, i] ** 2 * sigma[i, j])
grad[i, j] = a.mean()
# Derivative of the log probability of the jth component of the action
# w.r.t. the standard deviation sigma_j
d_log_policy_j = (noise[:, j] ** 2 - sigma_j ** 2) / sigma_j ** 3
# Derivative of sigma_j w.r.t. sigma_hat_ij
d_log_sigma_j = (state[:, i] ** 2 * sigma_hat[i, j]) / sigma_j
# Chain rule, average over the minibatch
grad[i, j] = (d_log_policy_j * d_log_sigma_j).mean()

# sigma.grad should be equal to grad
assert sigma.grad.allclose(grad)


@pytest.mark.parametrize("model_class", [A2C])
def test_state_dependent_noise(model_class):
env_id = 'MountainCarContinuous-v0'

env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), norm_reward=True)
eval_env = VecNormalize(DummyVecEnv([lambda: Monitor(gym.make(env_id))]), training=False, norm_reward=False)

model = model_class('MlpPolicy', env, n_steps=200, use_sde=True, ent_coef=0.00, verbose=1, learning_rate=3e-4,
policy_kwargs=dict(log_std_init=0.0, ortho_init=False), seed=None)
model.learn(total_timesteps=int(1000), log_interval=5, eval_freq=500, eval_env=eval_env)
assert sigma_hat.grad.allclose(grad)


@pytest.mark.parametrize("model_class", [TD3])
def test_state_dependent_offpolicy_noise(model_class):
@pytest.mark.parametrize("model_class", [TD3, SAC, A2C])
@pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []])
def test_state_dependent_offpolicy_noise(model_class, sde_net_arch):
model = model_class('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True,
verbose=1, policy_kwargs=dict(log_std_init=-2))
verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch))
model.learn(total_timesteps=int(1000), eval_freq=500)


@@ -72,6 +68,6 @@ def scheduler(progress):
return -2.0 * progress + 1

model = TD3('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True,
verbose=1, sde_log_std_scheduler=scheduler)
verbose=1, sde_log_std_scheduler=scheduler)
model.learn(total_timesteps=int(1000), eval_freq=500)
assert th.isclose(model.actor.log_std, th.ones_like(model.actor.log_std)).all()
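
For reference, the analytical gradient that the rewritten test checks, following the Rueckstiess et al. state-dependent exploration derivation in the notation of the test (state s, exploration-matrix standard deviations \hat{\sigma}_{ij}, per-action noise \epsilon_j):

\epsilon_j = \sum_i s_i \, w_{ij}, \qquad w_{ij} \sim \mathcal{N}(0, \hat{\sigma}_{ij}^2), \qquad
\sigma_j^2 = \sum_i s_i^2 \, \hat{\sigma}_{ij}^2

\frac{\partial \log \pi}{\partial \hat{\sigma}_{ij}}
  = \frac{\partial \log \pi}{\partial \sigma_j} \cdot \frac{\partial \sigma_j}{\partial \hat{\sigma}_{ij}}
  = \frac{\epsilon_j^2 - \sigma_j^2}{\sigma_j^3} \cdot \frac{s_i^2 \, \hat{\sigma}_{ij}}{\sigma_j}

averaged over the minibatch, which is what grad[i, j] accumulates in the loop above.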
7 changes: 3 additions & 4 deletions tests/test_vec_envs.py
@@ -325,9 +325,8 @@ def make_env():
assert wrapped.name_test() == CustomWrapperBB

double_wrapped = CustomWrapperA(CustomWrapperB(wrapped))
dummy = double_wrapped.var_a # should not raise as it is directly defined here
_ = double_wrapped.var_a # should not raise as it is directly defined here
with pytest.raises(AttributeError): # should raise due to ambiguity
dummy = double_wrapped.var_b
_ = double_wrapped.var_b
with pytest.raises(AttributeError): # should raise as does not exist
dummy = double_wrapped.nonexistent_attribute
del dummy # keep linter happy
_ = double_wrapped.nonexistent_attribute
