In [6]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Seed the global random generators up front so that Restart & Run All gives
# repeatable numbers. TensorFlow and NumPy keep separate global RNG state, so
# both are seeded from a single named constant.
# NOTE(review): whether rl_agents draws from NumPy's global RNG is not visible
# from this notebook -- seeding it is harmless either way.
SEED = 0
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [7]:
# Build the Pendulum environment and seed it so that resets (and therefore the
# collected rollouts) are repeatable across kernel restarts.
env = gym.make('Pendulum-v0')
env.seed(0)  # legacy Gym seeding API, available in the releases that ship Pendulum-v0

# A Box action space is treated as the continuous-control case.
is_continuous = isinstance(env.action_space, gym.spaces.Box)

# Despite the `_dim` suffix these hold the spaces' `.shape` tuples, not ints.
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape

In [8]:
# Assemble the PPO agent from its two networks.
# The actor is built from the observation shape, the action shape and the
# continuous/discrete flag; the critic only needs the observation shape.
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
# `vero` is the agent object used by every later cell in this notebook.
vero = PPO_Agent(actor, critic)

In [9]:
# Create the rollout generator for this agent/environment pair and pull one
# batch from it.
# NOTE(review): horizon=2048 is presumably the number of environment steps per
# rollout -- confirm against rl_agents.env_utils.rollouts_generator.
generator = rollouts_generator(vero, env, horizon=2048)
# Use the built-in next() instead of calling the __next__ dunder directly.
# The result is indexed by the keys 'ob', 'ac' and 'log_probs' further down.
rollout = next(generator)

In [10]:
# Hyper-parameters forwarded to get_adv_vtarg as `lam` and `gamma`.
LAM = 0.95
GAMMA = 0.99

# Derive the advantage estimates and the value-function targets from the
# rollout collected above.
advantage, target_value = get_adv_vtarg(rollout, lam=LAM, gamma=GAMMA)

In [18]:
# Run a single optimisation epoch over the collected rollout: observations,
# actions and their recorded log-probabilities, followed by the value targets
# and the advantages computed from the same rollout.
vero.run_epoch(
    rollout['ob'],
    rollout['ac'],
    rollout['log_probs'],
    target_value,
    advantage,
    epochs=1,
)

0
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.002486753545601
1
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 0.99915064769354744
2
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0010979458369522
3
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.000597702959658
4
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 0.99918143021246841
5
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0015500764403171
6
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0004204092614342
7
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 0.99439464618910522
8
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0003183928261086
9
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0034284752192122
10
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0042810966896898
11
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0018198618681304
12
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 0.99666420868202232
13
obs_no shape (64, 3)
ac_no shape (64, 1)
ratio: 1.0012909035530244
14
obs_no shape (64, 3)
ac_n

In [39]:
# Sanity check: the log-probability returned by act_stochastic must agree with
# the one recomputed from the policy distribution for the same observation and
# action.
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)

# obs[None] adds a leading axis before the distribution is queried.
action_dist = vero.get_distributions(obs[None])
recomputed_lp = action_dist.log_prob(ac)
print(recomputed_lp)
print(action_dist.entropy())

# The two log-probabilities print with different shapes ((1,) vs (1, 1)), so
# flatten both before comparing; this turns the eyeball check into a hard one.
np.testing.assert_allclose(np.ravel(lp), np.ravel(recomputed_lp), rtol=1e-6)

tf.Tensor([-0.42190623], shape=(1,), dtype=float64)
tf.Tensor([[-0.42190623]], shape=(1, 1), dtype=float64)
tf.Tensor([[0.88893853]], shape=(1, 1), dtype=float64)


In [8]:
# Draw a random action from the environment's action space and print its dtype.
# NOTE(review): this rebinds `ac`, which the act_stochastic cell also assigns;
# the value seen by other cells depends on which cell ran last.
ac = env.action_space.sample()
print(ac.dtype)

float32


In [9]:
# Print NumPy's float64 type object (scratch check while comparing dtypes).
print(np.float64)

<class 'numpy.float64'>


In [10]:
import tensorflow_probability as tfp

In [11]:
# Short alias for the TensorFlow Probability distributions module.
# NOTE(review): the first cell already imports this same module as `dists`.
tfd = tfp.distributions


In [12]:
# A batch of four scalar Normal distributions, one per (loc, scale) pair.
normal_loc = np.array([0., 1, 2, 0.5])
normal_scale = np.array([0.02, 0.09, 0.1, 1])
dist = tfd.Normal(loc=normal_loc, scale=normal_scale)

In [13]:
# Draw one sample from `dist`; as the cell's last expression it is displayed.
dist.sample()


<tf.Tensor: id=171838, shape=(4,), dtype=float64, numpy=array([0.01061005, 0.96683973, 2.05509682, 1.26374006])>

In [14]:
# Display the observation shape taken from env.observation_space.shape.
obs_dim

(3,)