In [61]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

tf.random.set_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
# Create the environment and record the facts the networks are built from.
env = gym.make('Pendulum-v0')
# Seed the environment so reset() and the dynamics are repeatable between
# runs; the TensorFlow seed alone does not cover the environment's own RNG.
# NOTE(review): relies on the legacy `Env.seed` API that ships with the gym
# releases registering 'Pendulum-v0' - confirm against the installed version.
env.seed(0)
# A Box action space is treated as the continuous-action case.
is_continuous = isinstance(env.action_space, gym.spaces.Box)
# Shape tuples of the observation and action spaces.
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape

In [63]:
# Build the actor and critic from the space shapes recorded above and hand
# both to the PPO agent.
# NOTE(review): with the global TF seed set in the first cell, the order in
# which these objects are created may influence their initial weights -
# confirm before reordering.
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
vero = PPO_Agent(actor, critic)

In [64]:
# Create the rollout generator for this agent/environment pair.
# horizon=2048 is presumably the number of steps per yielded rollout -
# TODO confirm against rollouts_generator.
generator = rollouts_generator(vero, env, horizon=2048)
# Advance the generator with the built-in next() instead of calling the
# __next__ dunder directly; the result is the first yielded rollout.
rollout = next(generator)

In [65]:
# Derive the advantage estimates and the value targets from the rollout.
# lam / gamma are presumably the GAE lambda and the discount factor -
# TODO confirm against get_adv_vtarg.
advantage, target_value = get_adv_vtarg(
    rollout,
    lam=0.95,
    gamma=0.99,
)

In [66]:
# Run the agent's update on the collected rollout for two epochs.
# The call prints one `ratio:` diagnostic line per update, as seen in the
# cell output.
vero.run_epoch(
    rollout['ob'],
    rollout['ac'],
    rollout['log_probs'],
    target_value,
    advantage,
    epochs=2,
)

ratio: 1
ratio: 1.0002220736139966
ratio: 1.000821819882151
ratio: 0.99941570137578351
ratio: 1.0029618736356261
ratio: 1.0008034525190732
ratio: 0.99880714940146542
ratio: 0.99642975145576052
ratio: 0.99991070827547412
ratio: 0.9965896842720281
ratio: 1.0009503583676711
ratio: 0.9999652829090161
ratio: 0.99538609720992888
ratio: 0.99869287104913429
ratio: 1.0014511860669066
ratio: 1.0041919006031608
ratio: 1.000266283529982
ratio: 1.0065575418362083
ratio: 0.99855082092521563
ratio: 0.996354249329997
ratio: 0.99485256259149257
ratio: 0.99854199669763211
ratio: 1.002710114813929
ratio: 1.0122081963728538
ratio: 1.0060392069143616
ratio: 1.0102116742705034
ratio: 0.998385379103362
ratio: 1.0018994864175386
ratio: 1.0012905166846637
ratio: 1.00183017827744
ratio: 1.0009652956019162
ratio: 0.99938074321991066
ratio: 1.0030329573368961
ratio: 1.0116941127316781
ratio: 1.0067450334372721
ratio: 0.998611414906235
ratio: 1.0103327045610122
ratio: 1.0033907416453156
ratio: 0.99718169084731512


In [39]:
# Sanity check: print the log-probability returned by act_stochastic next to
# the one recomputed from the policy distribution for the same action, then
# the distribution's entropy.
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)
# np.newaxis adds a leading axis of length 1 before querying the
# distribution.
l = vero.get_distributions(obs[np.newaxis])
print(l.log_prob(ac), l.entropy(), sep='\n')

tf.Tensor([-0.42190623], shape=(1,), dtype=float64)
tf.Tensor([[-0.42190623]], shape=(1, 1), dtype=float64)
tf.Tensor([[0.88893853]], shape=(1, 1), dtype=float64)


In [8]:
# Draw a random action from the action space and print its dtype.
# Note: this rebinds `ac`, which the log-probability check cell also assigns.
ac = env.action_space.sample()
print(ac.dtype)

float32


In [9]:
# Print the NumPy float64 type object - presumably for comparison with the
# float32 dtype of sampled actions.
print(np.float64)

<class 'numpy.float64'>


In [10]:
import tensorflow_probability as tfp

In [11]:
# Short alias for the TFP distributions module (the first cell already
# imports the same module under the name `dists`).
tfd = tfp.distributions


In [12]:
# A batch of four Normal distributions, one per (loc, scale) pair.
dist = tfd.Normal(
    loc=np.array([0.0, 1.0, 2.0, 0.5]),
    scale=np.array([0.02, 0.09, 0.1, 1.0]),
)

In [13]:
# Draw one sample from the distribution; the bare expression makes the
# resulting tensor the cell output.
dist.sample()


<tf.Tensor: id=171838, shape=(4,), dtype=float64, numpy=array([0.01061005, 0.96683973, 2.05509682, 1.26374006])>

In [14]:
# Display the observation-space shape recorded when the environment was set up.
obs_dim

(3,)