In [6]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

tf.random.set_seed(0)

In [26]:
env = gym.make('Pendulum-v0')
is_continuous = isinstance(env.action_space, gym.spaces.Box)
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape

In [27]:
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
vero = PPO_Agent(actor, critic)

In [28]:
generator = rollouts_generator(vero, env, horizon=2048)

In [22]:

rollout = generator.__next__()

In [23]:
advantage, target_value = get_adv_vtarg(rollout, lam=0.95, gamma=0.99)

In [24]:
vero.run_epoch(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage, epochs=5)

0
ratio: 1
ratio: 0.99986536579738594
ratio: 1.0002050306771106
ratio: 1.0000960784213153
ratio: 1.0003117955160292
ratio: 1.0004834391937216
ratio: 0.99926434532689923
ratio: 1.0006816304514778
ratio: 1.0001126785073315
ratio: 0.99794626571821765
ratio: 0.997715335885212
ratio: 1.0024440811561544
ratio: 1.0022740357153386
ratio: 1.0001499875573048
ratio: 1.0000772947360512
ratio: 0.99941493507176959
ratio: 1.0002766605937596
ratio: 0.99822019019761166
ratio: 0.99982106261295989
ratio: 1.0004539418544824
ratio: 1.0012911335335788
ratio: 1.0014382488972178
ratio: 0.99894058165126687
ratio: 0.99605942045174123
ratio: 1.0028261540144627
ratio: 1.0066989862779403
ratio: 1.0018653282597774
ratio: 1.0064422061160445
ratio: 0.9961470337060665
ratio: 0.99960341926649243
ratio: 0.9986721303510917
ratio: 1.0108206816435
1
ratio: 0.99363130512267717
ratio: 0.9967984866073103
ratio: 1.00303683154325
ratio: 0.99818918871425055
ratio: 0.998604596870793
ratio: 1.0079575464835486
ratio: 1.000230047499

# Training loop

In [29]:
num_ite = 200
lam = 0.95
gamma = 0.99
num_epochs = 10

for i in range(num_ite):
    roll = generator.__next__()
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero.run_epoch(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage, epochs=num_epochs)
    rewards = np.array(roll["ep_rets"])
    
    if i % 10 == 0 or i == num_ite-1:
        mean, std = rewards.mean(), rewards.std()
        print('mean', mean)
        print('std', std)

mean -1212.197886086397
std 231.7375959857599
mean -1493.4496442358027
std 102.53170961987499
mean -1543.9191801547904
std 101.78788577157268
mean -1572.9159071051088
std 148.4869459656204
mean -1607.6499063854849
std 166.63569543188007
mean -1494.870606405347
std 227.9790688388157
mean -1370.627460615131
std 202.99956137229623
mean nan
std nan


KeyboardInterrupt: 

In [39]:
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)
l = vero.get_distributions(obs[None])
print(l.log_prob(ac))
print(l.entropy())

tf.Tensor([-0.42190623], shape=(1,), dtype=float64)
tf.Tensor([[-0.42190623]], shape=(1, 1), dtype=float64)
tf.Tensor([[0.88893853]], shape=(1, 1), dtype=float64)


In [8]:
ac = env.action_space.sample()
print(ac.dtype)

float32


In [9]:
print(np.float64)

<class 'numpy.float64'>


In [10]:
import tensorflow_probability as tfp

In [11]:
tfd = tfp.distributions


In [12]:
dist = tfd.Normal(loc=np.array([0., 1, 2, 0.5]), scale=np.array([0.02, 0.09, 0.1, 1]))

In [13]:
dist.sample()


<tf.Tensor: id=171838, shape=(4,), dtype=float64, numpy=array([0.01061005, 0.96683973, 2.05509682, 1.26374006])>

In [14]:
obs_dim

(3,)