In [30]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime
import scipy.signal

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg, get_gaeadv_vtarg
from rl_agents.vpg.agent import VPG_Agent
from rl_agents.ppo.agent import PPO_Agent
from rl_agents.policies.categorical import CategoricalActor
from rl_agents.policies.gaussian import GaussianActor
from rl_agents.common import Critic
from rl_agents.trainer.sensei import Sensei

from gym.spaces import Box, Discrete

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%load_ext tensorboard.notebook

tf.random.set_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard.notebook extension is already loaded. To reload it, use:
  %reload_ext tensorboard.notebook


## Create GYM environment
Use MountainCar-v0 for now. Pendulum-v0 and the other environments mentioned in the cell below can be swapped in instead.

In [60]:
# Environment factory. Other ids that have been tried here:
# 'MountainCarContinuous-v0', 'Pendulum-v0', 'CartPole-v0'.
def env_fn():
    """Create a new MountainCar-v0 environment instance."""
    return gym.make('MountainCar-v0')


env = env_fn()

# A Box action space is treated as continuous control, anything else as discrete.
is_continuous = isinstance(env.action_space, Box)
obs_dim = env.observation_space.shape

# Continuous: the action space's shape tuple; discrete: its `n` attribute.
if is_continuous:
    act_dim = env.action_space.shape
else:
    act_dim = env.action_space.n

I0916 19:54:36.747818 4653385152 registration.py:117] Making new env: MountainCar-v0
[2019-09-16 19:54:36,747] Making new env: MountainCar-v0
  result = entry_point.load(False)


# Proximal Policy Optimization

## Initialization

In [24]:
# Choose the policy class that matches the action space type.
if is_continuous:
    actor_ppo = GaussianActor(obs_dim, act_dim)
else:
    actor_ppo = CategoricalActor(obs_dim, act_dim)

critic_ppo = Critic(obs_dim)
jen_ppo = PPO_Agent(actor_ppo, critic_ppo, is_continuous, act_dim)

# Stand-alone rollout generator, used further down to inspect a single rollout by hand.
generator_ppo = rollouts_generator(jen_ppo, env, is_continuous, horizon=2048)

# Hyper-parameters forwarded to the trainer.
alg_name, num_ite = "PPO", 200
lam, gamma = 0.95, 0.99
epochs_actor, epochs_critic = 20, 40

# The trainer receives the environment factory (env_fn), not the `env` instance above.
sensei_ppo = Sensei(
    jen_ppo,
    alg_name,
    env_fn,
    ite=num_ite,
    horizon=2048,
    epochs_actor=epochs_actor,
    epochs_critic=epochs_critic,
    gamma=gamma,
    gae_lambda=lam,
    log_dir='logs',
)

I0916 18:49:11.008467 4653385152 registration.py:117] Making new env: CartPole-v0
[2019-09-16 18:49:11,008] Making new env: CartPole-v0
I0916 18:49:11.015156 4653385152 registration.py:117] Making new env: CartPole-v0
[2019-09-16 18:49:11,015] Making new env: CartPole-v0


In [25]:
# Start PPO training with batch_size=256; the iteration count was already given to Sensei via `ite`.
sensei_ppo.train(batch_size=256)

I0916 18:49:11.593913 4653385152 registration.py:117] Making new env: CartPole-v0
[2019-09-16 18:49:11,593] Making new env: CartPole-v0


# Vanilla Policy Gradient

## Initialization

In [62]:
# Choose the policy class that matches the action space type.
if is_continuous:
    actor_vpg = GaussianActor(obs_dim, act_dim)
else:
    actor_vpg = CategoricalActor(obs_dim, act_dim)

critic_vpg = Critic(obs_dim)
jen_vpg = VPG_Agent(actor_vpg, critic_vpg, is_continuous, act_dim)

# Stand-alone rollout generator for manual inspection of VPG rollouts.
generator_vpg = rollouts_generator(jen_vpg, env, is_continuous, horizon=2048)

# Hyper-parameters forwarded to the trainer (a single actor epoch for VPG).
alg_name = "VPG"
lam, gamma = 0.95, 0.99
epochs_actor, epochs_critic = 1, 40

# No `ite` here: the iteration count is passed to train() in the training cells.
sensei_vpg = Sensei(
    jen_vpg,
    alg_name,
    env_fn,
    horizon=2048,
    epochs_actor=epochs_actor,
    epochs_critic=epochs_critic,
    gamma=gamma,
    gae_lambda=lam,
    log_dir='logs',
)

I0916 19:54:52.931503 4653385152 registration.py:117] Making new env: MountainCar-v0
[2019-09-16 19:54:52,931] Making new env: MountainCar-v0
I0916 19:54:52.934460 4653385152 registration.py:117] Making new env: MountainCar-v0
[2019-09-16 19:54:52,934] Making new env: MountainCar-v0


# Training loop

In [21]:
# Full VPG run: the iteration count is passed to train() positionally, with batch_size=256.
num_ite = 200
sensei_vpg.train(num_ite, batch_size=256)

I0916 12:07:31.575674 4653385152 registration.py:117] Making new env: CartPole-v0
[2019-09-16 12:07:31,575] Making new env: CartPole-v0


In [69]:
# Single-iteration VPG run with record=False; the stored output below is made of tensor-shape debug prints.
# Note: this rebinds the global `num_ite` to 1 for any cell executed afterwards.
num_ite = 1
sensei_vpg.train(num_ite, record=False, batch_size=256)

I0916 19:58:24.545778 4653385152 registration.py:117] Making new env: MountainCar-v0
[2019-09-16 19:58:24,545] Making new env: MountainCar-v0


+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ actor
TensorShape([256, 2])
TensorShape([256, 3])
TensorShape([256])
+++ actor
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])

+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256]

TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ criti

predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorShape([256, 2])
TensorShape([256])
+++ critic
predicted TensorShape([256, 1])
+++ critic
TensorS

In [29]:
def discount_cumsum(x, discount):
    """
    Discounted cumulative sum along axis 0 (filter trick borrowed from rllab).

    Each output entry is the entry of ``x`` at that position plus the
    discounted sum of everything after it.

    input:
        vector x,
        [x0,
         x1,
         x2]
    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]
    """
    # Coefficients of the recurrence y[n] = x[n] + discount * y[n-1].
    numerator = [1]
    denominator = [1, float(-discount)]
    # Run the recurrence over the reversed input so the sum accumulates from the end.
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter(numerator, denominator, backwards, axis=0)
    # Flip back to the original ordering.
    return accumulated[::-1]

In [27]:
# Pull one rollout from the PPO generator; use the built-in next() rather than calling __next__ directly.
rollout = next(generator_ppo)

In [57]:
# Reference advantages from the library helper; `td` is not used by any later cell shown here.
# NOTE(review): 0.95 and 0.99 are passed positionally - presumably (gae_lambda, gamma), matching
# `lam` / `gamma` above; confirm against get_gaeadv_vtarg's signature.
gae, td = get_gaeadv_vtarg(rollout, 0.95, 0.99)

In [58]:
# Recompute GAE by hand on the first 198 steps, to compare with `gae` from get_gaeadv_vtarg.
rews, vals = rollout["rew"][:198], rollout["vpred"][:198]
# NOTE(review): rollout["vpred"] is sliced like an array above, so this appends all of it to the
# rewards, while the values get rollout["next_vpred"]. Only rews[:-1] is used below, so a single
# bootstrap entry looks intended - confirm the shapes and whether both appends should use the same value.
rews = np.append(rews, rollout["vpred"])
vals = np.append(vals, rollout["next_vpred"])
# One-step TD residuals with discount 0.99: r_t + 0.99 * V(s_{t+1}) - V(s_t).
deltas = rews[:-1] + 0.99 * vals[1:] - vals[:-1]
# Discounted cumulative sum of the residuals with factor 0.99 * 0.95.
gae2 = discount_cumsum(deltas, 0.99*0.95)

In [59]:
# Largest absolute gap between the two advantage estimates. A plain sum of the signed
# differences can come out as zero when positive and negative errors cancel.
np.max(np.abs(gae2 - gae))

0.0

In [56]:
# Display the hand-computed advantages (prints the whole array).
gae2

array([6.27157032, 6.27156697, 6.27156577, 6.27156541, 6.27156077,
       6.27155996, 6.27155515, 6.27155386, 6.27154881, 6.27154703,
       6.27154167, 6.27153934, 6.2715336 , 6.27153068, 6.27152448,
       6.27152092, 6.27151418, 6.27150991, 6.27150254, 6.27149748,
       6.27148938, 6.27148343, 6.27147451, 6.27146756, 6.2714577 ,
       6.27144963, 6.27143869, 6.27142936, 6.27142038, 6.2714066 ,
       6.27139307, 6.27138076, 6.27136853, 6.27135168, 6.27133477,
       6.27131871, 6.27129974, 6.27128143, 6.27126013, 6.27123928,
       6.27121534, 6.27119032, 6.27116464, 6.27113767, 6.2711096 ,
       6.27107681, 6.27104478, 6.27100802, 6.27097156, 6.27093381,
       6.27088897, 6.27084588, 6.27079568, 6.27074348, 6.27068851,
       6.2706312 , 6.27057073, 6.27050433, 6.2704358 , 6.2703609 ,
       6.27028328, 6.27019876, 6.27011087, 6.27001545, 6.26991596,
       6.26980823, 6.26969416, 6.2695739 , 6.26944647, 6.26930904,
       6.26916485, 6.26900962, 6.26884503, 6.26867105, 6.26848

In [48]:
# Display the advantages returned by get_gaeadv_vtarg.
gae

array([6.27121999, 6.27119448, 6.27116972, ..., 1.05419485, 0.72411779,
       0.37316197])

In [39]:
# Element-wise difference between the library estimate and the hand-computed one.
gae-gae2

array([-0.00038363, -0.0004079 , -0.0004337 , ...,  0.        ,
        0.        ,  0.        ])

In [43]:
# First 198 entries of the rollout's "new" array - presumably episode-start flags; verify against rollouts_generator.
rollout["new"][:198]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

In [67]:
# Check the action dimension derived in the environment setup cell.
act_dim

3