In [1]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime
import scipy.signal

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg, get_gaeadv_vtarg
from rl_agents.vpg.agent import VPG_Agent
from rl_agents.ppo.agent import PPO_Agent
from rl_agents.policies.categorical import CategoricalActor
from rl_agents.policies.gaussian import GaussianActor
from rl_agents.common import Critic
from rl_agents.trainer.sensei import Sensei

from gym.spaces import Box, Discrete

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

tf.random.set_seed(0)
tf.keras.backend.set_floatx('float64')

## Create Gym environment
Use CartPole-v0 for now (alternative environment ids are listed in comments in the cell below).

In [2]:
# Environment factory. Other ids that were tried and can be swapped in:
#   'MountainCarContinuous-v0', 'Pendulum-v0', 'MountainCar-v0'
def env_fn():
    """Build a fresh CartPole-v0 environment."""
    return gym.make('CartPole-v0')


env = env_fn()

# A Box action space is treated as continuous, anything else as discrete.
is_continuous = isinstance(env.action_space, gym.spaces.Box)
obs_dim = env.observation_space.shape

# Continuous spaces contribute their shape, discrete ones their action count.
if is_continuous:
    act_dim = env.action_space.shape
else:
    act_dim = env.action_space.n

# Proximal Policy Optimization

## Initialization

In [None]:
# Hyperparameters for the PPO run.
alg_name = "PPO"
num_ite = 200
lam = 0.95
gamma = 0.99
epochs_actor = 20
epochs_critic = 40

# Pick the actor class that matches the action space type.
if is_continuous:
    actor_ppo = GaussianActor(obs_dim, act_dim)
else:
    actor_ppo = CategoricalActor(obs_dim, act_dim)
critic_ppo = Critic(obs_dim)
jen_ppo = PPO_Agent(actor_ppo, critic_ppo, is_continuous, act_dim)

# Stand-alone rollout generator, used by the GAE inspection cells further down.
generator_ppo = rollouts_generator(jen_ppo, env, is_continuous, horizon=2048)

# Trainer wrapping the agent; here the iteration count is passed at construction.
sensei_ppo = Sensei(
    jen_ppo,
    alg_name,
    env_fn,
    ite=num_ite,
    horizon=2048,
    epochs_actor=epochs_actor,
    epochs_critic=epochs_critic,
    gamma=gamma,
    gae_lambda=lam,
    log_dir='logs',
)

In [None]:
# Run PPO training; the iteration count was already given to Sensei via ite=num_ite.
# NOTE(review): batch_size is presumably the minibatch size — confirm in Sensei.train.
sensei_ppo.train(batch_size=256)

# Vanilla Policy Gradient

## Initialization

In [3]:
# Hyperparameters for the VPG run (these reuse the global names set by the PPO cell).
alg_name = "VPG"
lam = 0.95
gamma = 0.99
epochs_actor = 1
epochs_critic = 40

# Pick the actor class that matches the action space type.
if is_continuous:
    actor_vpg = GaussianActor(obs_dim, act_dim)
else:
    actor_vpg = CategoricalActor(obs_dim, act_dim)
critic_vpg = Critic(obs_dim)
jen_vpg = VPG_Agent(actor_vpg, critic_vpg, is_continuous, act_dim)
generator_vpg = rollouts_generator(jen_vpg, env, is_continuous, horizon=2048)

# Trainer wrapping the agent; the iteration count is supplied later to train().
sensei_vpg = Sensei(
    jen_vpg,
    alg_name,
    env_fn,
    horizon=2048,
    epochs_actor=epochs_actor,
    epochs_critic=epochs_critic,
    gamma=gamma,
    gae_lambda=lam,
    log_dir='logs',
)

# Training loop

In [None]:
# Train the VPG agent. num_ite is passed positionally to train()
# (presumably the number of training iterations — confirm in Sensei.train).
num_ite = 200
sensei_vpg.train(num_ite, batch_size=256)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [None]:
# Same call as the training cell, but with num_ite = 1 and record=False.
# NOTE(review): what `record` toggles is defined in Sensei.train and not visible here.
num_ite = 1
sensei_vpg.train(num_ite, record=False, batch_size=256)

In [None]:
def discount_cumsum(x, discount):
    """
    Discounted cumulative sum of `x` along axis 0, accumulated from the end.

    Each entry satisfies out[t] = x[t] + discount * out[t + 1], with nothing
    added beyond the last entry. For x = [x0, x1, x2] the result is:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]

    Implemented as a linear filter (a trick the original note credits to
    rllab): reverse the input, apply y[n] = x[n] + discount * y[n - 1],
    then reverse the result back.
    """
    numerator = [1]
    denominator = [1, float(-discount)]
    backwards = x[::-1]
    filtered = scipy.signal.lfilter(numerator, denominator, backwards, axis=0)
    return filtered[::-1]

In [None]:
# Pull one rollout from the PPO rollout generator.
rollout = next(generator_ppo)

In [None]:
# Advantages and value targets as computed by the library helper.
# NOTE(review): the positional arguments are 0.95 then 0.99, while this notebook
# sets lam = 0.95 and gamma = 0.99 — confirm the helper's parameter order is
# (rollout, lam, gamma) and not (rollout, gamma, lam).
gae, td = get_gaeadv_vtarg(rollout, 0.95, 0.99)

In [None]:
# Recompute GAE by hand for the first 198 steps of the rollout, to compare
# against the result of get_gaeadv_vtarg.
rews, vals = rollout["rew"][:198], rollout["vpred"][:198]
# Pad both sequences with the same bootstrap value so that rews[:-1], vals[1:]
# and vals[:-1] all have 198 entries. Appending the full rollout["vpred"] array
# to rews would not work: np.append flattens and appends every element, so
# rews would no longer line up with vals.
# NOTE(review): assumes rollout["next_vpred"] is a scalar bootstrap value for
# the state after the last step — confirm against rollouts_generator, and
# whether it is the right bootstrap for a slice that ends at step 198.
rews = np.append(rews, rollout["next_vpred"])
vals = np.append(vals, rollout["next_vpred"])
# One-step TD residuals with discount 0.99.
deltas = rews[:-1] + 0.99 * vals[1:] - vals[:-1]
# Discounted sum of the residuals with factor 0.99 * 0.95.
gae2 = discount_cumsum(deltas, 0.99*0.95)

In [None]:
# Signed sum of the differences between the hand-computed and library advantages.
# NOTE(review): positive and negative errors can cancel, so a value near zero
# does not prove the arrays match; np.allclose(gae2, gae) would be a stricter check.
np.sum(gae2 - gae)

In [None]:
# Display the hand-computed advantages.
gae2

In [None]:
# Display the advantages returned by get_gaeadv_vtarg.
gae

In [None]:
# Difference between the library advantages and the hand-computed ones.
gae-gae2

In [None]:
# First 198 entries of the rollout's "new" field
# (presumably episode-start flags — confirm against rollouts_generator).
rollout["new"][:198]

In [None]:
# Show the action dimension derived in the environment cell.
act_dim