In [4]:
import tensorflow as tf
import gym
import numpy as np
import datetime

# from rl_agents.env_utils import rollouts_generator, get_adv_vtarg, get_gaeadv_vtarg
from rl_agents.vpg.agent import VPG_Agent
from rl_agents.ppo.agent import PPO_Agent
from rl_agents.training.buffers import GAE_Buffer
from rl_agents.training.sensei import Sensei, ExperimentRunner
from rl_agents.utils import get_actor_critic, simple_run, Logger

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

tf.random.set_seed(0)
tf.keras.backend.set_floatx('float32')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create Gym environment
Use `Pendulum-v0` for now.

In [8]:
# Environment factory. To try another task, swap the id below for one of:
# 'MountainCarContinuous-v0', 'MountainCar-v0', 'CartPole-v0',
# 'LunarLanderContinuous-v2', 'Acrobot-v1'.
def env_fn():
    """Return a freshly constructed instance of the notebook's environment."""
    return gym.make('Pendulum-v0')


env = env_fn()
obs_dim = env.observation_space.shape
# Falls back to `.n` whenever the action space's `shape` is falsy (e.g. empty).
act_dim = env.action_space.shape or env.action_space.n

In [12]:
# Hyperparameters for the VPG run, grouped in one place.
gamma = 0.99
lam = 0.95
buffer_size = 4096
epochs_actor = 1
epochs_critic = 80

# A single logger, named after the environment id, is handed to both the
# agent and the trainer below.
logger = Logger(env.unwrapped.spec.id)

buff_vpg = GAE_Buffer(obs_dim, act_dim, buffer_size, gamma=gamma, lam=lam)
actor_vpg, critic_vpg = get_actor_critic(env)
jen_vpg = VPG_Agent(actor_vpg, critic_vpg, logger=logger)

sensei_vpg = Sensei(
    jen_vpg, env_fn, buff_vpg,
    epochs_actor=epochs_actor,
    epochs_critic=epochs_critic,
    logger=logger,
)

In [13]:
# Train the VPG setup for `num_ite` iterations; `batch_size` is forwarded to
# Sensei.train unchanged.
num_ite = 200
sensei_vpg.train(num_ite, batch_size=128)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

-329.29993
-384.77612
-431.63824
-472.97665
-524.8154
-491.02197
-548.6677
-613.15674
-562.34515
-516.8168
-547.1013
-527.65643
-536.3221
-522.0013
-646.14966
-533.3036
-646.97394
-522.60297
-569.0261
-463.31598
-496.78296
-520.1948
-529.9783
-563.0763
-512.5211
-577.4538
-624.0139
-628.26776
-553.6127
-522.5342
-507.23932
-499.2244
-491.99146
-511.5661
-497.34113
-484.82297
-496.7782
-486.9269


In [4]:
# Hyperparameters are defined before first use so the buffer and the trainer
# read the same values (previously the buffer duplicated them as literals and
# `gamma` / `lam` were only assigned afterwards).
lam = 0.95
gamma = 0.99
epochs_actor = 10
epochs_critic = 80

buff_ppo = GAE_Buffer(obs_dim, act_dim, 2048, gamma=gamma, lam=lam)
actor_ppo, critic_ppo = get_actor_critic(env)
# NOTE(review): the VPG setup cell builds its agent and Sensei with
# `logger=...`, whereas this cell passes `act_dim` positionally and
# `gamma` / `gae_lambda` / `log_dir` keywords — confirm this still matches the
# current PPO_Agent / Sensei signatures.
jen_ppo = PPO_Agent(actor_ppo, critic_ppo, act_dim)

sensei_ppo = Sensei(jen_ppo, env_fn, buff_ppo,
                    epochs_actor=epochs_actor, epochs_critic=epochs_critic,
                    gamma=gamma, gae_lambda=lam,
                    log_dir='logs')

In [5]:
# Train the PPO setup for `num_ite` iterations; `batch_size` is forwarded to
# Sensei.train unchanged.
num_ite = 200
sensei_ppo.train(num_ite, batch_size=128)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

0 20.63917525773196
1 9.538812785388128
2 9.460829493087557
3 9.34703196347032
4 9.290909090909091
5 9.331818181818182
6 9.333333333333334
7 9.313636363636364
8 9.351598173515981
9 9.337899543378995
10 9.378995433789955
11 9.444444444444445
12 9.285067873303168
13 9.371559633027523
14 25.65
15 21.49473684210526
16 15.694656488549619
17 16.983193277310924
18 15.930232558139535
19 14.1655172413793

# Testing buffer

In [None]:
# Scratch: build an Adam optimizer with 3e-3 as its first argument.
a = tf.keras.optimizers.Adam(3e-3)

In [None]:
# Scratch: display the object currently bound to `a`.
a

In [None]:
# Small GAE buffer (third argument 20) used only by the buffer tests below.
buff = GAE_Buffer(obs_dim, act_dim, 20, gamma=0.99, lam=0.95)

In [None]:
# Separate actor/critic pair for the buffer tests, built from the current `env`.
actor_test, critic_test = get_actor_critic(env)

In [None]:
# Fixed typo: the critic is bound to `critic_test`; the original `critic_tes`
# is never defined and raised NameError.
# NOTE(review): `act_dim` is passed as the third positional argument here,
# while the VPG setup cell passes `logger=...` instead — confirm against the
# current VPG_Agent signature.
test_vpg = VPG_Agent(actor_test, critic_test, act_dim)

In [None]:
# Wrap the test agent, the shared `env` and the small buffer in a runner, then
# set its `num_ite` attribute to 1.
test_runner = ExperimentRunner(test_vpg, env, buff)
test_runner.num_ite = 1

In [None]:
# Iterate over the runner and print the index of each item it yields.
# Note: this rebinds the notebook-global `rollout` on every iteration.
for i, rollout in enumerate(test_runner):
    print(i)

In [None]:
# Quick smoke run of a freshly built VPG agent via simple_run.
# Note: this cell rebinds the notebook-global `env`, `obs_dim`, `act_dim`,
# `actor_vpg`, `critic_vpg` and `jen_vpg` set by earlier cells.
env = gym.make('Pendulum-v0')
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape or env.action_space.n

actor_vpg, critic_vpg = get_actor_critic(env)
# NOTE(review): `act_dim` is passed positionally here, whereas the VPG setup
# cell passes `logger=...` — confirm against the current VPG_Agent signature.
jen_vpg = VPG_Agent(actor_vpg, critic_vpg, act_dim)

simple_run(env, jen_vpg)

In [None]:
# Hyperparameters are defined before first use so the buffer and the trainer
# read the same values (previously the buffer duplicated them as literals and
# `gamma` / `lam` were only assigned afterwards).
# NOTE(review): this cell repeats the PPO setup from an earlier cell; consider
# keeping only one of them.
lam = 0.95
gamma = 0.99
epochs_actor = 10
epochs_critic = 80

buff_ppo = GAE_Buffer(obs_dim, act_dim, 2048, gamma=gamma, lam=lam)
actor_ppo, critic_ppo = get_actor_critic(env)
jen_ppo = PPO_Agent(actor_ppo, critic_ppo, act_dim)

sensei_ppo = Sensei(jen_ppo, env_fn, buff_ppo,
                    epochs_actor=epochs_actor, epochs_critic=epochs_critic,
                    gamma=gamma, gae_lambda=lam,
                    log_dir='logs')

In [None]:
# Train the PPO setup for `num_ite` iterations, this time with batch_size=256.
num_ite = 200
sensei_ppo.train(num_ite, batch_size=256)

# Vanilla Policy Gradient

## Initialization

In [None]:
# NOTE(review): this cell looks stale. `GaussianActor`, `CategoricalActor`,
# `Critic` and `is_continuous` are not defined or imported in any cell shown
# in this notebook, and the import of `rollouts_generator` is commented out in
# the import cell — on a fresh kernel this cell is expected to raise NameError.
actor_vpg = GaussianActor(obs_dim, act_dim) if is_continuous else CategoricalActor(obs_dim, act_dim)
critic_vpg = Critic(obs_dim)
jen_vpg = VPG_Agent(actor_vpg, critic_vpg, is_continuous, act_dim)
generator_vpg = rollouts_generator(jen_vpg, env, is_continuous, horizon=2048)

alg_name = "VPG"
lam = 0.95
gamma = 0.99
epochs_actor = 1
epochs_critic = 40
# NOTE(review): Sensei is called here as (agent, alg_name, env_fn, horizon=...),
# unlike the earlier setup cells, which call it as (agent, env_fn, buffer, ...) —
# confirm which call shape is current.
sensei_vpg = Sensei(jen_vpg, alg_name, env_fn,
                    horizon=2048, epochs_actor=epochs_actor, epochs_critic=epochs_critic,
                    gamma=gamma, gae_lambda=lam,
                    log_dir='logs')

# Training loop

In [None]:
# Train whichever `sensei_vpg` is currently bound for 100 iterations.
num_ite = 100
sensei_vpg.train(num_ite, batch_size=256)

In [None]:
# Continue training for 50 more iterations, passing record=False.
num_ite = 50
sensei_vpg.train(num_ite, record=False, batch_size=256)

In [None]:
def discount_cumsum(x, discount):
    """
    Compute discounted cumulative sums of a vector (filter trick from rllab).

    input:
        vector x,
        [x0,
         x1,
         x2]
    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]

    Args:
        x: sequence or array supporting ``x[::-1]``; summed along axis 0.
        discount: scalar discount factor.

    Returns:
        Array whose element ``t`` is ``sum_k discount**k * x[t + k]``.
    """
    # Imported locally because no cell in this notebook imports scipy; without
    # it the call below raises NameError on a fresh kernel.
    import scipy.signal

    # Running the recurrence y[n] = x[n] + discount * y[n-1] over the reversed
    # input and reversing the result gives the discounted sums-to-go.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

In [None]:
# Pull the next rollout from the PPO generator.
# NOTE(review): `generator_ppo` is not defined in any cell shown in this
# notebook (only `generator_vpg` is) — confirm where it comes from.
rollout = next(generator_ppo)

In [None]:
# Reference advantages / targets from the library helper.
# NOTE(review): the import of `get_gaeadv_vtarg` is commented out in the import
# cell, and the meaning/order of the 0.95 and 0.99 arguments is not visible
# here (presumably lambda and gamma) — confirm.
gae, td = get_gaeadv_vtarg(rollout, 0.95, 0.99)

In [None]:
# Recompute the advantages by hand over the first 198 entries of the rollout,
# to compare against `gae` from get_gaeadv_vtarg.
# NOTE(review): 198 is hard-coded — presumably the length of the first episode
# in this particular rollout; confirm.
rews, vals = rollout["rew"][:198], rollout["vpred"][:198]
# NOTE(review): the rewards are extended with rollout["vpred"] (unsliced) while
# the values are extended with rollout["next_vpred"]. If one bootstrap value
# was meant to be appended to both, this line should probably use
# rollout["next_vpred"] — confirm against the rollout schema.
rews = np.append(rews, rollout["vpred"])
vals = np.append(vals, rollout["next_vpred"])
# One-step residuals: rew[t] + 0.99 * val[t+1] - val[t].
deltas = rews[:-1] + 0.99 * vals[1:] - vals[:-1]
# Discounted cumulative sum of the residuals with factor 0.99 * 0.95.
gae2 = discount_cumsum(deltas, 0.99*0.95)

In [None]:
# Total absolute deviation between the hand-computed and library advantages.
# A signed sum lets positive and negative errors cancel and can read ~0 even
# when the arrays differ; the absolute sum is 0 only if they match.
np.sum(np.abs(gae2 - gae))

In [None]:
# Display the hand-computed advantages.
gae2

In [None]:
# Display the advantages returned by get_gaeadv_vtarg.
gae

In [None]:
# Element-wise difference between the two advantage estimates.
gae-gae2

In [None]:
# First 198 entries of the rollout's "new" field
# (presumably episode-start flags — confirm against the rollout schema).
rollout["new"][:198]

In [None]:
# Display the current value of the notebook-global `act_dim`.
act_dim

In [None]:
# Inspect the agent's `actor_step` attribute.
jen_vpg.actor_step

In [None]:
# Draw one sample from the environment's action space to inspect its form.
env.action_space.sample()

In [None]:
# Scratch: slicing with [:1] keeps the result an array (here holding only the
# first element) rather than returning a scalar.
b = np.array([1,2])
b[:1]

In [None]:
# Scratch: [-1:] likewise keeps only the last element, still as an array.
b[-1:]

In [None]:
# Scratch check of per-dimension clipping: row i of `limits` holds the
# [low, high] bounds applied to element i of `ac`.
limits = np.array([
    [-5, 5],
    [-10, 10],
])
ac = np.array([2, -19])

print(limits[0])
ac.clip(limits[:, 0], limits[:, 1])