In [2]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.vpg.agent import VPG_Agent
from rl_agents.ppo.agent import PPO_Agent
from rl_agents.policies.categorical import CategoricalActor
from rl_agents.policies.gaussian import GaussianActor
from rl_agents.common import Critic
from rl_agents.trainer.sensei import Sensei

from gym.spaces import Box, Discrete

# Auto-reload edited external modules so library changes are picked up without restarting the kernel
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Enables the %tensorboard magic used further down in the notebook.
%load_ext tensorboard.notebook

# Seed TensorFlow's global random generator.
# NOTE(review): only TensorFlow is seeded here; NumPy and the gym environment are not — confirm whether that is intended.
tf.random.set_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard.notebook extension is already loaded. To reload it, use:
  %reload_ext tensorboard.notebook


## Create GYM environment
Use MountainCarContinuous-v0 for now

In [4]:
def env_fn():
    """Build a fresh MountainCarContinuous-v0 environment (also handed to the Sensei trainer)."""
    return gym.make('MountainCarContinuous-v0')


# Other environment ids left as commented-out alternatives: 'Pendulum-v0', 'CartPole-v0'.
env = env_fn()

# Observation shape comes straight from the observation space.
obs_dim = env.observation_space.shape

# A Box action space is treated as continuous; anything else as discrete.
is_continuous = isinstance(env.action_space, Box)
if is_continuous:
    act_dim = env.action_space.shape
else:
    act_dim = env.action_space.n

I0912 20:47:39.411883 4658337216 registration.py:117] Making new env: MountainCarContinuous-v0
[2019-09-12 20:47:39,411] Making new env: MountainCarContinuous-v0
  result = entry_point.load(False)


# Vanilla Policy Gradient

## Initialization

In [None]:
# Pick the policy head that matches the action space type.
if is_continuous:
    actor_vpg = GaussianActor(obs_dim, act_dim)
else:
    actor_vpg = CategoricalActor(obs_dim, act_dim)

# Critic, agent and rollout generator for Vanilla Policy Gradient.
critic_vpg = Critic(obs_dim)
jen_vpg = VPG_Agent(actor_vpg, critic_vpg, is_continuous, act_dim)
generator_vpg = rollouts_generator(jen_vpg, env, is_continuous, horizon=2048)

## Training loop

In [None]:
# Run configuration for the VPG experiment.
alg_name = "VPG"
num_ite, lam, gamma = 200, 0.95, 0.99

# Each run logs to logs/<env id>_<algorithm>/<timestamp>.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/{}_{}/'.format(env.unwrapped.spec.id, alg_name) + current_time
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

for i in range(num_ite):
    # Collect one rollout and compute advantages / value targets from it.
    rollout = next(generator_vpg)
    adv, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    # Standardise advantages; the epsilon guards against a zero standard deviation.
    adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    jen_vpg.run_ite(
        rollout['ob'],
        rollout['ac'],
        rollout['log_probs'],
        target_value,
        adv,
        batch_size=512,
    )

    # Log the mean of the episode returns recorded in this rollout.
    with train_summary_writer.as_default():
        tf.summary.scalar('reward mean', np.mean(rollout["ep_rets"]), step=i)

    # Checkpoint every 50 iterations and on the final one.
    if i == num_ite - 1 or i % 50 == 0:
        actor_vpg.save_weights(train_log_dir + '/_actor_' + str(i), save_format='tf')
        critic_vpg.save_weights(train_log_dir + '/_critic_' + str(i), save_format='tf')

# Proximal Policy Optimization

## Initialization

In [9]:
# Build the PPO actor/critic pair, the agent wrapping them, and its rollout generator.
if is_continuous:
    actor_ppo = GaussianActor(obs_dim, act_dim)
else:
    actor_ppo = CategoricalActor(obs_dim, act_dim)
critic_ppo = Critic(obs_dim)
jen_ppo = PPO_Agent(actor_ppo, critic_ppo, is_continuous, act_dim)
generator_ppo = rollouts_generator(jen_ppo, env, is_continuous, horizon=2048)

# Settings passed on to the Sensei trainer.
alg_name, num_ite, epochs = "PPO", 200, 10
lam, gamma = 0.95, 0.99
sensei = Sensei(
    jen_ppo,
    alg_name,
    env_fn,
    ite=num_ite,
    horizon=2048,
    epochs=epochs,
    gamma=gamma,
    gae_lambda=lam,
    log_dir='logs',
)

I0912 20:49:54.215289 4658337216 registration.py:117] Making new env: MountainCarContinuous-v0
[2019-09-12 20:49:54,215] Making new env: MountainCarContinuous-v0


In [14]:
# Call `train` on the Sensei instance built in the previous cell.
# NOTE(review): the saved output of this cell ends with
# `NameError: name 'new_log_prob_n' is not defined`, so the call did not complete when the
# notebook was last run — presumably raised inside the training code; confirm and fix there.
sensei.train(batch_size=256)

I0912 20:51:34.101667 4658337216 registration.py:117] Making new env: MountainCarContinuous-v0
[2019-09-12 20:51:34,101] Making new env: MountainCarContinuous-v0
  result = entry_point.load(False)


NameError: name 'new_log_prob_n' is not defined

In [16]:
# Run configuration for the manual PPO loop.
alg_name = "PPO"
num_ite, epochs = 200, 10
lam, gamma = 0.95, 0.99

# Each run logs to logs/<env id>_<algorithm>/<timestamp>.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/{}_{}/'.format(env.unwrapped.spec.id, alg_name) + current_time
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

for i in range(num_ite):
    # Collect one rollout and compute advantages / value targets from it.
    rollout = next(generator_ppo)
    adv, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    # Standardise advantages; the epsilon guards against a zero standard deviation.
    adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    jen_ppo.run_ite(
        rollout['ob'],
        rollout['ac'],
        rollout['log_probs'],
        target_value,
        adv,
        epochs=epochs,
        batch_size=256,
    )

    # Log the mean of the episode returns recorded in this rollout.
    with train_summary_writer.as_default():
        tf.summary.scalar('reward mean', np.mean(rollout["ep_rets"]), step=i)

    # Checkpoint every 50 iterations and on the final one.
    if i == num_ite - 1 or i % 50 == 0:
        actor_ppo.save_weights(train_log_dir + '/_actor_' + str(i), save_format='tf')
        critic_ppo.save_weights(train_log_dir + '/_critic_' + str(i), save_format='tf')

In [None]:
%tensorboard --logdir logs/gradient_tape --port=8003

In [None]:
# Stand-alone PPO run on a fresh actor/critic pair, without TensorBoard logging or checkpoints.
# The constructor, generator and run_ite calls follow the signatures used by the other PPO cells
# of this notebook; the earlier `Actor(...)` form referenced a name that is never imported here.
# NOTE(review): batch_size=256 mirrors the other PPO loop — adjust if a different value is wanted.
actor2 = GaussianActor(obs_dim, act_dim) if is_continuous else CategoricalActor(obs_dim, act_dim)
critic2 = Critic(obs_dim)
vero2 = PPO_Agent(actor2, critic2, is_continuous, act_dim)
generator2 = rollouts_generator(vero2, env, is_continuous, horizon=2048)

num_ite = 200
lam = 0.95
gamma = 0.99
num_epochs = 10

for i in range(num_ite):
    print('#### iteration ###', i)
    rollout = next(generator2)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero2.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                  epochs=num_epochs, batch_size=256)

In [None]:
# Draw a single short rollout (horizon=210) so its contents can be inspected in the following cells.
# NOTE(review): the original referenced an undefined `jen`; `jen_ppo` is assumed here —
# swap in `jen_vpg` if that was the intended agent.
generator = rollouts_generator(jen_ppo, env, is_continuous, horizon=210)

roll = next(generator)

# Advantages and value targets for the inspected rollout.
adv, tar = get_adv_vtarg(roll, lam=0.95, gamma=0.99)

In [None]:
# List the entries available in the rollout returned by the generator.
roll.keys()

In [None]:
# Show the 'rew' entry of the rollout (presumably per-step rewards — confirm).
roll['rew']

In [None]:
# Show the 'new' entry of the rollout (presumably episode-start flags — confirm).
roll['new']

In [9]:
# Show the 'log_probs' entry of the last rollout assigned to `rollout`;
# the saved output is an array with one value per row.
rollout['log_probs']

array([[-1.88714555],
       [-0.97320608],
       [-1.08957951],
       ...,
       [-0.95047626],
       [-1.2021399 ],
       [-0.94937365]])

In [None]:
# Show the environment id string that the training cells use to build the log directory name.
env.unwrapped.spec.id

In [None]:
# Show the `name` attribute of the VPG agent.
jen_vpg.name