In [1]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%load_ext tensorboard.notebook

tf.random.set_seed(0)

## Create the Gym environment
Use `Pendulum-v0` for now.

In [2]:
# Instantiate the task, then cache the space metadata that the networks are built from.
env = gym.make('Pendulum-v0')
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape
# True exactly when the action space is a gym.spaces.Box.
is_continuous = isinstance(env.action_space, gym.spaces.Box)

I0805 15:39:25.075204 4562228672 registration.py:117] Making new env: Pendulum-v0
[2019-08-05 15:39:25,075] Making new env: Pendulum-v0
  result = entry_point.load(False)


In [None]:
# Instantiate the Actor and Critic from rl_agents.ppo.policy and hand both to PPO_Agent.
# The statement order is left untouched: with a global TF seed set, the order in which
# the networks are created may influence their initial weights.
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
vero = PPO_Agent(actor, critic)
# Each item pulled from this generator is used below as a dict-like rollout
# (keys read later: 'ob', 'ac', 'log_probs', 'ep_rets').
# NOTE(review): `horizon` is presumably the number of env steps per rollout — confirm
# in rl_agents.env_utils.
generator = rollouts_generator(vero, env, horizon=2048)

# Training loop

In [None]:
# Hyper-parameters read by the training-loop cell:
#   num_ite    - number of loop iterations (one rollout + one run_ite call each)
#   num_epochs - forwarded to run_ite as `epochs`
#   gamma, lam - forwarded to get_adv_vtarg
num_ite, num_epochs = 200, 10
gamma, lam = 0.99, 0.95

# Every run writes its summaries into its own timestamped directory under
# logs/gradient_tape, so separate runs do not overwrite each other.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = ''.join(('logs/gradient_tape/', current_time, '/train'))
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [None]:
# Main training loop: one rollout, one agent update and one logged scalar per iteration.
for i in range(num_ite):
    # Use the built-in next() rather than calling the __next__ dunder directly.
    rollout = next(generator)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                 epochs=num_epochs)

    # Log the mean of the episode returns collected in this rollout. The step is scaled
    # by num_epochs, so consecutive points are num_epochs apart on the TensorBoard x-axis.
    with train_summary_writer.as_default():
        tf.summary.scalar('reward mean', np.array(rollout["ep_rets"]).mean(), step=i*num_epochs)

    # Save weights every 50 iterations and once more on the final iteration; the
    # files land next to the summaries inside the run's log directory.
    if i % 50 == 0 or i == num_ite-1:
        actor.save_weights(train_log_dir+'/_actor_'+str(i), save_format='tf')
        critic.save_weights(train_log_dir+'/_critic_'+str(i), save_format='tf')

In [None]:
# Open TensorBoard inline on the parent directory of the timestamped run folders.
%tensorboard --logdir logs/gradient_tape

In [15]:
# Short run with a freshly initialised agent: 2 iterations with epochs=2.
# Nothing is logged or checkpointed here.
actor2 = Actor(obs_dim, act_dim, is_continuous)
critic2 = Critic(obs_dim)
vero2 = PPO_Agent(actor2, critic2)
generator2 = rollouts_generator(vero2, env, horizon=2048)

# NOTE: these assignments rebind the notebook-level hyper-parameters. Re-run the
# hyper-parameter cell before re-running the main training loop, otherwise it will
# pick up num_ite = 2 and num_epochs = 2 from here.
num_ite = 2
lam = 0.95
gamma = 0.99
num_epochs = 2

for i in range(num_ite):
    # Use the built-in next() rather than calling the __next__ dunder directly.
    rollout = next(generator2)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero2.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                  epochs=num_epochs)

<tf.Variable 'std:0' shape=(1, 1) dtype=float64, numpy=array([[0.58860497]])>
0 0
new Tensor("Mean:0", shape=(), dtype=float64)
old Tensor("Mean_1:0", shape=(), dtype=float64)
new Tensor("Mean:0", shape=(), dtype=float64)
old Tensor("Mean_1:0", shape=(), dtype=float64)
diff: -4.0766001685454967e-17
0 1
diff: -0.0004211451810702772
0 2
diff: 0.00081224113848570988
1 0
diff: -0.0063206756430890113
1 1
diff: -0.0026559922109236425
1 2
diff: 0.00014958576504965366
<tf.Variable 'std:0' shape=(1, 1) dtype=float64, numpy=array([[0.58864946]])>
0 0
diff: -8.05096362967335e-05
0 1
diff: 0.00017542572215368396
0 2
diff: -0.00030279099071196542
1 0
diff: -0.00012270078524950333
1 1
diff: 0.00027628637488492404
1 2
diff: -0.00083996432288247023


In [None]:
# Print the log-probability reported by act_stochastic next to the one recomputed
# from the agent's distribution for the same observation, plus the entropy.
# NOTE(review): presumably a consistency check that both log-probs agree — confirm.
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)
# obs[None] prepends an axis of length 1 before the observation is passed on.
# `policy_dist` replaces the former single-letter name `l`, which is easily misread as `1`.
policy_dist = vero.get_distributions(obs[None])
print(policy_dist.log_prob(ac))
print(policy_dist.entropy())

In [None]:
# Draw a random action from the action space and print its dtype.
# Note: this rebinds `ac`, which the previous cell also assigns.
ac = env.action_space.sample()
print(ac.dtype)

In [None]:
# Print NumPy's float64 type — presumably for comparison with the action dtype
# printed by the previous cell.
print(np.float64)

In [None]:
import tensorflow_probability as tfp

In [None]:
# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions


In [None]:
# Normal distribution built from four (loc, scale) pairs supplied as NumPy arrays.
dist = tfd.Normal(loc=np.array([0., 1, 2, 0.5]), scale=np.array([0.02, 0.09, 0.1, 1]))

In [None]:
# Draw a sample from `dist`; as the cell's last expression, the result is displayed.
dist.sample()


In [None]:
# Draw a random observation from the environment's observation space and display it.
env.observation_space.sample()

In [None]:
# Same extension that the first code cell already loads; presumably kept here so the
# cell can be run on its own — NOTE(review): likely redundant, consider removing.
%load_ext tensorboard.notebook

In [None]:
# Manual single update of `vero`: rebuild its rollout generator, collect one rollout,
# compute advantages/targets and run the agent update with epochs=5.
# Note: this rebinds `generator`, replacing the one created next to `vero`.
generator = rollouts_generator(vero, env, horizon=2048)

# Use the built-in next() rather than calling the __next__ dunder directly.
rollout = next(generator)
advantage, target_value = get_adv_vtarg(rollout, lam=0.95, gamma=0.99)
# NOTE(review): every other cell calls `run_ite` with these same arguments; confirm
# that PPO_Agent still exposes `run_epoch`, otherwise this cell is stale.
vero.run_epoch(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage, epochs=5)

In [None]:
# Display the observation-space shape captured when the environment was created.
obs_dim

In [5]:
# Inspect one trainable variable of the second actor. In the recorded output, index 6
# is the variable named 'std'.
# NOTE(review): the position depends on how Actor builds its layers — confirm the
# index if the architecture changes.
actor2.trainable_variables[6]

<tf.Variable 'std:0' shape=(1, 1) dtype=float64, numpy=array([[0.59058199]])>

In [None]:
# exp(-0.53) is about 0.5886, the same as the first `std` value printed in the
# recorded output of the short run (0.58860497).
# NOTE(review): presumably checks that std is initialised as exp(-0.53) — confirm
# in Actor.
np.exp(-0.53)