In [6]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# Notebook configuration: live module reloading, the TensorBoard magic, and the RNG seed.
# Auto-reload external modules so edits to imported project code are picked up
# without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Load the TensorBoard notebook extension (provides the %tensorboard magic used further down).
%load_ext tensorboard.notebook

# Seed TensorFlow's global random generator.
# NOTE(review): NumPy and the Gym environment are not seeded in this cell — confirm whether
# rollouts are expected to be reproducible across kernel restarts.
tf.random.set_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard.notebook extension is already loaded. To reload it, use:
  %reload_ext tensorboard.notebook


## Create the Gym environment
Use `Pendulum-v0` for now.

In [5]:
# Instantiate the Pendulum task and record what the networks need to know about it:
# the observation/action shapes and whether the action space is a gym Box.
env = gym.make('Pendulum-v0')
obs_dim, act_dim = env.observation_space.shape, env.action_space.shape
is_continuous = isinstance(env.action_space, gym.spaces.Box)

I0812 19:58:11.275629 4511135168 registration.py:117] Making new env: Pendulum-v0
[2019-08-12 19:58:11,275] Making new env: Pendulum-v0


In [None]:
# Build the main PPO agent from an Actor and a Critic network.
# The construction order is left untouched: with a global TF seed set, the order in which
# networks are created may influence their initial weights.
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
vero = PPO_Agent(actor, critic)
# Rollout stream for `vero` on `env`; the training loop advances it once per iteration.
# presumably `horizon` is the number of environment steps per rollout — TODO confirm.
generator = rollouts_generator(vero, env, horizon=2048)

# Training loop

In [None]:
# Training hyper-parameters.
num_ite, num_epochs = 200, 10  # loop iterations / `epochs` value handed to each run_ite call
gamma, lam = 0.99, 0.95        # forwarded to get_adv_vtarg as `gamma` and `lam`

# Each run writes its summaries to its own timestamped folder under logs/gradient_tape/.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = ''.join(('logs/gradient_tape/', current_time, '/train'))
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [None]:
# PPO training loop: per iteration, collect one rollout, compute advantages and value
# targets, run the agent update, log the mean episode return, and checkpoint periodically.
# NOTE(review): the debug cell further down passes rollout['locs'] as an extra positional
# argument to run_ite — confirm which signature the current PPO_Agent expects.
for i in range(num_ite):
    # Advance the rollout generator with the built-in next() rather than calling the
    # dunder method directly.
    rollout = next(generator)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                 epochs=num_epochs)

    # Log the mean of the episode returns recorded in this rollout. When no returns were
    # recorded the mean of an empty array would be NaN (plus a RuntimeWarning), so the
    # point is skipped instead of being written to the summary.
    ep_rets = np.asarray(rollout["ep_rets"])
    if ep_rets.size:
        with train_summary_writer.as_default():
            tf.summary.scalar('reward mean', ep_rets.mean(), step=i*num_epochs)

    # Save weights every 50 iterations and once more after the final iteration.
    if i % 50 == 0 or i == num_ite-1:
        actor.save_weights(train_log_dir+'/_actor_'+str(i), save_format='tf')
        critic.save_weights(train_log_dir+'/_critic_'+str(i), save_format='tf')

In [None]:
# Open TensorBoard inline on the parent directory that holds the timestamped run folders.
%tensorboard --logdir logs/gradient_tape

In [19]:
# Short debugging run on a second, freshly constructed agent.
# Note that this cell rebinds the notebook-level num_ite / lam / gamma / num_epochs.
actor2 = Actor(obs_dim, act_dim, is_continuous)
critic2 = Critic(obs_dim)
vero2 = PPO_Agent(actor2, critic2)
generator2 = rollouts_generator(vero2, env, horizon=2048)

num_ite, num_epochs = 3, 2
lam, gamma = 0.95, 0.99

for i in range(num_ite):
    print('#### iteration ###', i)
    rollout = next(generator2)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    # This variant of the update call also forwards the rollout's 'locs' entry.
    vero2.run_ite(
        rollout['ob'], rollout['ac'], rollout['log_probs'], rollout['locs'],
        target_value, advantage,
        epochs=num_epochs,
    )

#### iteration ### 0
2047 2048
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63]
0 0
------ batch ------
observations [[-0.24149321 -0.97040251 -0.22568111]
 [-0.28941437 -0.95720391 -0.99421319]
 [-0.36668206 -0.93034632 -1.63650357]
 [-0.47340652 -0.88084406 -2.35427933]
 [-0.60271278 -0.79795821 -3.07484586]]
locs [[0.26819828]
 [0.26771676]
 [0.23987118]
 [0.19866958]
 [0.1552326 ]]
actions [[-0.27153466]
 [ 0.50408367]
 [-0.13344015]
 [-0.39955656]
 [-0.25649686]]
log_probs [[-0.80935528]
 [-0.46956832]
 [-0.59006282]
 [-0.90541784]
 [-0.633589  ]]
(64, 3)
(64, 1)
(64, 1)
------ batch ------
std [[0.58860496967835518]]
obss [[-0.24149321113391706 -0.97040250874378375 -0.22568110831762733]
 [-0.28941437298626715 -0.95720390759177632 -0.99421318950328974]
 [-0.36668205603034926 -0.93034631712355154 -1.6365035690451957]
 [-0.473406522711623

2047 4096
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63]
0 0
------ batch ------
observations [[-0.15732197 -0.98754737 -5.53715637]
 [-0.45668141 -0.88963031 -6.32566103]
 [-0.73297514 -0.68025543 -6.96847913]
 [-0.93008353 -0.36734809 -7.43909818]
 [-0.99995838  0.00912371 -7.70559978]]
locs [[ 0.07789185]
 [-0.01823461]
 [-0.04607956]
 [-0.07306884]
 [-0.10088429]]
actions [[-0.3189609 ]
 [ 0.1626976 ]
 [ 0.26381678]
 [ 0.06006313]
 [-0.06824434]]
log_probs [[-0.61622886]
 [-0.43562046]
 [-0.52708671]
 [-0.41392791]
 [-0.38985633]]
(64, 3)
(64, 1)
(64, 1)
------ batch ------
std [[0.58823918547436094]]
obss [[-0.15732196506209445 -0.98754736560278522 -5.5371563733061073]
 [-0.4566814118663699 -0.88963030976678115 -6.3256610320456632]
 [-0.73297513788991808 -0.68025542793516591 -6.96847912504342]
 [-0.93008353396187471 -0.367348090852518

2047 6144
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63]
0 0
------ batch ------
observations [[ 0.87350563 -0.48681405  1.39133818]
 [ 0.89535086 -0.44536147  0.93721559]
 [ 0.91000304 -0.41460157  0.68146022]
 [ 0.91704715 -0.3987788   0.34640269]
 [ 0.9154962  -0.40232662 -0.07744027]]
locs [[0.09407685]
 [0.1272014 ]
 [0.12418182]
 [0.12463402]
 [0.13082138]]
actions [[-0.59341369]
 [ 0.52177154]
 [-0.16070904]
 [-0.83172572]
 [-0.66731641]]
log_probs [[-1.07127734]
 [-0.61309106]
 [-0.50533319]
 [-1.71049471]
 [-1.30909285]]
(64, 3)
(64, 1)
(64, 1)
------ batch ------
std [[0.58803753876703835]]
obss [[0.873505626632184 -0.486814050990638 1.3913381824825637]
 [0.89535086009095111 -0.44536146817432931 0.93721559119604336]
 [0.91000304274194455 -0.4146015704268407 0.68146022113940519]
 [0.917047147457887 -0.3987787975047728 0.3464026867

In [None]:
# Short debugging run on a third, freshly constructed agent.
actor3 = Actor(obs_dim, act_dim, is_continuous)
critic3 = Critic(obs_dim)
vero3 = PPO_Agent(actor3, critic3)
# Fix: this cell used to advance `generator2` and update `vero2`, so the agent built just
# above was never exercised. It now gets its own rollout stream and receives the updates.
generator3 = rollouts_generator(vero3, env, horizon=2048)

num_ite = 3
lam = 0.95
gamma = 0.99
# Defined here so the cell no longer depends on a value left behind by another cell;
# 2 matches what the previous debug cell sets.
num_epochs = 2

for i in range(num_ite):
    print('#### iteration ###', i)

    rollout = next(generator3)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    # NOTE(review): the previous debug cell also passes rollout['locs'] to run_ite —
    # confirm which signature the current PPO_Agent expects.
    vero3.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                  epochs=num_epochs)

In [None]:
# Presumably a sanity check: print the third value returned by act_stochastic next to the
# log-probability the agent's distribution assigns to the same sampled action, then the
# distribution's entropy.
obs = env.reset()
ac, v, sampled_log_prob = vero.act_stochastic(obs)
print(sampled_log_prob)

# obs[None] prepends an axis before the observation is handed to get_distributions.
policy_dist = vero.get_distributions(obs[None])
recomputed_log_prob = policy_dist.log_prob(ac)
print(recomputed_log_prob)
print(policy_dist.entropy())

In [None]:
# Draw one random action from the environment's action space and print its dtype.
ac = env.action_space.sample()
print(ac.dtype)

In [None]:
# Scratch check: show how NumPy's float64 type prints.
print(np.float64)

In [None]:
import tensorflow_probability as tfp

In [None]:
# Shorthand for the TFP distributions module (the first cell imports the same module as `dists`).
tfd = tfp.distributions


In [None]:
# Scratch example: a Normal distribution parameterised by four loc/scale pairs given as NumPy arrays.
dist = tfd.Normal(loc=np.array([0., 1, 2, 0.5]), scale=np.array([0.02, 0.09, 0.1, 1]))

In [None]:
# Draw a sample from the example distribution and display it.
dist.sample()


In [None]:
# Scratch inspection: display a random sample from the observation space.
env.observation_space.sample()

In [None]:
# The first cell already loads this extension; when it is loaded again IPython only
# reports that it is already loaded.
%load_ext tensorboard.notebook

In [None]:
# One-off update of the main agent outside the training loop, using a fresh rollout stream.
# NOTE(review): every other cell calls `run_ite`; confirm that PPO_Agent still exposes
# `run_epoch` with this argument list.
generator = rollouts_generator(vero, env, horizon=2048)
rollout = next(generator)

advantage, target_value = get_adv_vtarg(rollout, gamma=0.99, lam=0.95)
vero.run_epoch(
    rollout['ob'], rollout['ac'], rollout['log_probs'],
    target_value, advantage,
    epochs=5,
)

In [None]:
# Display the observation-space shape captured when the environment was created.
obs_dim

In [5]:
# Inspect actor2's trainable variable at index 6; in the recorded output this is the
# variable named 'std:0' with shape (1, 1).
actor2.trainable_variables[6]

<tf.Variable 'std:0' shape=(1, 1) dtype=float64, numpy=array([[0.59058199]])>

In [None]:
# Scratch arithmetic. NOTE(review): exp(-0.53) is about 0.589, close to the `std` values
# printed by the debug run — presumably a check of a log-std value; confirm.
np.exp(-0.53)