In [18]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%load_ext tensorboard.notebook

tf.random.set_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard.notebook extension is already loaded. To reload it, use:
  %reload_ext tensorboard.notebook


## Create the Gym environment
Use `Pendulum-v0` for now.

In [2]:
# Instantiate the task, then record the space metadata used to build the networks.
env = gym.make('Pendulum-v0')
obs_dim, act_dim = env.observation_space.shape, env.action_space.shape
# True when the action space is a gym Box.
is_continuous = isinstance(env.action_space, gym.spaces.Box)

I0808 17:25:07.914650 4440602048 registration.py:117] Making new env: Pendulum-v0
[2019-08-08 17:25:07,914] Making new env: Pendulum-v0
  result = entry_point.load(False)


In [None]:
# Build the actor and critic (actor first, then critic), wrap both in the PPO
# agent, and open the rollout generator that the training loop pulls from.
actor, critic = Actor(obs_dim, act_dim, is_continuous), Critic(obs_dim)
vero = PPO_Agent(actor, critic)
generator = rollouts_generator(
    vero,
    env,
    horizon=2048,  # presumably environment steps per rollout -- confirm in env_utils
)

# Training loop

In [None]:
# Training hyperparameters.
num_ite, num_epochs = 200, 10   # loop iterations; `epochs` passed to run_ite on each one
lam, gamma = 0.95, 0.99         # forwarded to get_adv_vtarg as lam / gamma

# Every run writes its summaries to its own timestamped directory.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = ''.join(('logs/gradient_tape/', current_time, '/train'))
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [None]:
# Training loop: pull a rollout, compute advantages and value targets, update
# the agent, log the rollout's mean episode return, and checkpoint periodically.
checkpoint_every = 50

for i in range(num_ite):
    rollout = next(generator)
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                 epochs=num_epochs)

    # Skip the scalar when the rollout recorded no episode returns: the mean of
    # an empty array is NaN (with a RuntimeWarning) and would corrupt the curve.
    ep_rets = np.asarray(rollout["ep_rets"])
    if ep_rets.size:
        with train_summary_writer.as_default():
            # The step axis advances by num_epochs per loop iteration.
            tf.summary.scalar('reward mean', ep_rets.mean(), step=i*num_epochs)

    # Save weights every `checkpoint_every` iterations and after the last one.
    if i % checkpoint_every == 0 or i == num_ite-1:
        actor.save_weights(train_log_dir+'/_actor_'+str(i), save_format='tf')
        critic.save_weights(train_log_dir+'/_critic_'+str(i), save_format='tf')

In [None]:
# Open TensorBoard inline on the parent directory that holds the per-run log folders.
%tensorboard --logdir logs/gradient_tape

In [32]:
# Short smoke test on a second, freshly constructed agent: a few iterations of
# rollout -> advantage estimation -> update, with no logging or checkpointing.
actor2 = Actor(obs_dim, act_dim, is_continuous)
critic2 = Critic(obs_dim)
vero2 = PPO_Agent(actor2, critic2)
generator2 = rollouts_generator(vero2, env, horizon=2048)

# Dedicated names so this cell does not overwrite the main run's
# num_ite / lam / gamma / num_epochs when cells are executed out of order.
smoke_num_ite = 3
smoke_lam = 0.95
smoke_gamma = 0.99
smoke_num_epochs = 2

for i in range(smoke_num_ite):
    print('#### iteration ###', i)
    rollout = next(generator2)
    advantage, target_value = get_adv_vtarg(rollout, lam=smoke_lam, gamma=smoke_gamma)
    vero2.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                  epochs=smoke_num_epochs)

#### iteration ### 0
2047 2048
inputs [[-0.25158372]
 [-0.27841359]
 [-0.27548222]
 [-0.25529173]
 [-0.22943258]]
observations [[ 0.21688466  0.97619724 -0.13388753]
 [ 0.19341716  0.98111661  0.4795629 ]
 [ 0.13378686  0.99101013  1.20909335]
 [ 0.04142577  0.99914158  1.85503192]
 [-0.08228725  0.99660865  2.47636042]]
actions [[-0.79131666]
 [-0.04204668]
 [-0.64879355]
 [-0.85351787]
 [-0.64116204]]
log_probs [[-0.80935528]
 [-0.46956832]
 [-0.59006282]
 [-0.90541784]
 [-0.633589  ]]
lel
0 0
std [[0.58860496967835518]]
inps [[-0.25158372269068879]
 [-0.27841358698289459]
 [-0.27548221932941153]
 [-0.25529173099165159]
 [-0.22943257969088532]]
obss [[0.21688466059080919 0.97619723621838306 -0.13388753070552406]
 [0.19341715873701404 0.9811166101468779 0.47956289681634034]
 [0.13378686312440435 0.99101012873498517 1.2090933527826873]
 [0.041425765853382528 0.99914158452316493 1.8550319161958539]
 [-0.082287249877471158 0.9966086536387303 2.4763604240281216]]
acs [[-0.7913166642794861

2047 6144
inputs [[-0.03004056]
 [-0.06223478]
 [-0.07620416]
 [-0.07443042]
 [-0.05365546]]
observations [[-0.98585811 -0.1675822  -7.45904913]
 [-0.97671071  0.21456045 -7.69236898]
 [-0.83076177  0.55662813 -7.48159437]
 [-0.58474065  0.8112203  -7.11829039]
 [-0.28723137  0.95786123 -6.66450333]]
actions [[-0.71755468]
 [ 0.33236188]
 [-0.36111417]
 [-1.03085443]
 [-0.85184689]]
log_probs [[-1.07131165]
 [-0.61315827]
 [-0.5054004 ]
 [-1.71056192]
 [-1.30916006]]
lel
0 0
std [[0.58807705959670742]]
inps [[-0.035820745057975327]
 [-0.062234778922806537]
 [-0.076204157088502852]
 [-0.074430419191165928]
 [-0.053655457976156777]]
obss [[-0.98585810583315359 -0.16758220419563202 -7.4590491271518129]
 [-0.97671071208851179 0.2145604457758985 -7.6923689822114634]
 [-0.83076177129945983 0.55662813380872511 -7.4815943657406851]
 [-0.58474064937903014 0.81122029866355672 -7.1182903909463846]
 [-0.2872313691915373 0.95786123240913912 -6.6645033315044513]]
acs [[-0.717554679419508]
 [0.332361

In [None]:
# Presumably a consistency check: print the log-prob returned alongside a
# sampled action, then the log-prob and entropy recomputed from the agent's
# distribution for the same observation.
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)
# np.newaxis is an alias for None; indexing with it prepends a length-1 axis.
l = vero.get_distributions(obs[np.newaxis])
print(l.log_prob(ac))
print(l.entropy())

In [None]:
# Draw a random action from the environment's action space and print its dtype.
# Note: this rebinds the notebook-level name `ac`.
ac = env.action_space.sample()
print(ac.dtype)

In [None]:
# Print NumPy's float64 type object.
print(np.float64)

In [None]:
import tensorflow_probability as tfp

In [None]:
# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions


In [None]:
# Build a Normal distribution from four (loc, scale) pairs supplied as NumPy arrays.
dist = tfd.Normal(loc=np.array([0., 1, 2, 0.5]), scale=np.array([0.02, 0.09, 0.1, 1]))

In [None]:
# Draw one sample from `dist`; as the last expression it becomes the cell output.
dist.sample()


In [None]:
# Display a random sample drawn from the environment's observation space.
env.observation_space.sample()

In [None]:
# Load the TensorBoard notebook extension (the same extension the first cell loads).
%load_ext tensorboard.notebook

In [None]:
# One-off update: open a fresh rollout generator for `vero`, pull a single
# rollout, compute advantages / value targets, and pass everything to the agent.
generator = rollouts_generator(vero, env, horizon=2048)
rollout = next(generator)
advantage, target_value = get_adv_vtarg(rollout, lam=0.95, gamma=0.99)
# NOTE(review): the other cells call `run_ite`; confirm PPO_Agent still exposes `run_epoch`.
vero.run_epoch(
    rollout['ob'], rollout['ac'], rollout['log_probs'],
    target_value, advantage,
    epochs=5,
)

In [None]:
# Display obs_dim, the observation-space shape captured when the environment was created.
obs_dim

In [5]:
# Inspect the trainable variable at index 6 of actor2; in the recorded output
# this is the (1, 1) float64 variable named 'std:0'.
actor2.trainable_variables[6]

<tf.Variable 'std:0' shape=(1, 1) dtype=float64, numpy=array([[0.59058199]])>

In [None]:
# Evaluate e**-0.53 (about 0.589); presumably a manual comparison against the
# actor's std values printed earlier (~0.59) -- confirm intent.
np.exp(-0.53)