In [1]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%load_ext tensorboard.notebook

tf.random.set_seed(0)

## Create GYM environment
Use Pendulum-v0 for now

In [2]:
env = gym.make('Pendulum-v0')
is_continuous = isinstance(env.action_space, gym.spaces.Box)
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape

In [3]:
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
vero = PPO_Agent(actor, critic)
generator = rollouts_generator(vero, env, horizon=2048)

# Training loop

In [4]:
num_ite = 200
lam = 0.95
gamma = 0.99
num_epochs = 10

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [5]:
for i in range(num_ite):
    rollout = generator.__next__()
    advantage, target_value = get_adv_vtarg(rollout, lam=lam, gamma=gamma)
    vero.run_ite(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage,
                 epochs=num_epochs)
    with train_summary_writer.as_default():
        tf.summary.scalar('reward mean', np.array(rollout["ep_rets"]).mean(), step=i*num_epochs)
    
    if i % 50 == 0 or i == num_ite-1:
        actor.save_weights(train_log_dir+'/_actor_'+str(i), save_format='tf')
        critic.save_weights(train_log_dir+'/_critic_'+str(i), save_format='tf')
    #    mean, std = rewards.mean(), rewards.std()
    #    print('mean', mean)
    #    print('std', std)
    


In [1]:
%tensorboard --logdir logs/gradient_tape

UsageError: Line magic function `%tensorboard` not found.


In [None]:
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)
l = vero.get_distributions(obs[None])
print(l.log_prob(ac))
print(l.entropy())

In [None]:
ac = env.action_space.sample()
print(ac.dtype)

In [None]:
print(np.float64)

In [None]:
import tensorflow_probability as tfp

In [None]:
tfd = tfp.distributions


In [None]:
dist = tfd.Normal(loc=np.array([0., 1, 2, 0.5]), scale=np.array([0.02, 0.09, 0.1, 1]))

In [None]:
dist.sample()


In [11]:
env.observation_space.sample()

array([-0.4457868 , -0.91729486,  5.1830072 ], dtype=float32)

In [None]:
%load_ext tensorboard.notebook

In [None]:
generator = rollouts_generator(vero, env, horizon=2048)

rollout = generator.__next__()
advantage, target_value = get_adv_vtarg(rollout, lam=0.95, gamma=0.99)
vero.run_epoch(rollout['ob'], rollout['ac'], rollout['log_probs'], target_value, advantage, epochs=5)

In [12]:
obs_dim

(3,)

In [10]:
actor.trainable_variables

[<tf.Variable 'Actor/dense/kernel:0' shape=(3, 32) dtype=float64, numpy=
 array([[-0.23775086,  0.0865082 ,  0.18968251,  0.38295697,  0.51894917,
         -0.15761821,  0.08655264,  0.14825048,  0.50769685,  0.05912139,
          0.1884715 , -0.05413917, -0.42845413, -0.23050986,  0.42611479,
          0.31040875,  0.39117041, -0.15224479,  0.42801437, -0.12993485,
         -0.38280668, -0.1330051 ,  0.53515256, -0.37260987,  0.01727576,
         -0.12489494,  0.27695066, -0.03321982, -0.43405681,  0.45421999,
         -0.25787384, -0.20653047],
        [ 0.11198237, -0.04425309, -0.05990919,  0.30799883,  0.23335211,
          0.10551802,  0.01627948,  0.40033144,  0.4581129 ,  0.04297429,
         -0.30024867, -0.1294846 , -0.12120222, -0.32832094,  0.2620834 ,
          0.19250278,  0.05132553, -0.19108716,  0.1292169 , -0.22103806,
          0.44025438,  0.28718562,  0.30438951, -0.51267301,  0.28971121,
         -0.29629225,  0.03672055, -0.18913436,  0.21002218, -0.32613824,
   

In [11]:
np.exp(-0.53)

0.5886049696783552