In [23]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow_probability import distributions as dists
import tensorflow.keras.layers as kl
import datetime

from rl_agents.env_utils import rollouts_generator, get_adv_vtarg
from rl_agents.ppo.policy import Actor, Critic
from rl_agents.ppo.agent import PPO_Agent

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# NOTE(review): the recorded output of this cell reports that `tensorboard`
# "is not an IPython extension"; the last cell of the notebook loads
# `tensorboard.notebook` instead. Confirm which name matches the installed
# TensorBoard version and keep only that one.
%load_ext tensorboard

# Fix TensorFlow's global random seed. Neither NumPy nor the gym environment
# is seeded in this cell.
tf.random.set_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard module is not an IPython extension.


In [9]:
# Create the Pendulum environment and record what the policy constructors
# need from it: the observation/action shapes and whether the action space
# is a gym Box.
env = gym.make('Pendulum-v0')
obs_dim, act_dim = env.observation_space.shape, env.action_space.shape
is_continuous = isinstance(env.action_space, gym.spaces.Box)

In [18]:
# Instantiate the Actor and Critic from rl_agents.ppo.policy using the shapes
# read from the environment, then wrap both in the PPO agent `vero` that the
# following cells use for rollouts and updates.
# NOTE(review): construction order is left untouched; with the global TF seed
# set in the first cell, reordering could presumably change the initial
# weights. Confirm before rearranging.
actor = Actor(obs_dim, act_dim, is_continuous)
critic = Critic(obs_dim)
vero = PPO_Agent(actor, critic)

In [19]:
# Create the rollout generator for `vero` acting in `env`. `horizon=2048` is
# presumably the number of environment steps per yielded rollout (TODO confirm
# against rl_agents.env_utils.rollouts_generator).
generator = rollouts_generator(vero, env, horizon=2048)

In [5]:

# Pull one rollout from the generator. The builtin next() is the idiomatic way
# to advance an iterator; calling the __next__ dunder directly is equivalent
# but non-standard.
rollout = next(generator)

In [6]:
# Compute advantages and value targets for the rollout collected above.
# `lam` and `gamma` are forwarded to get_adv_vtarg (presumably the GAE lambda
# and the discount factor; TODO confirm in rl_agents.env_utils).
advantage, target_value = get_adv_vtarg(rollout, lam=0.95, gamma=0.99)

In [7]:
# Run a single update on the rollout collected above, for 5 epochs, feeding
# the observations, actions and stored log-probs together with the targets
# and advantages computed in the previous cell.
vero.run_epoch(
    rollout['ob'],
    rollout['ac'],
    rollout['log_probs'],
    target_value,
    advantage,
    epochs=5,
)

# Training loop

In [20]:
# Settings for the training loop below.
num_ite, num_epochs = 200, 10   # loop iterations / epochs passed to run_epoch
lam, gamma = 0.95, 0.99         # forwarded to get_adv_vtarg

# Every run writes its summaries to its own timestamped directory under
# logs/gradient_tape/.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = f"logs/gradient_tape/{current_time}/train"
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [21]:
# Main training loop: each iteration pulls the next rollout from the
# generator, derives advantages / value targets from that rollout, and runs
# `num_epochs` update epochs on it. The episode returns of the rollout are
# handed to run_epoch together with the summary writer and iteration index.
for i in range(num_ite):
    roll = next(generator)
    # Use the rollout just collected (`roll`) for everything. The previous
    # version read the stale notebook-level `rollout` here, so every iteration
    # trained on the same old batch while only the rewards came from `roll`.
    advantage, target_value = get_adv_vtarg(roll, lam=lam, gamma=gamma)
    vero.run_epoch(roll['ob'], roll['ac'], roll['log_probs'], target_value, advantage,
                   epochs=num_epochs, summary_writer=train_summary_writer,
                   rewards=roll["ep_rets"], ite=i)

In [29]:
# Open the TensorBoard UI inline, pointed at the parent directory of the
# timestamped run folders created by the training configuration cell.
%tensorboard --logdir logs/gradient_tape

Reusing TensorBoard on port 6006 (pid 15800), started 0:01:13 ago. (Use '!kill 15800' to kill it.)

In [None]:
# Consistency check: ask the agent for a stochastic action on a freshly reset
# observation and print the log-prob it reports, then rebuild the action
# distribution for the same observation (with a leading batch axis) and print
# the log-prob it assigns to that action, followed by its entropy.
# act_stochastic presumably returns (action, value estimate, log-prob); confirm.
obs = env.reset()
ac, v, lp = vero.act_stochastic(obs)
print(lp)
policy_dist = vero.get_distributions(obs[np.newaxis])
print(policy_dist.log_prob(ac))
print(policy_dist.entropy())

In [None]:
# Draw a random action from the environment's action space and print its dtype.
# Note: this rebinds the notebook-level `ac` set by the previous cell.
ac = env.action_space.sample()
print(ac.dtype)

In [None]:
# Scratch cell: print NumPy's float64 type object.
print(np.float64)

In [None]:
import tensorflow_probability as tfp

In [None]:
# Short alias for the TensorFlow Probability distributions module.
# NOTE(review): the first cell already imports this same module as `dists`;
# consider using a single alias throughout the notebook.
tfd = tfp.distributions


In [None]:
# Normal distribution parameterised by four (loc, scale) pairs.
dist = tfd.Normal(
    loc=np.array([0.0, 1.0, 2.0, 0.5]),
    scale=np.array([0.02, 0.09, 0.1, 1.0]),
)

In [None]:
# Draw a sample from `dist` with default arguments; as the cell's last
# expression, the result is displayed.
dist.sample()


In [None]:
# Display the observation shape read earlier from env.observation_space.shape.
obs_dim

In [26]:
# Load TensorBoard's notebook extension.
# NOTE(review): this cell sits at the end of the notebook, but its execution
# count (26) precedes that of the `%tensorboard` cell (29). A top-to-bottom
# run therefore reaches `%tensorboard` before this line; consider moving the
# load into the first cell.
%load_ext tensorboard.notebook