In [1]:
import tensorflow as tf
import gym
import numpy as np
import datetime
from tensorflow_probability import distributions as tfpd, layers as tfpl

tf.random.set_seed(0)
tf.keras.backend.set_floatx('float32')

In [2]:
from tensorflow import keras
from tensorflow.keras import layers as kl

# Continuous case

In [3]:
class GaussianSample(kl.Layer):
    def __init__(self, act_dim):
        super(GaussianSample, self).__init__()
        self.log_std = self.add_weight(
            'log_std', shape=(act_dim,),
            initializer=tf.keras.initializers.Constant(0),
            trainable=True
        )
        self.normal_dist = tfpl.DistributionLambda(
            make_distribution_fn=lambda t: tfpd.Normal(loc=t, scale=tf.exp(self.log_std)),
            convert_to_tensor_fn=lambda s: s.sample(),
        )
    
    #def build(self, input_shape):
    #    """
    #    input_shape: might be [None, act_dim]
    #    """
    #    self.log_std = self.add_weight(
    #        'log_std', initializer=tf.keras.initializers.Constant(-0.53), 
    #        shape=(input_shape[1],), dtype='float32', trainable=True
    #    )

    def call(self, input):
        #return tfpd.Normal(loc=input, scale=tf.exp(self.log_std))
        return self.normal_dist(input)

Should the output layer have a non linear activation? 

NO, [spinningup](https://github.com/openai/spinningup/blob/master/spinup/examples/pytorch/pg_math/1_simple_pg.py#L9) uses an identity output activation


In [4]:
def get_continuous_actor(obs_dim: int, act_dim: int):
    obs = keras.Input(shape=(obs_dim,), name='observations')
    x = kl.Dense(32, activation='tanh', name='dense_1')(obs)
    x = kl.Dense(act_dim, name='logits')(x)
    pi = GaussianSample(act_dim)(x)
    
    model = keras.Model(inputs=obs,
                        outputs=pi)
    
    return model

## Test model

In [5]:
test_model = get_actor(3, 1)

The following Variables were used a Lambda layer's call (distribution_lambda), but
are not present in its tracked objects:
  <tf.Variable 'log_std:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


In [9]:
test_model.layers[3].log_std.numpy()

array([-0.53], dtype=float32)

In [65]:
obs = np.array([[ 0.69950885,  0.6432279 , -7.588761  ]])
res = test_model(obs)
print(res.sample()[0].numpy())



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[-0.58579165]


# Discrete case

In [31]:
a = tfpd.Categorical(logits=[[2, 2, 3], [2, -2, 3]])

In [37]:
a.log_prob([2, 2]).numpy()

array([-0.5514447, -0.3181754], dtype=float32)

In [12]:
def get_discrete_actor(obs_dim: int, act_dim: int, size: int=64):
    obs = keras.Input(shape=(obs_dim,), name='observations')
    x = kl.Dense(size, activation='tanh', name='dense_1')(obs)
    x = kl.Dense(act_dim, name='logits')(x)

    pi = tfpl.DistributionLambda(
        make_distribution_fn=lambda t: tfpd.Categorical(logits=t),
        convert_to_tensor_fn=lambda s: s.sample(),
    )(x)
    
    model = keras.Model(inputs=obs,
                        outputs=pi)
    
    return model

## Test model

In [43]:
test_model = get_discrete_actor(3, 2)

In [61]:
obs = np.array([[ 0.69950885,  0.6432279 , -1.588761  ]])
res = test_model(obs)
print(res.sample()[0].numpy())
print(res.log_prob([1]))

1
tf.Tensor([-0.9112284], shape=(1,), dtype=float32)


# Integration test model + environment

In [6]:
def test_model_in_env(model, env):
    obs = env.reset()
    ret = 0
    while True:
        act = model(obs[None]).sample()[0].numpy()
        obs, rew, done, _ = env.step(act)

        ret += rew

        if done:
            break

    return ret

## Discrete case

In [8]:
discrete_env = gym.make('CartPole-v0')

model = get_discrete_actor(
    discrete_env.observation_space.shape[0], 
    discrete_env.action_space.n
)

In [21]:
ret = test_model_in_env(model, discrete_env)
print(f'Overall return {ret}')

Overall return 17.0


In [7]:
def get_opt_fn(model, lr=3e-4):
    opt = tf.keras.optimizers.Adam(lr)
    
    #@tf.function
    def step_fn(obs_no, act_na, adv_n):
        with tf.GradientTape() as tape:
            logp = model(obs_no).log_prob(act_na)
            # print(logp.shape)
            if len(logp.shape) > 1:
                logp =  tf.reduce_sum(logp, axis=1)
                
            weights = tf.stop_gradient(tf.squeeze(adv_n))
            
            #print(logp.dtype, adv_n.dtype)
            loss = tf.reduce_mean(- logp * weights)
        
        grad = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(grad, model.trainable_variables))
        
        return loss
        
    return opt, step_fn

In [8]:
def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    return rtgs

In [13]:
def train(env, epochs=200, buffer_size=4096, lr=5e-3, size=32):
    assert isinstance(env.observation_space, gym.spaces.Box), \
        "This is only for continuous observation space"
    
    obs_dim = env.observation_space.shape[0]
    model = None
    if isinstance(env.action_space, gym.spaces.Box):
        model = get_continuous_actor(obs_dim, env.action_space.shape[0])
    else:
        model = get_discrete_actor(obs_dim, env.action_space.n, size=size)

    model.summary()
    # keras.utils.plot_model(model, 'multi_output_model.png')
    
    
    opt, step_fn = get_opt_fn(model, lr=lr)
    
    def train_one_epoch():
        # make some empty lists for logging.
        batch_obs = []          # for observations
        batch_acts = []         # for actions
        batch_weights = []      # for reward-to-go weighting in policy gradient
        batch_rets = []         # for measuring episode returns
        batch_lens = []         # for measuring episode lengths

        # reset episode-specific variables
        obs = env.reset()       # first obs comes from starting distribution
        done = False            # signal from environment that episode is over
        ep_rews = []            # list for rewards accrued throughout ep

        # collect experience by acting in the environment with current policy
        while True:
            # save obs
            batch_obs.append(obs.copy())

            # act in the environment
            act = model(obs[None]).sample()[0].numpy()
            obs, rew, done, _ = env.step(act)

            # save action, reward
            batch_acts.append(act)
            ep_rews.append(rew)

            if done:
                # if episode is over, record info about episode
                ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                batch_rets.append(ep_ret)
                batch_lens.append(ep_len)

                # the weight for each logprob(a_t|s_t) is reward-to-go from t
                batch_weights += list(reward_to_go(ep_rews))

                # reset episode-specific variables
                obs, done, ep_rews = env.reset(), False, []

                # end experience loop if we have enough of it
                if len(batch_obs) > buffer_size:
                    break

        batch_weights = np.array(batch_weights).astype('float32')
        weights_mean, weights_std = batch_weights.mean(), batch_weights.std()
        batch_weights = (batch_weights - weights_mean) / (weights_std + 1e-5)
        # take a single policy gradient update step
        batch_loss = step_fn(np.array(batch_obs).astype('float32'), 
                             np.array(batch_acts).astype('float32'), 
                             batch_weights)
        return batch_rets, batch_lens, batch_loss
    
    for i in range(epochs):
        batch_rets, batch_lens, batch_loss = train_one_epoch()
        #print(model.layers[3].log_std.numpy())
        if i % 50 == 0 or i == epochs-1:
            model.save_weights('ckpts/_model_'+str(i), save_format='tf')
        
        print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
              (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))

# Training

In [10]:
env = gym.make('CartPole-v0')
print('Trainning on Discrete envirionment')
train(env, epochs=200, lr=5e-3)

Trainning on Discrete envirionment
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
observations (InputLayer)    [(None, 4)]               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                160       
_________________________________________________________________
logits (Dense)               (None, 2)                 66        
_________________________________________________________________
distribution_lambda (Distrib ((None,), (None,))        0         
Total params: 226
Trainable params: 226
Non-trainable params: 0
_________________________________________________________________
epoch:   0 	 loss: 0.029 	 return: 18.168 	 ep_len: 18.168
epoch:   1 	 loss: 0.017 	 return: 18.772 	 ep_len: 18.772
epoch:   2 	 loss: 0.007 	 return: 21.391 	 ep_len: 21.391
epoch:   3 	 loss: 0.001 	 return: 21.698 	 ep_le

KeyboardInterrupt: 

In [14]:
env = gym.make('MountainCar-v0')
print('Trainning on Discrete envirionment')
train(env, epochs=200, lr=1e-3, size=128)

Trainning on Discrete envirionment
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
observations (InputLayer)    [(None, 2)]               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                192       
_________________________________________________________________
logits (Dense)               (None, 3)                 195       
_________________________________________________________________
distribution_lambda_2 (Distr ((None,), (None,))        0         
Total params: 387
Trainable params: 387
Non-trainable params: 0
_________________________________________________________________
epoch:   0 	 loss: -0.000 	 return: -200.000 	 ep_len: 200.000
epoch:   1 	 loss: -0.001 	 return: -200.000 	 ep_len: 200.000
epoch:   2 	 loss: 0.000 	 return: -200.000 	 ep_len: 200.000
epoch:   3 	 loss: 0.002 	 return: -

epoch: 117 	 loss: 0.028 	 return: -200.000 	 ep_len: 200.000
epoch: 118 	 loss: 0.001 	 return: -200.000 	 ep_len: 200.000
epoch: 119 	 loss: 0.008 	 return: -200.000 	 ep_len: 200.000
epoch: 120 	 loss: 0.019 	 return: -200.000 	 ep_len: 200.000
epoch: 121 	 loss: -0.003 	 return: -200.000 	 ep_len: 200.000


KeyboardInterrupt: 

# Evaluation

In [None]:
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def load_saved_model(path, obs_dim, act_dim):
    model = get_actor(obs_dim, act_dim)
    model.load_weights(path)
    return model

In [None]:
def evaluate_model(model, env):
    assert isinstance(env.observation_space, Box), \
        "This is only for continuous observation space"
    assert isinstance(env.action_space, Box), \
        "This is only for continuous action space"

    env.reset()
    img = plt.imshow(env.render(mode='rgb_array')) # only call this once

    for _ in range(100):
        img.set_data(env.render(mode='rgb_array')) # just update the data
        display.display(plt.gcf())
        display.clear_output(wait=True)
        action = model(obs[None]).loc[0]
        env.step(action)

In [1]:

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

NameError: name 'env' is not defined

In [9]:
class MyDenseLayer(tf.keras.layers.Layer):
    def __init__(self, num_outputs):
        super(MyDenseLayer, self).__init__()
        self.num_outputs = num_outputs

    def build(self, input_shape):
        print(input_shape)
        self.kernel = self.add_weight("kernel",
                                  shape=[int(input_shape[-1]),
                                         self.num_outputs])

    def call(self, input):
        return tf.matmul(input, self.kernel)

layer = MyDenseLayer(10)

In [10]:
_ = layer(tf.zeros([10, 5])) # Calling the layer `.builds` it.

(10, 5)


In [11]:
print([var.shape for var in layer.trainable_variables])

[TensorShape([5, 10])]
