In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import gym

In [1]:
import tensorflow as tf
# Sanity-check the TensorFlow install: list the visible GPUs, then display
# whether this build was compiled with CUDA support.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
tf.test.is_built_with_cuda()

Num GPUs Available:  1


True

In [27]:
class Critic:
    """Q-value network wrapper: a small Keras MLP plus (optionally) its optimizer.

    The wrapped functional model is stored in ``self.model``. It maps an input
    of shape ``input_shape`` through two 64-unit ReLU layers to a single
    linear output unit.

    Instances created through ``test`` or ``clone`` bypass ``__init__`` and
    therefore have no ``optimizer`` / ``gradient_clipping`` attributes;
    ``update`` must not be called on them.
    """

    def __init__(self, input_shape, learning_rate, gradient_clipping):
        """Build a trainable critic.

        Args:
            input_shape: shape (without batch dimension) passed to ``keras.Input``.
            learning_rate: learning rate handed to the Adam optimizer.
            gradient_clipping: global-norm threshold used in ``update``.
        """
        self.optimizer = keras.optimizers.Adam(learning_rate)
        self.gradient_clipping = gradient_clipping
        self._create_model(input_shape)

    @classmethod
    def test(cls, input_shape):
        """Build an inference-only critic (model only, no optimizer)."""
        critic = cls.__new__(cls)
        critic._create_model(input_shape)
        return critic

    @classmethod
    def clone(cls, critic):
        """Return a new critic whose model copies ``critic``'s architecture and weights.

        The clone gets its own model object (``keras.models.clone_model``)
        loaded with the source weights; no optimizer is attached.
        """
        target = cls.__new__(cls)
        critic_model = critic.model
        target.model = keras.models.clone_model(critic_model)
        target.model.set_weights(critic_model.get_weights())
        return target

    def _create_model(self, input_shape):
        """Create the functional Keras model and store it in ``self.model``."""
        state_action_input = keras.Input(input_shape)
        dense_1_out = keras.layers.Dense(units = 64, activation = 'relu')(state_action_input)
        dense_2_out = keras.layers.Dense(units = 64, activation = 'relu')(dense_1_out)
        q_value = keras.layers.Dense(units = 1, activation = 'linear')(dense_2_out)

        self.model = keras.Model(state_action_input, q_value)

    def __call__(self, states_actions):
        """Run the wrapped model on ``states_actions`` and return its output.

        The layers only exist inside ``self.model`` (they are never stored as
        attributes of this wrapper), so the forward pass goes through the model.
        """
        return self.model(states_actions)

    def update(self, gradients):
        """Clip ``gradients`` by global norm and apply them with the optimizer.

        Args:
            gradients: one gradient per entry of ``trainable_variables()``,
                in the same order (they are paired positionally).
        """
        gradients, _ = tf.clip_by_global_norm(gradients, self.gradient_clipping)
        # trainable_variables is a method on this wrapper, so it must be called.
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables()))

    def trainable_variables(self):
        """Return the wrapped model's trainable variables."""
        return self.model.trainable_variables

    def get_weights(self):
        """Return the wrapped model's weights (``model.get_weights()``)."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the wrapped model (``model.set_weights``)."""
        return self.model.set_weights(weights)

In [4]:
import gym
# Build the hardcore walker environment and look at its action space shape
# plus one random sample from the action and observation spaces.
env = gym.make("BipedalWalkerHardcore-v3")
print(env.action_space.shape)
print(env.action_space.sample())
print(env.observation_space.sample())

import numpy as np

(4,)
[-0.65622467  0.6138866   0.561058   -0.4359544 ]
[-0.1520767  -0.24884976  0.64749384  0.9371436   0.3619809  -0.34786075
  0.42787895 -0.34916872 -0.22753696 -0.65975386  0.5740431   0.41449204
  0.68966895 -1.4206433  -0.7475852  -1.309562    0.5587635   0.1904682
 -0.5049067   0.69959706  0.6894707   2.4412012   1.3946751   0.60392267]


In [9]:
from tensorflow import keras
import tensorflow as tf
# Symbolic inputs shaped like the env's action and observation spaces,
# concatenated along the feature axis to inspect the resulting shape.
x, y = (keras.Input(space.shape) for space in (env.action_space, env.observation_space))

z = tf.concat((x, y), axis=1)
z.shape

TensorShape([None, 28])

In [4]:
# Concatenate a length-24 and a length-4 vector of ones, then add a trailing
# axis so the result is a column.
x = tf.constant(np.ones(24))
y = tf.constant(np.ones(4))

z = tf.concat((x, y), axis=0)[:, tf.newaxis]
print(z.shape)

(28, 1)


In [6]:
# Instantiate a critic on a 24-dimensional input and show its layer summary.
critic = Critic(input_shape=(24,), learning_rate=1e-4, gradient_clipping=0.5)
critic.model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 24)]              0         
                                                                 
 dense (Dense)               (None, 64)                1600      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,825
Trainable params: 5,825
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Critic.clone copies the architecture with keras.models.clone_model and then
# loads the source weights; keep just the resulting Keras model as `target`.
target = Critic.clone(critic).model

In [21]:
target.summary()

# One flag per weight array: True when the cloned array equals the original.
[
    np.all(original == copied)
    for original, copied in zip(critic.model.get_weights(), target.get_weights())
]

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 24)]              0         
                                                                 
 dense (Dense)               (None, 64)                1600      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,825
Trainable params: 5,825
Non-trainable params: 0
_________________________________________________________________


[True, True, True, True, True, True]

In [None]:
class Actor(keras.Model):
    """Stochastic policy network that emits tanh-squashed, rescaled actions.

    Two 64-unit ReLU layers feed two heads of width ``action_size``: a
    tanh-activated mean and a linear log-standard-deviation. Actions are
    sampled from the resulting diagonal distribution, squashed with ``tanh``
    and mapped linearly from [-1, 1] to [``min_action``, ``max_action``].
    """

    def __init__(self, learning_rate, gradient_clipping, action_size, min_action, max_action):
        """Build a trainable actor.

        Args:
            learning_rate: learning rate handed to the Adam optimizer.
            gradient_clipping: global-norm threshold used in ``update``.
            action_size: number of units of the mean / log-std heads.
            min_action: lower action bound used when rescaling.
            max_action: upper action bound used when rescaling.
        """
        super().__init__()
        self.optimizer = keras.optimizers.Adam(learning_rate)
        self.gradient_clipping = gradient_clipping
        self.action_size = action_size
        self.min_action = min_action
        self.max_action = max_action
        self.create_layers()

    @classmethod
    def test(cls, action_size, min_action, max_action):
        """Build an inference-only actor (no optimizer, no gradient clipping)."""
        actor = cls.__new__(cls)
        # __init__ is bypassed, so the keras.Model base is initialised by hand.
        super(Actor, actor).__init__()
        actor.action_size = action_size
        actor.min_action = min_action
        actor.max_action = max_action
        actor.create_layers()
        return actor

    def create_layers(self):
        """Create the shared hidden layers and the mean / log-std heads."""
        self.dense_1 = keras.layers.Dense(units = 64, activation = 'relu')
        self.dense_2 = keras.layers.Dense(units = 64, activation = 'relu')
        self.mean = keras.layers.Dense(units = self.action_size, activation = 'tanh')
        self.log_std = keras.layers.Dense(units = self.action_size, activation = 'linear')

    def common(self, states):
        """Run the network on ``states`` and return the action distribution.

        Returns:
            ``MultivariateNormalDiag(means, stds)`` where ``stds`` is the
            exponential of the log-std head.
        """
        dense_1_out = self.dense_1(states)
        dense_2_out = self.dense_2(dense_1_out)
        means = self.mean(dense_2_out)
        log_stds = self.log_std(dense_2_out)
        stds = tf.exp(log_stds)

        # NOTE(review): MultivariateNormalDiag is not imported in the visible
        # cells. Presumably tensorflow_probability.distributions.MultivariateNormalDiag
        # (loc, scale_diag) -- confirm and add the import.
        normal_distribs = MultivariateNormalDiag(means, stds)
        return normal_distribs

    def _rescale_actions(self, actions):
        """Map ``actions`` linearly from [-1, 1] to [min_action, max_action]."""
        actions = self.min_action + (actions + 1.0)*(self.max_action - self.min_action)/2.0
        return actions

    def call(self, states):
        """Sample one squashed, rescaled action per input state."""
        normal_distribs = self.common(states)
        unbound_actions = normal_distribs.sample()
        actions = tf.tanh(unbound_actions)
        actions = self._rescale_actions(actions)
        return actions

    def compute_log_of_tensor(self, values):
        """Return ``log(values)``, adding 1e-6 to exact zeros to avoid ``log(0)``."""
        offsets = tf.cast(values == 0, dtype = tf.float32)*1e-6
        values = values + offsets
        return tf.math.log(values)

    def call_update(self, states):
        """Sample actions and return them together with their log-probabilities.

        The log-probability of the squashed, rescaled action is the
        log-probability of the unbounded sample minus the log-determinant of
        the Jacobian of ``u -> min + (tanh(u) + 1) * (max - min) / 2``.

        Returns:
            Tuple ``(actions, actions_log_prob)``.
        """
        normal_distribs = self.common(states)

        unbound_actions = normal_distribs.sample()
        squashed_actions = tf.tanh(unbound_actions)
        actions = self._rescale_actions(squashed_actions)

        # NOTE(review): probs() is called without the sampled value. If the
        # distribution is TFP's, the matching call would presumably be
        # log_prob(unbound_actions) -- confirm against the distribution class.
        unbound_actions_probs = normal_distribs.probs()
        unbound_actions_log_probs = self.compute_log_of_tensor(unbound_actions_probs)

        # log of the rescaling factor (max - min) / 2. It broadcasts against
        # the per-dimension tanh term, so no static batch size is needed and
        # both scalar and per-dimension bounds are handled.
        log_action_scale = tf.math.log(1e-6 + (self.max_action - self.min_action)/2.0)
        log_jacobian_determinant = tf.reduce_sum(
            log_action_scale + tf.math.log(1 - squashed_actions**2 + 1e-6),
            axis = -1)

        actions_log_prob = unbound_actions_log_probs - log_jacobian_determinant
        return actions, actions_log_prob

    def update(self, gradients):
        """Clip ``gradients`` by global norm and apply them with the optimizer.

        Args:
            gradients: one gradient per entry of ``self.trainable_variables``,
                in the same order (they are paired positionally).
        """
        gradients, _ = tf.clip_by_global_norm(gradients, self.gradient_clipping)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

In [4]:
import gym
# Roll out 20 episodes with uniformly random actions and print each
# episode's total reward.
env = gym.make('BipedalWalker-v3')
try:
    for i_episode in range(20):
        observation = env.reset()
        total = 0
        done = False
        while not done:
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            total += reward
        print(f'Episode finished: {total}')
finally:
    # Release the environment (and its render window) even when the loop is
    # interrupted or raises.
    env.close()

-0.1938081510197371
-0.08122685325145722
-0.06351888388892252
0.04977377707511186
-0.006018035834964422
-0.10390292193150885
-0.18379183035592
-0.13525280350695174
-0.10801611152787886
-0.23553374954064926
-0.25460299154619137
-0.252407476862274
-0.22340440947313983
-0.1528423517743758
-0.06347910932699602
-0.020839374393224717
-0.06257417488098144
0.0010088715155907556
0.10626020556688306
-0.0038788227637597053
0.03825471013784408
-0.027047816604375843
-0.16762510033448774
-0.3205860476891176
-0.27009359532222155
-0.3515026150544508
-0.31410046660900115
-0.334265335559845
-0.13997719014265025
-0.24537440321842946
-0.09683538173635682
-0.07668610231081764
-0.02063038057585439
-0.08611282889048377
-0.0160370970169715
-0.04621976415316027
0.039387951870758144
-0.05419158657391748
-0.11764422622323037
-0.1659227010409049
-0.014205719014007655
-0.001211554606757946
-0.009628362834453584
-0.0012915123999118778
0.06264785667260724
-0.08440672750771046
-0.1460127458175047
-0.15940005997816484