In [1]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create the Pendulum environment (render_mode is set to "human").
problem = "Pendulum-v1"
env = gym.make(problem, render_mode="human")

# Sizes of the observation and action vectors, read from the env's spaces.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Bounds of the first action component; used later to scale and clip actions.
lower_bound = env.action_space.low[0]
upper_bound = env.action_space.high[0]

# Report the problem dimensions.
print("Size of State Space ->  {}".format(num_states))
print("Size of Action Space ->  {}".format(num_actions))
print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  3
Size of Action Space ->  1
Max Value of Action ->  2.0
Min Value of Action ->  -2.0


In [3]:
class OUActionNoise:
    """Callable noise source following a discretised Ornstein-Uhlenbeck process.

    Each call returns a sample that depends on the previous one, so successive
    samples are correlated.

    Args:
        mean: array the process drifts towards; its shape sets the sample shape.
        std_deviation: scale applied to the Gaussian term.
        theta: strength of the pull towards ``mean``.
        dt: step size of the discretisation.
        x_initial: optional starting value; zeros shaped like ``mean`` if None.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        self.x_initial = x_initial
        # Establish ``x_prev`` before the first call.
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new sample."""
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        diffusion = (
            self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        sample = self.x_prev + drift + diffusion
        # Remember the sample so the next one is generated relative to it.
        self.x_prev = sample
        return sample

    def reset(self):
        """Restart the process from ``x_initial`` (or zeros when it is None)."""
        if self.x_initial is None:
            self.x_prev = np.zeros_like(self.mean)
        else:
            self.x_prev = self.x_initial

# Buffer

Pour implémenter l'algorithme DDPG, il faut utiliser un buffer de rejeu (*replay buffer*), qui mémorise les transitions (état, action, récompense, état suivant) rencontrées pendant l'entraînement. En tirant au hasard des mini-lots dans ce buffer, on casse la corrélation temporelle entre transitions consécutives, ce qui améliore la stabilité de l'apprentissage et atténue les effets de l'instabilité des mises à jour.

In [4]:
class ReplayBuffer:
    """Circular store of (state, action, reward, next_state) transitions.

    Each element of a transition is kept in its own numpy array instead of a
    list of tuples. Array widths come from the notebook-level ``num_states``
    and ``num_actions``.

    Args:
        buffer_capacity: maximum number of transitions kept.
        batch_size: number of transitions returned by ``getBatch``.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Counts how many times record() has been called.
        self.buffer_counter = 0

        rows = self.buffer_capacity
        self.state_buffer = np.zeros((rows, num_states))
        self.action_buffer = np.zeros((rows, num_actions))
        self.reward_buffer = np.zeros((rows, 1))
        self.next_state_buffer = np.zeros((rows, num_states))

    def record(self, obs_tuple):
        """Store one (s, a, r, s') tuple, overwriting the oldest slot once full."""
        # The modulo wraps the write position back to 0 after `buffer_capacity` writes.
        slot = self.buffer_counter % self.buffer_capacity
        self.state_buffer[slot] = obs_tuple[0]
        self.action_buffer[slot] = obs_tuple[1]
        self.reward_buffer[slot] = obs_tuple[2]
        self.next_state_buffer[slot] = obs_tuple[3]
        self.buffer_counter += 1

    def getBatch(self):
        """Sample ``batch_size`` stored transitions (with replacement) as tensors.

        Returns:
            Tuple ``(state_batch, action_batch, reward_batch, next_state_batch)``;
            the reward tensor is explicitly converted with dtype float32.
        """
        # Only slots that have actually been written are eligible.
        filled = min(self.buffer_counter, self.buffer_capacity)
        picks = np.random.choice(filled, self.batch_size)

        return (
            tf.convert_to_tensor(self.state_buffer[picks]),
            tf.convert_to_tensor(self.action_buffer[picks]),
            tf.convert_to_tensor(self.reward_buffer[picks], dtype=tf.float32),
            tf.convert_to_tensor(self.next_state_buffer[picks]),
        )



# Acteur et Critique

In [10]:
class Actor:
    """DDPG actor: a policy network plus a target copy updated slowly.

    Reads the notebook-level globals ``num_states``, ``upper_bound`` and
    ``lower_bound``.

    Args:
        lr: learning rate given to the Adam optimizer.
        tau: soft-update rate used by ``update_target``.
    """

    def __init__(self, lr, tau=0.005):
        self.lr = lr
        self.tau = tau
        self.model = self.make_actor()
        self.target = self.make_actor()
        self.optimizer = tf.keras.optimizers.Adam(lr)
        # Making the weights equal initially
        self.target.set_weights(self.model.get_weights())

    def make_actor(self):
        """Build the policy network mapping a state to one bounded action."""
        # Initialize last-layer weights between -3e-3 and 3e-3, which keeps the
        # initial tanh output close to zero.
        last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

        inputs = layers.Input(shape=(num_states,))
        out = layers.Dense(256, activation="relu")(inputs)
        out = layers.Dense(256, activation="relu")(out)
        outputs = layers.Dense(1, activation="tanh", kernel_initializer=last_init)(out)

        # tanh yields values in [-1, 1]; scale to the action range
        # (upper bound is 2.0 for Pendulum).
        outputs = outputs * upper_bound
        model = tf.keras.Model(inputs, outputs)
        return model

    def policy(self, state, noise_object):
        """Return a noisy, clipped action for ``state``.

        Args:
            state: tensor fed directly to the policy network.
            noise_object: currently unused; kept so existing callers keep
                working. Exploration noise is Gaussian with scale 0.25.

        Returns:
            A one-element list holding the action clipped to
            ``[lower_bound, upper_bound]``.
        """
        sampled_actions = tf.squeeze(self.model(state))
        # Adding exploration noise to the deterministic action.
        sampled_actions = sampled_actions.numpy() + np.array([np.random.randn() * 0.25])

        # We make sure action is within bounds
        legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

        return [np.squeeze(legal_action)]

    def update(self, state_batch, action_batch, reward_batch, next_state_batch, critic):
        """Take one gradient step that increases the critic's value of the actor's actions.

        Args:
            state_batch: batch of states the actor is evaluated on.
            action_batch, reward_batch, next_state_batch: unused here; the
                signature mirrors ``Critic.update``.
            critic: object whose ``model`` scores ``[state, action]`` pairs.
        """
        with tf.GradientTape() as tape:
            actions = self.model(state_batch, training=True)
            critic_value = critic.model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(actor_grad, self.model.trainable_variables)
        )

    # This update target parameters slowly
    # Based on rate `tau`, which is much less than one.
    @tf.function
    def update_target(self):
        """Move each target variable a fraction ``tau`` towards the model variable."""
        for (a, b) in zip(self.target.variables, self.model.variables):
            a.assign(b * self.tau + a * (1 - self.tau))

class Critic:
    """DDPG critic: a Q-network plus a target copy updated slowly.

    Reads the notebook-level globals ``num_states`` and ``num_actions`` when
    building the network, and ``gamma`` when ``update`` is called.

    Args:
        lr: learning rate given to the Adam optimizer.
        tau: soft-update rate used by ``update_target``.
    """

    def __init__(self, lr, tau=0.005):
        self.lr = lr
        self.tau = tau
        self.model = self.make_critic()
        self.target = self.make_critic()
        self.optimizer = tf.keras.optimizers.Adam(lr)

        # Making the weights equal initially
        self.target.set_weights(self.model.get_weights())

    def make_critic(self):
        """Build the network mapping a ``[state, action]`` pair to a single value."""
        # State as input (shape given as a tuple, matching the actor's input).
        state_input = layers.Input(shape=(num_states,))
        state_out = layers.Dense(16, activation="relu")(state_input)
        state_out = layers.Dense(32, activation="relu")(state_out)

        # Action as input
        action_input = layers.Input(shape=(num_actions,))
        action_out = layers.Dense(32, activation="relu")(action_input)

        # Both are passed through separate layers before concatenating
        concat = layers.Concatenate()([state_out, action_out])

        out = layers.Dense(256, activation="relu")(concat)
        out = layers.Dense(256, activation="relu")(out)
        outputs = layers.Dense(1)(out)

        # Outputs a single value for the given state-action pair
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

    def update(self, state_batch, action_batch, reward_batch, next_state_batch, actor):
        """Take one gradient step on the mean squared error against the bootstrapped target.

        Args:
            state_batch: batch of states.
            action_batch: batch of actions taken in those states.
            reward_batch: batch of rewards received.
            next_state_batch: batch of resulting states.
            actor: object whose ``target`` model proposes actions for the
                next states.
        """
        with tf.GradientTape() as tape:
            target_actions = actor.target(next_state_batch, training=True)
            # Target value: reward plus the discounted target-critic estimate
            # for the next state (`gamma` is the notebook-level discount).
            y = reward_batch + gamma * self.target(
                [next_state_batch, target_actions], training=True
            )
            critic_value = self.model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        # Gradients are taken only with respect to the online critic's variables.
        critic_grad = tape.gradient(critic_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(critic_grad, self.model.trainable_variables)
        )

    # This update target parameters slowly
    # Based on rate `tau`, which is much less than one.
    @tf.function
    def update_target(self):
        """Move each target variable a fraction ``tau`` towards the model variable."""
        for (a, b) in zip(self.target.variables, self.model.variables):
            a.assign(b * self.tau + a * (1 - self.tau))

# Initialisation des paramètres

In [11]:
# --- Hyper-parameters -------------------------------------------------------
# Scale of the Ornstein-Uhlenbeck noise
std_dev = 0.2

# Learning rates for the critic and actor optimizers
critic_lr = 0.002
actor_lr = 0.001

# Number of training episodes
total_episodes = 100
# Discount factor for future rewards
gamma = 0.99
# Rate used for the soft update of the target networks
tau = 0.005

# --- Objects built from the settings above ---------------------------------
ou_noise = OUActionNoise(
    mean=np.zeros(1),
    std_deviation=np.full(1, float(std_dev)),
)
buffer = ReplayBuffer(buffer_capacity=50000, batch_size=64)
actor = Actor(lr=actor_lr, tau=tau)
critic = Critic(lr=critic_lr, tau=tau)

In [12]:
# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# NOTE: the three settings below are not read by the training loop in this cell.
std_dev = 0.5
start_steps = 300
max_steps = 10000


# Defined in this cell so the loop below runs on a fresh kernel: no other cell
# provides a module-level `update_target`.
@tf.function
def update_target(target_weights, weights, tau):
    """Move each target variable a fraction `tau` towards its source variable.

    Args:
        target_weights: variables of the target network (updated in place).
        weights: variables of the online network, in the same order.
        tau: interpolation rate; much less than one for a slow update.
    """
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))


# Takes about 4 min to train
for ep in range(total_episodes):

    prev_state, _ = env.reset()
    episodic_reward = 0
    step = 0
    while True:
        # The policy network expects a leading batch dimension.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = actor.policy(tf_prev_state, ou_noise)

        state, reward, done, truncated, info = env.step(action)

        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward

        # One critic step and one actor step on the same sampled mini-batch.
        state_batch, action_batch, reward_batch, next_state_batch = buffer.getBatch()
        critic.update(state_batch, action_batch, reward_batch, next_state_batch, actor)
        actor.update(state_batch, action_batch, reward_batch, next_state_batch, critic)

        # Let both target networks slowly track the online networks.
        update_target(actor.target.variables, actor.model.variables, tau)
        update_target(critic.target.variables, critic.model.variables, tau)

        # End this episode when `done` or `truncated` is True
        if done or truncated:
            break

        prev_state = state
        step += 1

    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)

# Plot the running-average reward (y) against the episode index (x).
fig, ax = plt.subplots()
ax.plot(avg_reward_list)
ax.set_xlabel("Episode")
ax.set_ylabel("Avg. Epsiodic Reward")
plt.show()

2023-03-19 00:11:32.937032: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-19 00:11:32.943793: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-19 00:11:33.044532: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Episode * 0 * Avg Reward is ==> -1274.0621085834405
Episode * 1 * Avg Reward is ==> -1352.0426915460303
Episode * 2 * Avg Reward is ==> -1476.45601246527
Episode * 3 * Avg Reward is ==> -1497.6501037370915
Episode * 4 * Avg Reward is ==> -1490.8673294394703
Episode * 5 * Avg Reward is ==> -1495.8463528843538
Episode * 6 * Avg Reward is ==> -1479.3608997260742
Episode * 7 * Avg Reward is ==> -1484.692067012213
Episode * 8 * Avg Reward is ==> -1437.816638500086
Episode * 9 * Avg Reward is ==> -1373.6004297178654
Episode * 10 * Avg Reward is ==> -1320.7125349316145
Episode * 11 * Avg Reward is ==> -1290.8863671071265
Episode * 12 * Avg Reward is ==> -1243.9592661528375
Episode * 13 * Avg Reward is ==> -1174.222391985658
Episode * 14 * Avg Reward is ==> -1113.8244672145163
Episode * 15 * Avg Reward is ==> -1052.7794586045886
Episode * 16 * Avg Reward is ==> -1012.9500965090944
Episode * 17 * Avg Reward is ==> -963.8590143787509
Episode * 18 * Avg Reward is ==> -919.8411018540697
Episode * 


KeyboardInterrupt



Jouons une partie 

In [None]:
# Roll out a single episode with the current actor.
state, _ = env.reset()
# The policy network expects a leading batch dimension.
state = tf.expand_dims(tf.convert_to_tensor(state), 0)
while True:
    # `policy` is a method of the Actor instance; no module-level `policy` exists.
    action = actor.policy(state, ou_noise)
    state, reward, done, truncated, info = env.step(action)
    state = tf.expand_dims(tf.convert_to_tensor(state), 0)
    if done or truncated:
        break
    

In [None]:
# Save the weights of the online networks. The networks are attributes of the
# `actor` / `critic` objects; no standalone model variables exist in this notebook.
actor.model.save_weights("pendulum_actor.h5")
critic.model.save_weights("pendulum_critic.h5")

# Save the weights of the target networks.
actor.target.save_weights("pendulum_target_actor.h5")
critic.target.save_weights("pendulum_target_critic.h5")