In [1]:
import tensorflow as tf
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.models import Sequential
import tensorflow_probability as tfp

import numpy as np
import random
import gym
import matplotlib.pyplot as plt

from collections import deque

# CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING: https://arxiv.org/pdf/1509.02971 
# https://github.com/abhisheksuran/Reinforcement_Learning/blob/master/DDPGwithtau.ipynb




In [2]:
import gym
env = gym.make("LunarLander-v2", continuous=True, render_mode='human')

# Get the state and action sizes
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

print(state_size, action_size)

8 2


In [3]:
state = env.reset()
state = np.reshape(state[0], [1, state_size])
done = False
score = 0
for i in range(100):
    env.render()
    # select action
    action = np.random.uniform(low=-1, high=1, size=(4,))
    # perform the action
    next_state, reward, done, _, _= env.step(action)
    # update the score
    score += reward
    # move to the next state
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state

  if not isinstance(terminated, (bool, np.bool8)):


In [4]:
class Agent():
    def __init__(self, state_size, action_size, lr_actor=0.001, lr_critic=0.001,
                 gamma=0.99, batch_size=64, buffer_size=500000, min_start=1000) -> None:
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        
        self.buffer = deque(maxlen=self.buffer_size)
        self.min_start = min_start
        
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        
        self.actor_main = self.build_actor_network()
        self.actor_target = self.build_actor_network()
        
        self.critic_main = self.build_critic_network()
        self.critic_target = self.build_critic_network()
        
        self.update_target(tau=1.0)
        
        self.opt_actor = tf.keras.optimizers.legacy.Adam(learning_rate=self.lr_actor)
        self.opt_critic = tf.keras.optimizers.legacy.Adam(learning_rate=self.lr_critic)
        
        self.min_action = -1
        self.max_action = 1
        
        self.train_step = 0
        self.replace_step = 10


    def build_actor_network(self):
        # The DNN model used for estimating the action probability using the input as state
        model = Sequential()
        model.add(Dense(128, input_dim=self.state_size, activation='relu'))
        # model.add(Dropout(0.5))
        # model.add(BatchNormalization())
        model.add(Dense(128, activation='relu'))
        # model.add(Dropout(0.5))
        # model.add(BatchNormalization())
        model.add(Dense(self.action_size, activation='tanh'))
        
        return model
    
    def build_critic_network(self):
        # The DNN model used for estimating the value function
        # The input is [state, action], output is the Q(s, a)
        model = Sequential()
        model.add(Dense(128, input_dim=self.state_size + self.action_size, activation='relu'))
        # model.add(Dropout(0.5))
        # model.add(BatchNormalization())
        model.add(Dense(128, activation='relu'))
        # model.add(Dropout(0.5))
        # model.add(BatchNormalization())
        model.add(Dense(1, activation='linear'))

        return model
    
    def select_action(self, state, evaluate=False):
        actions = self.actor_main(state)
        if not evaluate:
            actions += np.random.normal(0, 0.1, (1, action_size))
        actions = np.clip(actions, self.min_action, self.max_action)
        
        return actions[0]
    
    def store_data(self, state, action, reward, next_state, done):
    # Store the data in the buffer
        if len(self.buffer) == self.min_start:
            print("Collect enough samples, training starting")
        elif len(self.buffer) == self.buffer_size:
            print("Buffer memory is full. Discarding first half and moving the rest.")
            # Calculate the number of elements to discard
            num_to_discard = len(self.buffer) // 2
            # Discard the first half of the buffer
            for _ in range(num_to_discard):
                self.buffer.popleft()

        # Append the new data to the buffer
        self.buffer.append([state, action, reward, next_state, done])
    
    def update_target(self, tau=0.005):
        # Iterate through the weights of the target and main models
        for target_weights, main_weights in zip(self.actor_target.weights, self.actor_main.weights):
            # Update the target model weights with a soft update
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)
        
        for target_weights, main_weights in zip(self.critic_target.weights, self.critic_main.weights):
            # Update the target model weights with a soft update
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)
        # self.actor_target.set_weights(self.actor_main.get_weights())
        # self.critic_target.set_weights(self.critic_main.get_weights())

    
    def learn(self):
        if len(self.buffer) < self.min_start:
            return
        # sample a minibatch from the buffer
        minibatch = random.sample(self.buffer, min(self.buffer_size, self.batch_size))
        states, actions, rewards, next_states, dones = zip(*minibatch)
        
        # convert to tensor, we want action to be integer
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)
        
        minibatch = random.sample(self.buffer, min(self.buffer_size, self.batch_size))
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # convert to tensor, we want action to be integer
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)

        with tf.GradientTape() as tape1:
            actions_next_states = self.actor_target(tf.squeeze(next_states))
            Q_value_next_states = tf.squeeze(self.critic_target(tf.concat([tf.squeeze(next_states), actions_next_states], axis=1)))
            y = rewards + self.gamma * Q_value_next_states * (1 - dones)
            
            Q_value_current_states = tf.squeeze(self.critic_main(tf.concat([tf.squeeze(states), actions], axis=1)))
            critic_loss = tf.keras.losses.MSE(y, Q_value_current_states)
            
        with tf.GradientTape() as tape2:
            new_actions = self.actor_main(tf.squeeze(states))
            actor_loss = tf.squeeze(self.critic_main(tf.concat([tf.squeeze(states), new_actions], axis=1)))
            actor_loss = - tf.reduce_mean(actor_loss)
        
        grads1 = tape1.gradient(critic_loss, self.critic_main.trainable_variables)
        self.opt_critic.apply_gradients(zip(grads1, self.critic_main.trainable_variables))
        
        grads2 = tape2.gradient(actor_loss, self.actor_main.trainable_variables)
        self.opt_actor.apply_gradients(zip(grads2, self.actor_main.trainable_variables))
        
        if self.train_step % self.replace_step == 0:
            self.update_target()
        self.train_step += 1

In [5]:
# training
# env = gym.make('BipedalWalker-v3', hardcore=False, render_mode='human')
# env = gym.make('BipedalWalker-v3', hardcore=False)
# env = gym.make("LunarLander-v2", continuous=True, render_mode='human')
env = gym.make("LunarLander-v2", continuous=True)
agent = Agent(state_size, action_size)
score_history = []
avg_score_history = []
n_episodes = 20000

for i in range(n_episodes):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state[0], [1, state_size])
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _, _ = env.step(action)
        next_state = next_state.reshape(1, state_size)
        agent.store_data(state, action, reward, next_state, done)
        agent.learn()
        
        state = next_state
        score += reward
        
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    avg_score_history.append(avg_score)
    print('Episode ', i, '- score ', score, '- average score ', avg_score, '- buffer size', len(agent.buffer) )
    

Episode  0 - score  -76.81468705070478 - average score  -76.81468705070478 - buffer size 67
Episode  1 - score  -177.2147871122919 - average score  -127.01473708149834 - buffer size 149
Episode  2 - score  -111.589781238119 - average score  -121.87308513370522 - buffer size 238
Episode  3 - score  -115.54207700755 - average score  -120.29033310216641 - buffer size 316
Episode  4 - score  -3.896377477973772 - average score  -97.01154197732788 - buffer size 386
Episode  5 - score  -151.07345117203036 - average score  -106.02186017644497 - buffer size 461
Episode  6 - score  -190.9994900842085 - average score  -118.16152159183977 - buffer size 541
Episode  7 - score  -147.21014699137072 - average score  -121.79259976678112 - buffer size 603
Episode  8 - score  -161.76763859518843 - average score  -126.23427074771527 - buffer size 672
Episode  9 - score  -122.79223587046899 - average score  -125.89006725999066 - buffer size 743
Episode  10 - score  -127.86437067460588 - average score  -126

KeyboardInterrupt: 

In [34]:
env = gym.make("LunarLander-v2", continuous=True, render_mode='human')
state = env.reset()
state = np.reshape(state[0], [1, state_size])
done = False
score = 0
while not done:
    action = agent.select_action(state, evaluate=True)
    next_state, reward, done, _, _ = env.step(action)
    next_state = next_state.reshape(1, state_size)
    
    state = next_state
    score += reward

print(score)

201.56086448540046


: 

In [None]:
agent = Agent(action_size=action_size, state_size=state_size)
state = env.reset()
state = np.reshape(state[0], [1, state_size])
done = False
score = 0
for i in range(100):
    env.render()
    # select action
    action = agent.select_action(state)
    # perform the action
    next_state, reward, done, _, _= env.step(action)
    # insert data to the buffer
    agent.store_data(state, action, reward, next_state, done)
    # update the score
    score += reward
    # move to the next state
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state

In [None]:
minibatch = random.sample(agent.buffer, min(agent.buffer_size, agent.batch_size))
states, actions, rewards, next_states, dones = zip(*minibatch)

# convert to tensor, we want action to be integer
states = tf.convert_to_tensor(states, dtype=tf.float32)
actions = tf.convert_to_tensor(actions, dtype=tf.float32)
rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
dones = tf.convert_to_tensor(dones, dtype=tf.float32)

with tf.GradientTape() as tape1:
    actions_next_states = agent.actor_target(tf.squeeze(next_states))
    Q_value_next_states = tf.squeeze(agent.critic_target(tf.concat([tf.squeeze(next_states), actions], axis=1)))
    y = rewards + agent.gamma * Q_value_next_states * (1 - dones)
    
    Q_value_current_states = tf.squeeze(agent.critic_main(tf.concat([tf.squeeze(states), actions], axis=1)))
    critic_loss = tf.reduce_mean(tf.square(y - Q_value_current_states))
    
with tf.GradientTape() as tape2:
    new_actions = agent.actor_main(tf.squeeze(states))
    actor_loss = tf.squeeze(agent.critic_main(tf.concat([tf.squeeze(states), new_actions], axis=1)))
    actor_loss = - tf.reduce_mean(actor_loss)

In [None]:
grads1 = tape1.gradient(critic_loss, agent.critic_main.trainable_variables)
grads2 = tape2.gradient(actor_loss, agent.actor_main.trainable_variables)

In [None]:
grads1

[<tf.Tensor: shape=(28, 64), dtype=float32, numpy=
 array([[ 0.00489324, -0.01370343, -0.00751826, ...,  0.00290921,
          0.00032919,  0.        ],
        [ 0.00013398,  0.00071776,  0.00165345, ..., -0.00038967,
         -0.00037362,  0.        ],
        [ 0.00122247, -0.00283493, -0.00242021, ...,  0.00237815,
         -0.00031732,  0.        ],
        ...,
        [-0.00044105,  0.00032489,  0.00240702, ...,  0.00279206,
          0.00202019,  0.        ],
        [ 0.00058007, -0.00250428, -0.00076549, ...,  0.00330609,
          0.00085936,  0.        ],
        [-0.00018436,  0.00207003,  0.0040815 , ...,  0.00070595,
          0.00075413,  0.        ]], dtype=float32)>,
 <tf.Tensor: shape=(64,), dtype=float32, numpy=
 array([ 3.03025846e-03, -1.56086776e-02, -2.78448518e-02,  4.05354165e-02,
        -4.93559346e-04,  3.47282402e-02,  0.00000000e+00, -1.60076655e-03,
        -2.53549777e-02,  4.64251302e-02,  0.00000000e+00,  0.00000000e+00,
        -2.23054066e-02, -3.74

In [None]:
state

array([ 2.7473355e-03, -2.0769508e-05,  1.6156579e-03, -1.5999861e-02,
        9.1864973e-02, -2.1321205e-03,  8.6032331e-01,  2.9782327e-03,
        1.0000000e+00,  3.2270055e-02, -2.1319729e-03,  8.5387504e-01,
        1.5078549e-03,  1.0000000e+00,  4.4081411e-01,  4.4582021e-01,
        4.6142289e-01,  4.8955029e-01,  5.3410292e-01,  6.0246116e-01,
        7.0914906e-01,  8.8593203e-01,  1.0000000e+00,  1.0000000e+00],
      dtype=float32)

In [None]:
agent.actor_main(state)

ValueError: Exception encountered when calling layer 'sequential_104' (type Sequential).

Input 0 of layer "dense_312" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (24,)

Call arguments received by layer 'sequential_104' (type Sequential):
  • inputs=tf.Tensor(shape=(24,), dtype=float32)
  • training=None
  • mask=None