In [7]:
import tensorflow as tf
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.models import Sequential

import numpy as np
import random
import gym
import matplotlib.pyplot as plt

from collections import deque

# Reference: Lillicrap et al., "Continuous Control with Deep Reinforcement Learning" (DDPG): https://arxiv.org/pdf/1509.02971




In [8]:
# Continuous-action LunarLander, rendered in a window
env = gym.make("LunarLander-v2", continuous=True, render_mode='human')

# Length of the observation vector and of the action vector
state_size, action_size = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

print(f"{state_size} {action_size}")

8 2


In [9]:
# Smoke test: drive the lander with uniformly random actions for at most 100 steps.
state, _ = env.reset()
state = np.reshape(state, [1, state_size])
done = False
score = 0
for i in range(100):
    # No explicit env.render(): the env was created with render_mode='human',
    # which draws a frame on every step.
    # select action: one random value in [-1, 1) per action dimension
    action = np.random.uniform(low=-1, high=1, size=(action_size,))
    # perform the action
    next_state, reward, terminated, truncated, _ = env.step(action)
    # update the score
    score += reward
    # move to the next state
    state = np.reshape(next_state, [1, state_size])
    # stop as soon as the episode is over instead of stepping a finished env
    done = terminated or truncated
    if done:
        break

KeyboardInterrupt: 

In [10]:
class Agent():
    """
    Deep Deterministic Policy Gradient (DDPG) agent for continuous actions.

    Holds an actor network (state -> action in [-1, 1]) and a critic network
    ([state, action] -> Q-value), a target copy of each, and a replay buffer
    of [state, action, reward, next_state, done] transitions.

    Args:
        state_size: length of the observation vector.
        action_size: length of the action vector.
        lr_actor: Adam learning rate for the actor.
        lr_critic: Adam learning rate for the critic.
        gamma: discount factor used in the Bellman target.
        batch_size: number of transitions sampled per learn() call.
        buffer_size: maximum number of transitions kept in the replay buffer.
        min_start: number of stored transitions required before learn()
            starts updating the networks.
        tau: mixing factor of the soft target update performed by learn().
        replace_step: number of learn() updates between two soft target
            updates (1 = after every update, as in the DDPG paper).
    """

    def __init__(self, state_size, action_size,
                 lr_actor=0.001, lr_critic=0.001,
                 gamma=0.95, batch_size=32,
                 buffer_size=10**6, min_start=10000,
                 tau=0.005, replace_step=1) -> None:
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        # Oldest transitions are dropped automatically once the buffer is full
        self.buffer = deque(maxlen=self.buffer_size)
        self.min_start = min_start

        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        self.actor_main = self.build_actor_network()
        self.actor_target = self.build_actor_network()

        self.critic_main = self.build_critic_network()
        self.critic_target = self.build_critic_network()

        # tau=1.0 copies the main weights verbatim, so both pairs start identical
        self.update_target(tau=1.0)

        self.opt_actor = tf.keras.optimizers.legacy.Adam(learning_rate=self.lr_actor)
        self.opt_critic = tf.keras.optimizers.legacy.Adam(learning_rate=self.lr_critic)

        # Bounds used to clip the (noisy) actor output
        self.min_action = -1
        self.max_action = 1

        self.tau = tau
        self.train_step = 0
        # A soft update with a small tau is meant to run after every gradient
        # step; running it only every 100 steps left the targets almost frozen.
        self.replace_step = max(1, int(replace_step))

    def build_actor_network(self):
        """
        Build the actor network: state -> action, squashed to [-1, 1] by tanh.
        """
        # NOTE(review): Dropout only takes effect when a model is called with
        # training=True, which this class never does - confirm it is intended.
        model = Sequential()
        model.add(Dense(128, input_dim=self.state_size, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(self.action_size, activation='tanh'))

        return model

    def build_critic_network(self):
        """
        Build the critic network used for estimating the action-value function.
        The input is the concatenation [state, action], the output is Q(s, a).
        """
        model = Sequential()
        model.add(Dense(128, input_dim=self.state_size + self.action_size, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='linear'))

        return model

    def select_action(self, state, evaluate=False):
        """
        Return the action chosen by the main actor for the given state.

        Args:
            state: batch of observations; only the first row's action is returned.
            evaluate (bool, optional): False for training (Gaussian exploration
                noise with std 0.2 is added), True for testing (no noise).

        Returns:
            float32 array of length action_size, clipped to
            [min_action, max_action].
        """
        actions = np.asarray(self.actor_main(state), dtype=np.float32)
        if not evaluate:
            # Exploration noise, sized from the agent's own action dimension
            # rather than a notebook-level global.
            noise = np.random.normal(0, 0.2, (1, self.action_size))
            actions = actions + noise.astype(np.float32)
        # clip since the action may leave the valid range after adding noise
        actions = np.clip(actions, self.min_action, self.max_action)

        return actions[0]

    def store_data(self, state, action, reward, next_state, done):
        """
        Append one transition to the replay buffer.

        Prints a one-line notice when the buffer already holds min_start
        transitions at the moment of the call.
        """
        if len(self.buffer) == self.min_start:
            print("Collect enough samples, training starting")
        # Append the new data to the buffer
        self.buffer.append([state, action, reward, next_state, done])

    def update_target(self, tau=0.005):
        """
        Update the target networks using a soft update:
        target <- tau * main + (1 - tau) * target.
        """
        # Iterate through the weights of the target and main models
        for target_weights, main_weights in zip(self.actor_target.weights, self.actor_main.weights):
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)

        for target_weights, main_weights in zip(self.critic_target.weights, self.critic_main.weights):
            target_weights.assign(tau * main_weights + (1 - tau) * target_weights)

    @staticmethod
    def _stack(values, width=None):
        """Stack per-transition values into float32: (batch, width), or (batch,) if width is None."""
        rows = np.asarray([np.ravel(v) for v in values], dtype=np.float32)
        return rows.reshape(-1, width) if width is not None else rows.reshape(-1)

    def learn(self):
        """
        Run one DDPG update from a random minibatch of the replay buffer.

        Does nothing until the buffer holds at least min_start transitions.
        Otherwise: fits the critic to the Bellman target built from the target
        networks, moves the actor towards actions with a higher critic value,
        and soft-updates the targets every replace_step updates.
        """
        if not self.buffer or len(self.buffer) < self.min_start:
            return
        # sample a minibatch; never ask for more transitions than are stored
        minibatch = random.sample(self.buffer, min(len(self.buffer), self.batch_size))
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Explicit reshapes keep the batch axis even for a batch of one, and
        # going through numpy turns boolean done flags into 0.0 / 1.0.
        states = tf.convert_to_tensor(self._stack(states, self.state_size))
        actions = tf.convert_to_tensor(self._stack(actions, self.action_size))
        rewards = tf.convert_to_tensor(self._stack(rewards))
        next_states = tf.convert_to_tensor(self._stack(next_states, self.state_size))
        dones = tf.convert_to_tensor(self._stack(dones))

        # Bellman target from the target networks; it is a constant with
        # respect to the main critic, so it is computed outside the tape.
        actions_next_states = self.actor_target(next_states)
        Q_value_next_states = tf.squeeze(
            self.critic_target(tf.concat([next_states, actions_next_states], axis=1)), axis=1)
        y = rewards + self.gamma * Q_value_next_states * (1 - dones)

        with tf.GradientTape() as tape1:
            Q_value_current_states = tf.squeeze(
                self.critic_main(tf.concat([states, actions], axis=1)), axis=1)
            critic_loss = tf.reduce_mean(tf.square(y - Q_value_current_states))

        with tf.GradientTape() as tape2:
            new_actions = self.actor_main(states)
            # Maximising Q(s, actor(s)) == minimising its negative mean
            actor_loss = - tf.reduce_mean(
                self.critic_main(tf.concat([states, new_actions], axis=1)))

        grads1 = tape1.gradient(critic_loss, self.critic_main.trainable_variables)
        self.opt_critic.apply_gradients(zip(grads1, self.critic_main.trainable_variables))

        grads2 = tape2.gradient(actor_loss, self.actor_main.trainable_variables)
        self.opt_actor.apply_gradients(zip(grads2, self.actor_main.trainable_variables))

        if self.train_step % self.replace_step == 0:
            self.update_target(self.tau)
        self.train_step += 1

In [11]:
# training
env = gym.make("LunarLander-v2", continuous=True)
agent = Agent(state_size, action_size)
score_history = []
avg_score_history = []
n_episodes = 20000

for i in range(n_episodes):
    done = False
    score = 0
    state, _ = env.reset()
    state = np.reshape(state, [1, state_size])
    time_step = 0  # to count number of steps the ship takes to land
    while not done:
        time_step += 1
        action = agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = next_state.reshape(1, state_size)
        # Store only `terminated` as the done flag: a truncated episode did not
        # reach a terminal state, so the critic should still bootstrap from it.
        agent.store_data(state, action, reward, next_state, terminated)
        agent.learn()
        state = next_state
        score += reward
        # End the episode on truncation as well; otherwise a lander that
        # never lands or crashes keeps the loop running forever.
        done = terminated or truncated

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    avg_score_history.append(avg_score)
    print('Episode ', i, '- score ', score, '- average score ', avg_score, '- buffer size', len(agent.buffer), '- timestep ', time_step )
    

Episode  0 - score  -452.4157179934169 - average score  -452.4157179934169 - buffer size 121 - timestep  121
Episode  1 - score  -52.98986070397111 - average score  -252.70278934869398 - buffer size 232 - timestep  111
Episode  2 - score  -303.3476710884797 - average score  -269.5844165952892 - buffer size 339 - timestep  107
Episode  3 - score  -506.41198045618205 - average score  -328.79130756051245 - buffer size 469 - timestep  130
Episode  4 - score  -39.77006577747293 - average score  -270.98705920390455 - buffer size 569 - timestep  100
Episode  5 - score  -484.5778481870514 - average score  -306.585524034429 - buffer size 947 - timestep  378
Episode  6 - score  -481.7832219762043 - average score  -331.6137665975398 - buffer size 1069 - timestep  122
Episode  7 - score  -386.36355233068485 - average score  -338.45748981418285 - buffer size 1156 - timestep  87
Episode  8 - score  -326.27083898432653 - average score  -337.10341749975436 - buffer size 1266 - timestep  110
Episode  9

KeyboardInterrupt: 

: 

In [None]:
# Watch one episode of the current policy, without exploration noise.
env = gym.make("LunarLander-v2", continuous=True, render_mode='human')
state, _ = env.reset()
state = np.reshape(state, [1, state_size])
done = False
score = 0
while not done:
    action = agent.select_action(state, evaluate=True)
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Stop on truncation too, so a hovering lander cannot stall the cell
    done = terminated or truncated

    state = next_state.reshape(1, state_size)
    score += reward

print(score)

In [None]:
# Debug helper: collect 100 transitions with a fresh agent.
# NOTE: this rebinds `agent`, replacing any agent trained in an earlier cell.
agent = Agent(action_size=action_size, state_size=state_size)
state, _ = env.reset()
state = np.reshape(state, [1, state_size])
done = False
score = 0
for i in range(100):
    # select action
    action = agent.select_action(state)
    # perform the action
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    # reshape BEFORE storing so state and next_state share the (1, state_size) layout
    next_state = np.reshape(next_state, [1, state_size])
    # insert data to the buffer; only a real termination is stored as done
    agent.store_data(state, action, reward, next_state, terminated)
    # update the score
    score += reward
    # move to the next state, starting a new episode when this one is over
    if done:
        state, _ = env.reset()
        state = np.reshape(state, [1, state_size])
    else:
        state = next_state

In [None]:
# Debug helper: reproduce the loss computation of one learning step by hand.
# Never ask for more transitions than the buffer currently holds.
minibatch = random.sample(agent.buffer, min(len(agent.buffer), agent.batch_size))
states, actions, rewards, next_states, dones = zip(*minibatch)

# convert to float32 tensors with an explicit (batch, features) layout;
# actions are continuous, and the boolean done flags become 0.0 / 1.0
states = tf.reshape(tf.convert_to_tensor(states, dtype=tf.float32), [-1, agent.state_size])
actions = tf.reshape(tf.convert_to_tensor(actions, dtype=tf.float32), [-1, agent.action_size])
rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
next_states = tf.reshape(tf.convert_to_tensor(next_states, dtype=tf.float32), [-1, agent.state_size])
dones = tf.convert_to_tensor(np.asarray(dones, dtype=np.float32))

# Bellman target: the target critic scores the TARGET ACTOR's action in the
# next state, not the action that was replayed from the buffer.
actions_next_states = agent.actor_target(next_states)
Q_value_next_states = tf.squeeze(
    agent.critic_target(tf.concat([next_states, actions_next_states], axis=1)), axis=1)
y = rewards + agent.gamma * Q_value_next_states * (1 - dones)

with tf.GradientTape() as tape1:
    Q_value_current_states = tf.squeeze(
        agent.critic_main(tf.concat([states, actions], axis=1)), axis=1)
    critic_loss = tf.reduce_mean(tf.square(y - Q_value_current_states))

with tf.GradientTape() as tape2:
    new_actions = agent.actor_main(states)
    actor_loss = tf.squeeze(agent.critic_main(tf.concat([states, new_actions], axis=1)), axis=1)
    actor_loss = - tf.reduce_mean(actor_loss)

In [None]:
# Gradient of the critic loss w.r.t. the main critic's trainable variables
grads1 = tape1.gradient(critic_loss, agent.critic_main.trainable_variables)
# Gradient of the actor loss w.r.t. the main actor's trainable variables
grads2 = tape2.gradient(actor_loss, agent.actor_main.trainable_variables)

In [None]:
# Inspect the critic gradients computed in the previous cell
grads1

In [None]:
# Inspect the most recent state left behind by the collection loop
state

In [None]:
# Forward pass of the main actor on that state (raw output, no exploration noise or clipping)
agent.actor_main(state)