# DQN v1 batch

In [None]:
# Box2D physics backend needed by the LunarLander environment.
!pip3 install box2d-py
# The extra is named `box2d`; it is quoted so the shell does not treat the
# square brackets as a glob pattern.
!pip3 install 'gym[box2d]'



In [None]:
import gym
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from keras import layers
import matplotlib.pyplot as plt
import random
from collections import deque
import datetime

Disable GPU computation for local devices


In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

### Utility function to transform discrete states into a one-hot vector

In [None]:
def discrete_input(state_discrete: tuple, env_dim: "gym.spaces.tuple.Tuple"):
    """Encode a tuple of discrete observations as one concatenated one-hot vector.

    Args:
        state_discrete: one value per sub-space, each a valid position in the
            corresponding sub-space (ints or bools).
        env_dim: iterable of sub-spaces, each exposing its size as ``.n``.

    Returns:
        A 1-D float array of length ``sum(dim.n for dim in env_dim)`` holding
        one block per sub-space, each block with a single 1.
    """
    one_hot_state = []
    for i_pos, dim in zip(state_discrete, env_dim):
        temp = np.zeros(dim.n)
        # int() matters: NumPy treats a bool index as a mask, so
        # `temp[True] = 1` would set every slot and `temp[False] = 1` none.
        temp[int(i_pos)] = 1
        one_hot_state.append(temp)

    return np.concatenate(one_hot_state)

## Gym selection and basic configurations

In [None]:
# Per-environment settings, indexed by `env_i`.
#   name  - id passed to gym.make
#   goal  - running-reward threshold at which training is considered solved
#   v_min / v_max - lower / upper reward bounds (not referenced elsewhere in
#                   this notebook; presumably kept for plotting - TODO confirm)
#   ep    - number of episodes between progress log lines
gym_name_list = [
    {
        'name': 'CartPole-v0',
        'goal': 180,
        'v_min': 0,
        'v_max': 210,
        'ep': 50,
    },
    {
        'name': 'MountainCar-v0',
        'goal': -150,
        'v_min': -210,
        'v_max': 0,
        'ep': 20,
    },
    {
        'name': 'Blackjack-v0',
        'goal': 0.10,
        'v_min': -20,
        'v_max': 20,
        'ep': 1000,
    },
    {
        'name': 'LunarLander-v2',
        'goal': 200,
        # Bounds were swapped (v_min=250, v_max=-250); min must be below max.
        'v_min': -250,
        'v_max': 250,
        'ep': 50,
    },
]

# Index into `gym_name_list` selecting the environment to train on.
env_i = 3 #@param {type:"slider", min:0, max:3, step:1}

save_model: bool = False # @param {type:"boolean"}
show_plots: bool = True # @param {type:"boolean"}
render_env: bool = False # @param {type:"boolean"}
seed = 42 # @param {type:"integer"}

max_steps_per_episode = 400 # @param {type:"integer"}

stopping_reward_criteria = gym_name_list[env_i]['goal']

gym_name = gym_name_list[env_i]['name']

env = gym.make(gym_name)  # Create the environment

# Seed every random source the notebook draws from. `env.seed` alone leaves
# the action-space sampler, NumPy (epsilon-greedy draw), `random` (replay
# sampling) and TensorFlow (weight initialisation) unseeded.
env.seed(seed)
env.action_space.seed(seed)
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Keep a reference to the original space so the wrapper's transform does not
# depend on the later rebinding of `env`.
raw_observation_space = env.observation_space

if isinstance(raw_observation_space, gym.spaces.tuple.Tuple):
    # Tuple-of-discrete observations are fed to the network as one
    # concatenated one-hot vector.
    env = gym.wrappers.TransformObservation(
        env, lambda obs: discrete_input(obs, raw_observation_space))
    num_inputs = sum(space.n for space in raw_observation_space)
else:
    num_inputs = raw_observation_space.shape[0]
num_actions = env.action_space.n

## Algorithm hyper-parameters

In [None]:
# Smoothing factor of the exponential moving average (EMA) that tracks the
# per-episode reward
ema_ratio = 0.01  # @param {type:"number"}

# Number of environment steps between training updates: one batch is sampled
# and trained on whenever the step counter is a multiple of this value
training_ratio: int = 4 # @param  {type:"integer"}

# Size of the batch when sampling experiences
batch_size: int = 32 # @param {type:"integer"}

# Size of the buffer that stores the experiences
mem_length: int = 4096 # @param {type:"integer"}

# Discount factor for estimating the future rewards
gamma: float = 0.99  # @param {type:"number"}

# Initial and final probability of choosing exploration over exploitation
epsilon: float = 1.0 # @param {type:"number"}
epsilon_min: float = 0.05 # @param {type:"number"}

# Estimated number of action selections in a run; epsilon decays from its
# initial value to epsilon_min after roughly this many selections
approx_iterations: float = 5e6 # @param {type:"number"}

# Multiplicative decay applied to epsilon on every action selection
epsilon_decay: float = (epsilon_min / epsilon) ** (1 / approx_iterations)

# Interpolation factor of the soft target update:
# target <- tau * online + (1 - tau) * target
tau: float = 0.125 # @param {type:"number"}

# Step size passed to the Adam optimizer
# NOTE(review): 0.05 looks large for Adam; the recorded run shows the loss
# growing by orders of magnitude - consider a smaller value, TODO confirm
learning_rate = 0.05 # @param {type:"number"}

# Enables double DQN learning when choosing the next Q-values
double_dqn_learning: bool = True # @param {type:"boolean"}

# Heuristic for the hidden layer width: inputs * actions * this factor
hidden_size_factor = 16 # @param {type:"integer"}
num_hidden = num_inputs * num_actions * hidden_size_factor

## Load DQN models as Q-table approximators
We start with double DQN, where:
*   q_model estimates the Q-values used for action selection.
*   t_model is responsible for estimating the target Q values on training.

In [None]:
def _build_q_network(name):
    """Return a one-hidden-layer MLP mapping a state vector to one Q-value per action."""
    return keras.Sequential(layers=[layers.Input(shape=(num_inputs,)),
                                    layers.Dense(num_hidden, activation="relu"),
                                    layers.Dense(num_actions)],
                            name=name)


# Online network: selects actions and receives the gradient updates.
q_model = _build_q_network("q_model")

# Target network: provides the bootstrap targets during training.
t_model = _build_q_network("t_model")

# Start the target network as an exact copy of the online network; otherwise
# the first TD targets come from an unrelated random initialisation.
t_model.set_weights(q_model.get_weights())

optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
# loss_function = keras.losses.Huber()
loss_function = keras.losses.MeanSquaredError()


# Preprocessing to convert the state into a tensor
def state_to_tensor(state):
  """Convert `state` to a tensor and prepend a batch axis of size 1."""
  as_tensor = tf.convert_to_tensor(state)
  return tf.expand_dims(as_tensor, 0)


Just checking the model can be saved. Not needed for the training.

In [None]:
# implementation = "DQN_v1_LunarLanding_v2"
# episode_count = 0
# model_folder = os.path.join("./models", gym_name, implementation, "Ep_" + str(episode_count).zfill(5), "model")
# if not os.path.exists(model_folder):
#   os.makedirs(model_folder)
# q_model.save_weights(filepath=model_folder, save_format="tf")

### Train model (agent) from past experiences

In [72]:
def train_agent(samples):
  """Run one gradient step of `q_model` on a batch of stored transitions.

  Args:
    samples: non-empty sequence of `[state, action, reward, next_state, done]`
      transitions. `state` and `next_state` are tensors with a leading batch
      axis of size 1 (as produced by `state_to_tensor`); `action` is an
      integer index, `reward` a number and `done` a boolean.

  Returns:
    The loss of this step as returned by `loss.numpy()`.
  """
  states, actions, rewards, next_states, dones = zip(*samples)

  # States already carry a batch axis, so they are joined along it.
  state_batch = tf.concat(list(states), axis=0)
  next_state_batch = tf.concat(list(next_states), axis=0)

  # Scalars are packed into 1-D tensors explicitly; passing Python scalars to
  # tf.concat only works through legacy scalar handling.
  action_batch = tf.constant(np.asarray(actions, dtype=np.int32))
  reward_batch = tf.constant(np.asarray(rewards, dtype=np.float32))
  not_done_batch = tf.constant(1.0 - np.asarray(dones, dtype=np.float32))

  # Create a mask so we only calculate loss on the updated Q-values
  masks = tf.one_hot(action_batch, num_actions)

  # Build the updated Q-values for the sampled future states
  # Use the target model for stability
  future_t = t_model(next_state_batch)

  if double_dqn_learning:
    # Double DQN: the online network picks the next action, the target
    # network evaluates it.
    future_q = q_model(next_state_batch)
    best_future_action = tf.argmax(future_q, axis=-1)
    next_action_mask = tf.one_hot(best_future_action, num_actions)
    future_q_action = tf.reduce_sum(tf.multiply(future_t, next_action_mask), axis=1)
  else:
    future_q_action = tf.reduce_max(future_t, axis=1)

  # Q value = reward + discount factor * expected future reward
  # (the future term is zeroed for terminal transitions)
  updated_q_values = reward_batch + gamma * tf.multiply(future_q_action, not_done_batch)

  with tf.GradientTape() as tape:
    q_values = q_model(state_batch)

    # Apply the masks to the Q-values to get the Q-value for action taken
    q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
    # Keras losses take (y_true, y_pred): the TD target is the "true" value.
    loss = loss_function(updated_q_values, q_action)

  # Backpropagation, done outside the tape context so the gradient ops
  # themselves are not recorded.
  grads = tape.gradient(loss, q_model.trainable_variables)
  optimizer.apply_gradients(zip(grads, q_model.trainable_variables))

  return loss.numpy()

### Transfer weights to target model

In [None]:
def learning_transfer(q_model, t_model, tau):
  """Blend the weights of `q_model` into `t_model` (soft update).

  Each weight of `t_model` becomes `tau * q_weight + (1 - tau) * t_weight`.
  `q_model` is left untouched.
  """
  source = q_model.get_weights()
  blended = [source[idx] * tau + current * (1 - tau)
             for idx, current in enumerate(t_model.get_weights())]
  t_model.set_weights(blended)


### Epsilon-greedy function to select an action

In [None]:
def agent_best_action(state):
  """Return the index of the largest Q-value `q_model` predicts for `state`."""
  q_values = q_model(state, training=False)
  return np.argmax(q_values)

def select_action(state, epsilon):
  """Pick an action epsilon-greedily and return it with the decayed epsilon.

  Epsilon is first multiplied by `epsilon_decay` and clamped to at least
  `epsilon_min`; the decayed value is the one used for the draw.

  Returns:
    Tuple `(action, epsilon)` with the chosen action and the updated epsilon.
  """
  epsilon = max(epsilon_min, epsilon * epsilon_decay)

  explore = np.random.random() < epsilon
  action = env.action_space.sample() if explore else agent_best_action(state)
  return action, epsilon

## Tensorboard configuration

In [None]:
# Each run logs to logs/<gym name>/<implementation>/T_<start timestamp>
implementation = "DQN_v1_LunarLanding_v2"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_path_parts = ("logs", gym_name, implementation, "T_" + current_time)
train_log_dir = os.path.join(*log_path_parts)
summary_writer = tf.summary.create_file_writer(train_log_dir)

In [93]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# from tensorboard import notebook
# notebook.list() # View open TensorBoard instances

# # Control TensorBoard display. If no port is provided, 
# # the most recently launched TensorBoard is used
# notebook.display(port=6006, height=1000) 

# Show TensorBoard inline, reading the event files written under ./logs
%tensorboard --logdir ./logs

# # IF TENSORBOARD DOES NOT LOAD TRY THIS:
# %reload_ext tensorboard
# %tensorboard --logdir ./logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 400), started 0:49:27 ago. (Use '!kill 400' to kill it.)

<IPython.core.display.Javascript object>

## Main learning loop
It consists of two main steps:

1.   Collect experiences from the environment in episodes, with exploitation/exploration trade-off
2.   Sample past experiences to train the model using the Bellman equation.

Other sections are the mean reward tracking, stop-learning trigger, display status.


In [73]:
# Main training loop: play episodes with an epsilon-greedy policy, store every
# transition in the replay buffer and periodically train on a random batch.
# Runs until the running reward exceeds `stopping_reward_criteria`.
memory = deque(maxlen=mem_length)  # Replay buffer; oldest entries are evicted when full
running_reward = None  # EMA of the episode rewards; None until the first episode ends
episode_count = 0
epoch = 0  # Count of logged training steps, used as the TensorBoard step
historic_reward = []
loss = None  # Latest training loss; None until the first gradient step
while True:  # Run until solved

  state = env.reset()
  state = state_to_tensor(state)

  episode_reward = 0
  # 1-based counter with an inclusive upper bound so that an episode can run
  # for exactly `max_steps_per_episode` steps.
  for time_step in range(1, max_steps_per_episode + 1):
    # env.render(); Adding this line would show the attempts
    # of the agent in a pop up window.

    action, epsilon = select_action(state, epsilon)

    # Apply the sampled action in our environment
    next_state, reward, done, _ = env.step(action)
    next_state = state_to_tensor(next_state)

    memory.append([state, action, reward, next_state, done])
    episode_reward += reward

    # ##### TRAIN MODEL ###############
    # Wait until the buffer holds at least two batches, then train once every
    # `training_ratio` environment steps.
    if len(memory) >= 2 * batch_size and time_step % training_ratio == 0:
      samples = random.sample(memory, batch_size)

      loss = train_agent(samples)

      # Transfer weights to target model
      learning_transfer(q_model, t_model, tau)

      # The running reward only exists after the first completed episode.
      if running_reward is not None:
        with summary_writer.as_default():
          tf.summary.scalar('loss', loss, step=epoch)
          tf.summary.scalar('ema_reward', running_reward, step=epoch)
          tf.summary.scalar('epsilon', epsilon, step=epoch)
        epoch += 1
    state = next_state

    if done:
      break

  if running_reward is None:
    running_reward = episode_reward

  # Update running reward to check condition for solving
  running_reward = ema_ratio * episode_reward + (1 - ema_ratio) * running_reward
  historic_reward.append(running_reward)

  # Log details (skipped until at least one training step has produced a loss)
  episode_count += 1
  if episode_count % gym_name_list[env_i]['ep'] == 0 and loss is not None:
    template = "running reward: {:.2f} at episode {} with epsilon {:.2f} and loss {:.2f}"
    print(template.format(running_reward, episode_count, epsilon, loss))

  # Condition to consider the task solved
  if running_reward > stopping_reward_criteria:
    print("Solved at episode {}!".format(episode_count))
    break

  # NOTE(review): with this interval the plot is effectively never shown -
  # presumably intentional, TODO confirm.
  if show_plots and episode_count % 10000000 == 0:
    plt.plot(historic_reward)
    plt.show()

running reward: -137.19 at episode 50 with epsilon 0.98 and loss 905.21
running reward: -156.45 at episode 100 with epsilon 0.98 and loss 5952.27
running reward: -176.62 at episode 150 with epsilon 0.97 and loss 2913.18
running reward: -170.99 at episode 200 with epsilon 0.97 and loss 13543.19
running reward: -177.38 at episode 250 with epsilon 0.97 and loss 3829.33
running reward: -182.64 at episode 300 with epsilon 0.96 and loss 27144.95
running reward: -186.00 at episode 350 with epsilon 0.96 and loss 99537.86
running reward: -197.66 at episode 400 with epsilon 0.96 and loss 2528225.25


KeyboardInterrupt: ignored

## Save model weights for a later use (optional).

In [None]:
# Persist the online network's weights under
# ./models/<gym name>/<implementation>/Ep_<episode count>/model
if save_model:
  model_folder = os.path.join("./models", gym_name, implementation, "Ep_" + str(episode_count).zfill(5), "model")
  # exist_ok=True replaces the check-then-create pair, which could fail if the
  # directory appeared between the check and the call.
  os.makedirs(model_folder, exist_ok=True)
  q_model.save_weights(filepath=model_folder, save_format="tf")

## Play with the trained agent

In [None]:
# Evaluate the trained agent: play `episodes` episodes and collect the total
# reward of each one.
episodes = 100 # @param {type:"integer"}
deterministic = True # @param {type:"boolean"}

agent_rewards = []
# The loop variable must not be `env_i`: that name holds the environment
# selection and is read again if the training cell is re-run.
for episode_idx in range(episodes):
  state = env.reset()
  episode_reward = 0

  # Inclusive upper bound so an episode can last exactly
  # `max_steps_per_episode` steps.
  for time_step in range(1, max_steps_per_episode + 1):
    if render_env and gym_name != 'Blackjack-v0':
      env.render()  # Show the attempts of the agent in a pop up window.

    state = state_to_tensor(state)

    if deterministic:
      action = agent_best_action(state)
    else:
      # Fixed exploration rate for evaluation; the returned epsilon is unused.
      action, epsilon0 = select_action(state, 0.2)

    # Apply the sampled action in our environment
    state, reward, done, _ = env.step(action)
    episode_reward += reward

    if done:
      break
  agent_rewards.append(episode_reward)

# Report the configured episode count instead of a hard-coded 100.
print(f"After {episodes} episodes the mean reward is {np.mean(agent_rewards)}")

# Histogram of the per-episode rewards collected during evaluation.
if show_plots:
  num_bins = 50
  x = np.array(agent_rewards)
  fig, ax = plt.subplots()

  # Normalised histogram; the title reports mean and standard deviation.
  n, bins, patches = ax.hist(x, bins=num_bins, density=1)

  ax.set_title(f'Mean {np.mean(x).round(2)} +/- {np.std(x).round(2)}')
  ax.set_xlabel('Episode rewards')
  ax.set_ylabel('Probability density')

  # Adjust the layout so the y label is not clipped
  fig.tight_layout()
  plt.show()

print("End of script!")