Code based on https://www.baeldung.com/cs/reinforcement-learning-neural-network

In [None]:
import base64
import imageio
import IPython
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gym
import tensorflow as tf
from datetime import datetime

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym, tf_py_environment, TimeLimit
from tf_agents.environments.gym_wrapper import GymWrapper
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import random_tf_policy, epsilon_greedy_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable between runs. Only NumPy is seeded here.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


def now():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string.

    Used to prefix the training log lines with a timestamp.
    """
    # ``%m`` is the month; ``%M`` (minutes) belongs only in the time part.
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Create the environment with slight adjustments

In [None]:
class NegativeRewardOnDeadEnv(gym.Wrapper):
    """Gym wrapper that replaces the wrapped environment's reward signal.

    The reward returned by the wrapped environment is discarded. Instead:

    * a step that lands on ``gold_state`` is rewarded with ``10``;
    * every other step is rewarded with ``-1``. This covers falling into
      one of the ``dead_states`` (the water) as well as ordinary moves.

    NOTE(review): the state indices presumably match the default 4x4
    FrozenLake map (holes at 5, 7, 11, 12 and the goal at 15) -- confirm
    if a different map is used.
    """

    def __init__(self, env):
        super().__init__(env)
        # Kept as a list: other code concatenates it with further lists.
        self.dead_states = [5, 7, 11, 12]
        self.gold_state = 15

    def step(self, action):
        """Forward ``action`` to the wrapped env and substitute the reward.

        Returns the same ``(observation, reward, done, info)`` 4-tuple shape
        as the wrapped environment, with only ``reward`` changed.
        """
        ob, _, done, info = self.env.step(action)
        # Single rule: +10 on the goal, -1 everywhere else (dead states
        # included), which is what the previous if / if-else chain computed.
        reward = 10 if ob == self.gold_state else -1
        return ob, reward, done, info


env_name = 'FrozenLake-v1'

# Two independent, identically configured environments: one for collecting
# training experience and one reserved for evaluation.
train_gym_env, eval_gym_env = (
    NegativeRewardOnDeadEnv(gym.make(env_name, is_slippery=False))
    for _ in range(2)
)
eval_gym_env.render()

In [None]:
# Adapt each gym environment for TF-Agents and limit it with duration=50.
train_py_env, eval_py_env = (
    TimeLimit(GymWrapper(gym_env), duration=50)
    for gym_env in (train_gym_env, eval_gym_env)
)

In [None]:
# Show the observation, reward and action specs of the training environment.
ts_spec = train_py_env.time_step_spec()
for heading, spec in (
    ('Observation Spec:\n', ts_spec.observation),
    ('Reward Spec:\n', ts_spec.reward),
    ('Action Spec:\n', train_py_env.action_spec()),
):
    print(heading, spec)

In [None]:
# Wrap both Python environments so that they exchange tensors instead of
# numpy arrays.
train_env, eval_env = map(
    tf_py_environment.TFPyEnvironment,
    (train_py_env, eval_py_env),
)

# Create the agent and its Q-network

In [None]:
tf.keras.backend.clear_session()

# Q-network: one-hot encode the integer state (depth 16), pass it through a
# 20-unit ReLU layer, and finish with a linear Dense layer that uses one
# output node per action.
num_actions = train_py_env.action_spec().num_values

one_hot_layer = tf.keras.layers.Lambda(lambda x: tf.one_hot(x, depth=16))

hidden_layer = tf.keras.layers.Dense(
    20,
    activation=tf.keras.activations.relu,
    name='input_layer',
)

# Small uniform kernel initialisation and a zero bias for the Q-value head.
output_layer = tf.keras.layers.Dense(
    num_actions,
    activation='linear',
    kernel_initializer=tf.keras.initializers.RandomUniform(minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(0.0),
    name='output_layer',
)

network_layers = [one_hot_layer, hidden_layer, output_layer]

q_net = sequential.Sequential(network_layers)

In [None]:
# Counter handed to the agent as its train step counter.
train_step_counter = tf.Variable(0)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

# NOTE(review): epsilon_greedy starts above 1; the training cell lowers it by
# 1 / num_epsilon_greedy_steps per iteration down to a floor of 0.1, so the
# value presumably just sets the length of the decay -- confirm.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    epsilon_greedy=1.1,
    target_update_period=1000,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=0.99,
    train_step_counter=train_step_counter,
)
agent.initialize()

# Print the layer summary of the agent's Q-network.
agent._q_network.summary()

# Setup policies

In [None]:
# Random agent, used as a baseline and for the initial data collection.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

# Policies provided by the DQN agent itself.
collect_policy = agent.collect_policy  # epsilon-greedy policy
eval_policy = agent.policy  # greedy policy

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Run ``policy`` for ``num_episodes`` episodes and average the results.

    Args:
        environment: object exposing ``reset()`` and ``step(action)``, both
            returning a time step with ``is_last()`` and ``reward``.
        policy: object whose ``action(time_step)`` returns something with an
            ``action`` attribute.
        num_episodes: number of episodes to play; must be at least 1.

    Returns:
        A ``(avg_return, avg_steps)`` tuple: the summed reward and the number
        of steps taken, each divided by ``num_episodes``.

    Raises:
        ValueError: if ``num_episodes`` is smaller than 1.
    """
    if num_episodes < 1:
        raise ValueError("num_episodes must be at least 1")

    total_return = 0.0
    total_steps = 0
    for _ in range(num_episodes):
        time_step = environment.reset()
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            total_return += time_step.reward
            total_steps += 1

    avg_return = total_return / num_episodes
    avg_steps = total_steps / num_episodes

    # Rewards are presumably batched tensors (batch size 1), hence the
    # ``.numpy()[0]`` unwrapping. If no step was taken at all the total is
    # still the plain float it was initialised with and is returned as is.
    if hasattr(avg_return, "numpy"):
        avg_return = avg_return.numpy()[0]
    return avg_return, avg_steps

In [None]:
# Baseline: average return (and steps per episode) of the random policy over
# 100 evaluation episodes.
compute_avg_return(environment=eval_env, policy=random_policy, num_episodes=100)

# Create replay buffer

In [None]:
replay_buffer_max_length = 10000

# The buffer stores what the agent's collect policy produces; its batch size
# follows the training environment (train_env.batch_size=1).
buffer_settings = dict(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length,
)
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(**buffer_settings)

In [None]:
def collect_step(environment, policy, buffer):
    """Take one step in ``environment`` with ``policy`` and record it.

    The transition (time step before, chosen action step, time step after)
    is converted into a trajectory and appended to ``buffer``.
    """
    before = environment.current_time_step()
    chosen = policy.action(before)
    after = environment.step(chosen.action)

    # Store the transition in the replay buffer.
    buffer.add_batch(trajectory.from_transition(before, chosen, after))


def collect_data(env, policy, buffer, steps):
    """Collect ``steps`` consecutive transitions from ``env`` into ``buffer``.

    Each iteration acts once with ``policy`` from the environment's current
    time step and adds the resulting trajectory to ``buffer``.
    """
    for _ in range(steps):
        current = env.current_time_step()
        decision = policy.action(current)
        following = env.step(decision.action)
        buffer.add_batch(trajectory.from_transition(current, decision, following))

        
# Pre-fill the replay buffer with transitions gathered by the random policy.
initial_collect_steps = 100
collect_data(
    env=train_env,
    policy=random_policy,
    buffer=replay_buffer,
    steps=initial_collect_steps,
)

In [None]:
batch_size = 16

# Sample from the replay buffer in pairs of adjacent steps (num_steps=2), so
# the dataset generates trajectories with shape [Bx2x...].
sampled_pairs = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3,
)
dataset = sampled_pairs.prefetch(3)

dataset

In [None]:
# Iterator over the replay-buffer dataset; the training loop draws one batch
# per iteration from it with next(iterator).
iterator = iter(dataset)
iterator

### Deep dive into the loss calculation

# Train agent

In [None]:
def update_collect_policy_epsilon(agent, new_epsilon):
    """Set a new exploration rate on ``agent``.

    Stores ``new_epsilon`` on the agent and replaces its collect policy with
    a fresh epsilon-greedy policy built around the agent's current policy.
    """
    agent._epsilon_greedy = new_epsilon
    refreshed_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        agent.policy,
        epsilon=agent._epsilon_greedy,
    )
    agent._collect_policy = refreshed_policy

In [None]:
num_iterations = 6000
num_epsilon_greedy_steps = 3000
num_eval_episodes = 10
collect_steps_per_iteration = 5
log_interval = 500

# (Optional) Wrap the train call in a TF function so it runs as a graph.
agent.train = common.function(agent.train)

# Start counting train steps from zero.
agent.train_step_counter.assign(0)

# Baseline evaluation of the agent's policy before any training.
avg_return, avg_steps = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns_lst, steps_lst = [avg_return], [avg_steps]

print(f"[{now()}] Starting training...")
for _ in range(num_iterations):

    # Gather a few fresh transitions with the collect policy.
    collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

    # One update of the agent's network on a batch sampled from the buffer.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    # Lower epsilon by a fixed amount per iteration, never below 0.1.
    new_epsilon = max(agent._epsilon_greedy - 1 / num_epsilon_greedy_steps, 0.1)
    update_collect_policy_epsilon(agent, new_epsilon)

    if step % log_interval != 0:
        continue

    # Periodic evaluation and logging.
    avg_return, avg_steps = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print(f"[{now()}] step = {step}: loss = {train_loss:<17,.10f} avg return = {avg_return:<10,.2f} avg steps = {avg_steps:.2f}")
    returns_lst.append(avg_return)
    steps_lst.append(avg_steps)

# Visualize q-values for all states

In [None]:
# Training curves: average return (top) and steps per game (bottom), one
# point per evaluation.
fig, (ax, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))

# x position of every evaluation: 0, log_interval, 2 * log_interval, ...
iterations = [index * log_interval for index in range(len(returns_lst))]

panels = (
    (ax, returns_lst, 'Average Return',
     dict(lw=2.5, alpha=0.8, label='returns')),
    (ax2, steps_lst, 'Steps per game',
     dict(lw=1.5, alpha=0.7, color='black', label='game steps')),
)
for axis, series, y_label, line_style in panels:
    axis.plot(iterations, series, **line_style)
    axis.set_ylabel(y_label, fontsize=14)
    axis.set_xlabel('Gradient Steps', fontsize=14)
    # Dashed horizontal guides at the inner y ticks.
    axis.hlines(axis.get_yticks()[1:-1], iterations[0], iterations[-1],
                lw=0.5, alpha=0.5, ls='--', color='black')
    axis.legend(fontsize=13)

In [None]:
# Evaluate the Q-network on states 0..15 and tabulate the result with one row
# per state and one column per action.
action_names = ['left', 'down', 'right', 'up']
all_states = np.arange(16)

network_output = agent._q_network(all_states)
q_table = pd.DataFrame(data=network_output[0].numpy(), columns=action_names)
q_table.index.name = 'state'

In [None]:
# Heatmap of the expected reward for the non-terminal states only.
terminal_states = eval_py_env.dead_states + [eval_py_env.gold_state]
non_terminal_q = q_table.loc[~q_table.index.isin(terminal_states)]

fig, ax = plt.subplots(figsize=(3, 6))
sns.heatmap(non_terminal_q, annot=non_terminal_q, cmap='coolwarm');

In [None]:
# Heatmap of the expected reward for every state, terminal ones included.
fig, ax = plt.subplots(figsize=(3, 7))

sns.heatmap(data=q_table, annot=q_table, cmap='coolwarm', ax=ax);