In [1]:
# Work around a Jupyter autocomplete bug by turning off the Jedi-based completer
%config Completer.use_jedi = False

# Installation steps
# !python3 -m pip install -U tf-agents
# !python3 -m pip install -U 'gym[atari]'
# If on windows use:
#    !python3 -m pip install git+https://github.com/Kojoley/atari-py.git

# TF Agents

Architecture:

![tf agents architecture](https://miro.medium.com/max/2360/1*pJ5utPNBZFcJRhdxX-Fsfw.png)

* Environment**s** to run in parallel, keep the GPU occupied
* Observer for architecture flexibility

## Environment

In [2]:
from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4

# Episode cap passed to the loader (x 4 for actual frames).
max_episode_steps = 27_000
environment_name = "BreakoutNoFrameskip-v4"

# Load the Atari game with the AtariPreprocessing and FrameStack4 gym wrappers applied.
env = suite_atari.load(
    environment_name,
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessing, FrameStack4],
)

In [3]:
# Wrap the Python environment in a TFPyEnvironment so that it can become part of the TF graph

from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(env)

## Q-Network

In [4]:
import tensorflow as tf
import tensorflow.keras as keras
from tf_agents.networks.q_network import QNetwork

# Cast the raw observations to float32 and rescale them by 1/255.
# The dtype is spelled tf.float32 (not np.float32): NumPy is never imported in this
# notebook, so the previous np reference raised a NameError once the layer was called.
preprocessing_layer = keras.layers.Lambda(
    lambda obs: tf.cast(obs, tf.float32) / 255.
)

# One tuple per convolutional layer: (filters, kernel_size, stride).
conv_layer_params = [
    (32, (8, 8), 4),
    (64, (4, 4), 2),
    (64, (3, 3), 1),
]

# Sizes of the fully connected layers that follow the convolutions.
fc_layer_params = [512]

# Q-network built from the environment's observation and action specs.
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params
)

## DQN Agent

In [5]:
from tf_agents.agents.dqn.dqn_agent import DqnAgent

# Counter handed to the agent as its train_step_counter; it also drives the ε schedule below.
train_step = tf.Variable(0)
update_period = 4  # Training the model every N steps

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
optimizer = keras.optimizers.RMSprop(
    learning_rate=2.5e-4, rho=0.95, momentum=0.0,
    epsilon=0.00001, centered=True
)

# A learning-rate schedule is reused here to decay the exploration rate ε,
# hence the *_learning_rate argument names.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate = 1.0,  # initial ε
    decay_steps = 250_000 // update_period,  # <=> 1,000,000 ALE frames
    end_learning_rate = 0.01  # final ε
)

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network = q_net,
    optimizer = optimizer,
    target_update_period = 2000,  # <=> 32,000 ALE frames
    td_errors_loss_fn = keras.losses.Huber(reduction="none"),
    gamma = 0.99,  # discount factor
    train_step_counter = train_step,
    # Callable, so ε is re-evaluated from the current train_step each time it is read.
    epsilon_greedy = lambda: epsilon_fn(train_step)
)

agent.initialize()

## Replay Buffer & Observer

In [6]:
# RAM Usage
# Rough estimate for 100,000 stored items, each counted as 84*84*4 + 5*4 bytes
# (presumably one byte per pixel of an 84x84x4 observation plus five 4-byte fields — TODO confirm).
print(f'{100_000 * (84*84*4 + 5*4) / 1e9:.2f} GB')

2.82 GB


In [7]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Replay buffer sized from the agent's collect data spec and the environment's batch size.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=100_000,  # Set to 1 Million for replicating the dqn paper
)

In [8]:
# The buffer's add_batch method is used directly as a driver observer,
# so every collected trajectory is written to the replay buffer.
replay_buffer_observer = replay_buffer.add_batch

## Metrics

In [9]:
from tf_agents.metrics import tf_metrics

# Metrics attached to the collect driver as observers and logged periodically during training.
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

## Collect Driver

In [10]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Driver that steps the environment with the agent's collect policy for
# `update_period` steps per run, feeding the replay buffer and the training metrics.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer] + train_metrics,
    num_steps=update_period,
)

## Collect Random Experiences

In [11]:
class ShowProgress:
    """Observer that counts non-boundary trajectories and prints a progress line."""

    def __init__(self, total):
        """Remember the expected total and start counting from zero."""
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        """Count the trajectory unless it is a boundary, then report every 100th count."""
        boundary = trajectory.is_boundary()
        self.counter += 0 if boundary else 1
        if self.counter % 100:
            return
        # Carriage return keeps the progress on a single, overwritten line.
        print("\r{}/{}".format(self.counter, self.total), end="")

from tf_agents.policies.random_tf_policy import RandomTFPolicy

# Number of warm-up steps collected with the random policy before training.
# Kept in one place so the progress total and the driver's step count cannot diverge.
initial_collect_steps = 20_000  # <=> 80,000 ALE frames

# Random policy built from the environment's time-step and action specs.
initial_collect_policy = RandomTFPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec()
)

# Driver that pre-fills the replay buffer and reports progress while doing so.
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer_observer, ShowProgress(initial_collect_steps)],
    num_steps=initial_collect_steps
)

final_time_step, final_policy_state = init_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))
20000/20000

In [12]:
# Dataset of sampled batches from the replay buffer.
# num_steps=2: 2 steps --> 1 transition
dataset = (
    replay_buffer
    .as_dataset(sample_batch_size=64, num_steps=2, num_parallel_calls=12)
    .prefetch(12)
)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


## Training the Agent

In [13]:
from tf_agents.eval.metric_utils import log_metrics

def train_agent(n_iterations):
    """Alternate between collecting experience and training the agent.

    Each iteration runs the collect driver once, draws one batch from the
    dataset, and trains the agent on it. The loss is printed on every
    iteration and the training metrics are logged every 1000 iterations.

    Args:
        n_iterations: number of collect/train iterations to run.
    """
    driver_time_step = None
    driver_policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    batches = iter(dataset)
    for step in range(n_iterations):
        # Carry the driver's state over so collection resumes where it stopped.
        driver_time_step, driver_policy_state = collect_driver.run(
            driver_time_step, driver_policy_state
        )
        experience, _ = next(batches)
        loss_info = agent.train(experience)
        print("\r{} loss:{:.5f}".format(step, loss_info.loss.numpy()), end="")
        if step % 1000:
            continue
        log_metrics(train_metrics)

In [14]:
# Short demonstration run of 1,000 collect/train iterations.
train_agent(1_000)  # Should run 10 Million steps for the dqn paper results.

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
999 loss:0.00736