In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB or IS_KAGGLE:
    !apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
    %pip install -U tf-agents pyvirtualdisplay
    %pip install -U gym>=0.21.0
    %pip install -U gym[box2d,atari,accept-rom-license]

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
from game import Game

In [2]:
from tf_agents.environments.utils import validate_py_environment
#tf.debugging.experimental.enable_dump_debug_info('/tmp/dump')
tf.config.optimizer.set_experimental_options({"function_optimization": False, 'arithmetic_optimization': False})
env = Game(discount = 0.99)
obs_spec = env.observation_spec()
print(obs_spec.check_array(np.array([[0, 0, 4, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 2]])))
#print(env.action_spec())
validate_py_environment(env, episodes=20)

True


In [3]:
time_step = env.reset()
time_step

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([[0, 0, 0, 0],
       [0, 0, 0, 2],
       [0, 0, 2, 0],
       [0, 0, 0, 0]]),
 'reward': array(0., dtype=float32),
 'step_type': array(0)})

In [4]:
time_step = env.step(2)
time_step

TimeStep(
{'discount': array(0.99, dtype=float32),
 'observation': array([[0, 0, 0, 0],
       [0, 0, 0, 2],
       [0, 2, 0, 2],
       [0, 0, 0, 0]]),
 'reward': array(0., dtype=float32),
 'step_type': array(1)})

In [5]:
from tf_agents.environments.wrappers import ActionRepeat
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.environments import tf_environment
env = ActionRepeat(env, times = 2)
tf_env = TFPyEnvironment(env)
#tf_env = env
print(isinstance(tf_env, tf_environment.TFEnvironment))
print("TimeStep Specs:", tf_env.time_step_spec())
print("Action Specs:", tf_env.action_spec())

True
TimeStep Specs: TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': BoundedTensorSpec(shape=(4, 4), dtype=tf.int32, name='observation', minimum=array(0), maximum=array(2048)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
Action Specs: BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0), maximum=array(3))


In [6]:
from tf_agents.networks.q_network import QNetwork
add_layer = keras.layers.Lambda(lambda x: x + 1)
log_layer = keras.layers.Lambda(lambda x: tf.experimental.numpy.log2(x))
normalize_layer = keras.layers.Lambda(lambda x: x / 11)
flatten_layer = keras.layers.Flatten(input_shape=[4,4])
preprocessing_layer = keras.models.Sequential([
    flatten_layer,
    add_layer,
    log_layer,
    normalize_layer
])
fc_layer_params = [200, 100, 20, 10]
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers = preprocessing_layer,
    fc_layer_params = fc_layer_params)



In [7]:
from tf_agents.agents.dqn.dqn_agent import DqnAgent

train_step = tf.Variable(0)
update_period = 2 # run a training step every 4 collect steps
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, # initial ε
    decay_steps= 200, # <=> 1,000,000 ALE frames
    end_learning_rate=0.01) # final ε
agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000, # <=> 32,000 ALE frames
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99, # discount factor
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()



In [8]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec = agent.collect_data_spec,
    batch_size = tf_env.batch_size,
    max_length = 100000)

In [9]:
replay_buffer_observer = replay_buffer.add_batch

In [10]:
class ShowProgress:
    def __init__(self, total):
        self.counter = 0
        self.total = total
    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 100 == 0:
            print(f"\r{self.counter}/{self.total}", end = "")
            

In [11]:
from tf_agents.metrics import tf_metrics

train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

In [12]:
from tf_agents.eval.metric_utils import log_metrics
import logging
logging.getLogger().setLevel(logging.INFO)
log_metrics(train_metrics)

INFO:absl: 
		 NumberOfEpisodes = 0
		 EnvironmentSteps = 0
		 AverageReturn = 0.0
		 AverageEpisodeLength = 0.0


In [13]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer] + train_metrics,
    num_steps = update_period)

In [14]:
from tf_agents.policies.random_tf_policy import RandomTFPolicy

initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                        tf_env.action_spec())
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(2000)],
    num_steps=2000)
final_time_step, final_policy_state = init_driver.run()

2000/2000

In [15]:
trajectories, buffer_info = next(iter(replay_buffer.as_dataset(
    sample_batch_size=2,
    num_steps=3,
    single_deterministic_pass=False)))

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [16]:
trajectories._fields

('step_type',
 'observation',
 'action',
 'policy_info',
 'next_step_type',
 'reward',
 'discount')

In [17]:
trajectories.observation.shape

TensorShape([2, 3, 4, 4])

In [18]:
from tf_agents.trajectories.trajectory import to_transition

time_steps, action_steps, next_time_steps = to_transition(trajectories)
time_steps.observation.shape

TensorShape([2, 2, 4, 4])

In [19]:
trajectories.step_type.numpy()

array([[1, 1, 1],
       [1, 1, 1]])

In [20]:
plt.figure(figsize=(10, 6.8))
for row in range(2):
    for col in range(3):
        print(trajectories.observation[row, col].numpy())

[[ 0  4  8  2]
 [32  8  4  2]
 [ 2  0  4  8]
 [ 0  4  8  2]]
[[ 0  0  2  0]
 [ 0  4  2  4]
 [32  8 16  8]
 [ 2  4  8  2]]
[[ 0  0  0  4]
 [ 2  4  2  4]
 [32  8 16  8]
 [ 2  4  8  2]]
[[2 8 4 0]
 [4 0 0 0]
 [8 0 0 0]
 [0 0 2 0]]
[[2 8 4 2]
 [4 2 2 0]
 [8 0 0 0]
 [0 0 0 0]]
[[2 8 4 2]
 [8 0 0 0]
 [8 0 0 0]
 [2 0 0 2]]


<Figure size 720x489.6 with 0 Axes>

In [21]:
dataset = replay_buffer.as_dataset(
    sample_batch_size=64,
    num_steps=2,
    num_parallel_calls=3).prefetch(3)

In [35]:
from tf_agents.utils.common import function

collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

In [77]:
@tf.function
def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        #print(trajectories)
        #print(buffer_info)
        train_loss = agent.train(trajectories)
        print([method_name for method_name in dir(train_loss.loss)
                  if callable(getattr(train_loss.loss, method_name))])
        print(f"\r{iteration} loss:{train_loss.loss._shape_as_list}", end="")
        if iteration % 1000 == 0:
            log_metrics(train_metrics)

In [None]:
train_agent(50000)

INFO:absl: 
		 NumberOfEpisodes = Tensor("NumberOfEpisodes:0", shape=(), dtype=int64)
		 EnvironmentSteps = Tensor("EnvironmentSteps:0", shape=(), dtype=int64)
		 AverageReturn = Tensor("StatefulPartitionedCall_2:0", shape=(), dtype=float32)
		 AverageEpisodeLength = Tensor("StatefulPartitionedCall_3:0", shape=(), dtype=float32)


996 loss:<bound method Tensor._shape_as_list of <tf.Tensor 'StatefulPartitionedCall_1995:0' shape=() dtype=float32>>

INFO:absl: 
		 NumberOfEpisodes = Tensor("NumberOfEpisodes_1:0", shape=(), dtype=int64)
		 EnvironmentSteps = Tensor("EnvironmentSteps_1:0", shape=(), dtype=int64)
		 AverageReturn = Tensor("StatefulPartitionedCall_2004:0", shape=(), dtype=float32)
		 AverageEpisodeLength = Tensor("StatefulPartitionedCall_2005:0", shape=(), dtype=float32)


1443 loss:<bound method Tensor._shape_as_list of <tf.Tensor 'StatefulPartitionedCall_2891:0' shape=() dtype=float32>>