In [None]:
!pip install swig
!pip install box2d-py
!pip install gymnasium[box2d]
!pip install tf_agents

In [None]:
!pip install tf_agents
!pip install swig
!pip install box2d-py
!pip install gym[box2d]
!pip install gym[atari]
!pip install autorom[accept-rom-license]

In [None]:
import gymnasium as gym

In [14]:
import gym

In [2]:
import os
import time
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers

In [3]:
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4
from tf_agents.environments import suite_atari
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.q_network import QNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.metrics import tf_metrics
from tf_agents.eval.metric_utils import log_metrics
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.trajectories.trajectory import to_transition
from tf_agents.utils.common import function

In [7]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; everything this notebook saves goes under drive_path.
drive.mount("/content/gdrive")
# Absolute path: a relative "gdrive/..." path only resolves while the working
# directory happens to be /content, and silently points elsewhere after a chdir.
drive_path = "/content/gdrive/MyDrive/MachineLearning/HandsOnMachineLearning/chapter18"

Mounted at /content/gdrive


In [None]:
# Create the "models" folder under drive_path; exist_ok=True makes re-running this cell harmless.
os.makedirs(f"{drive_path}/models", exist_ok=True)

# 8. LunarLander-v2 with policy gradients (REINFORCE)

In [None]:
# Pass the step limit to gym.make() so the environment itself enforces it.
# Assigning `env.max_episode_steps` after creation only sets an attribute on the
# returned wrapper object and does not change when episodes are truncated.
env = gym.make("LunarLander-v2", max_episode_steps=500)

In [None]:
# Policy network: 8 input features -> three ReLU layers of 16 units -> softmax over 4 outputs.
model = keras.models.Sequential()
model.add(layers.Dense(16, activation="relu", input_shape=[8]))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(4, activation="softmax"))

In [None]:
# Row i is the one-hot vector for action i (4 actions); play_one_step indexes it with the sampled action.
onehot_encodings = tf.one_hot([0, 1, 2, 3], 4)

In [None]:
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, step the environment, and return the gradients.

    Args:
        env: environment whose ``step(action)`` returns the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        obs: current observation; a leading batch axis is added before it is
            passed to ``model``.
        model: policy network; its output row is used as the probabilities of
            the 4 actions.
        loss_fn: callable ``loss_fn(y_true, y_pred)`` applied to the one-hot
            encoding of the sampled action and the action probabilities.

    Returns:
        ``(obs, reward, done, grads)``: the next observation, the step reward,
        whether the episode is over (terminated or truncated), and the
        gradients of the loss with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        probabilities = model(obs[np.newaxis])
        # Cast the single output row to float64 and renormalize, presumably so
        # the probabilities pass np.random.choice's sum-to-1 check despite
        # float32 rounding.
        probabilities = tf.cast(probabilities[-1], tf.float64)
        probabilities /= np.sum(probabilities)
        action = np.random.choice(4, p=probabilities)
        # Use the sampled action as the target: the resulting gradients make
        # that action more likely, and the caller later scales them by the
        # action's (normalized) return.
        y_target = onehot_encodings[action]
        loss = tf.reduce_mean(loss_fn(y_target, probabilities))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, terminated, truncated, info = env.step(action)
    # An episode is over when it terminates OR is truncated (e.g. by a time
    # limit). Ignoring `truncated` would keep stepping a finished episode.
    done = bool(terminated or truncated)
    return obs, reward, done, grads

In [None]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play ``n_episodes`` episodes with the current policy and record what happened.

    Every episode starts from ``env.reset()`` and runs until ``play_one_step``
    reports ``done`` or ``n_max_steps`` steps have been played.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one entry per episode.
        Each entry is itself a list with one item per step: the step reward,
        and the gradients returned by ``play_one_step`` respectively.
    """
    all_rewards, all_grads = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        # The first element of reset()'s return value is used as the observation.
        obs = env.reset()[0]
        for _ in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            if done:
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

In [None]:
def discount_rewards(rewards, discount_factor):
    """Return the discounted sum of future rewards at every step of one episode.

    Computed backwards so that
    ``result[t] = rewards[t] + discount_factor * result[t + 1]``.

    Args:
        rewards: sequence of per-step rewards for a single episode.
        discount_factor: multiplier applied to each later step's return.

    Returns:
        A 1-D floating-point NumPy array with one entry per reward.
    """
    discounted = np.array(rewards)
    # All-integer rewards produce an integer array, and the in-place float
    # accumulation below cannot be stored back into it; promote to float first.
    if not np.issubdtype(discounted.dtype, np.floating):
        discounted = discounted.astype(np.float64)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted


def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount every episode's rewards, then standardize them across all episodes.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_factor: passed through to ``discount_rewards``.

    Returns:
        A list with one array per episode holding the discounted rewards after
        subtracting the global mean and dividing by the global standard
        deviation. Returns ``[]`` when ``all_rewards`` is empty.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_factor)
                              for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises on an empty list; there is nothing to normalize.
        return []
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Every discounted reward equals the mean, so the centred values are
        # all zero. Dividing by 0 would turn them into NaN and poison the
        # gradients; divide by 1 instead to return zeros.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [None]:
# One checkpoint directory per run: strftime fills the % fields with the current date and time.
checkpoint_dir = time.strftime(f"{drive_path}/models/lunarlander_%Y_%m_%d-%H_%M_%S")
os.makedirs(checkpoint_dir, exist_ok=True)
# Prefix handed to checkpoint.save() in the training loop.
checkpoint_prefix = os.path.join(checkpoint_dir, "lunarlander")
# Checkpoint object that tracks `model`.
checkpoint = tf.train.Checkpoint(model)

In [None]:
# Resume from an earlier run. checkpoint.save(file_prefix=...) writes numbered
# checkpoints *inside* the run directory (prefix "lunarlander"), so restore()
# must be given that checkpoint path, not the directory itself.
# tf.train.latest_checkpoint() looks up the newest one; if it finds none it
# returns None and restore(None) leaves the model unchanged.
checkpoint.restore(tf.train.latest_checkpoint(
    f"{drive_path}/models/lunarlander_2024_06_28-12_08_31"))

In [None]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.99

optimizer = keras.optimizers.Adam(learning_rate=0.01)
# The policy is a 4-way softmax and play_one_step builds a one-hot target, so
# the policy-gradient loss is -log p(sampled action), i.e. categorical
# cross-entropy. Binary cross-entropy treats the 4 outputs as independent
# yes/no predictions and adds extra terms for the actions that were not taken.
loss_fn = keras.losses.categorical_crossentropy

for iteration in range(n_iterations):
    start = time.time()

    # Play a batch of episodes with the current policy.
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update,
                                                    n_max_steps, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    # For each trainable variable: weight every step's gradient by that step's
    # normalized return, then average over all steps of all episodes.
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

    # Count episodes whose total (undiscounted) reward reaches 200, the score
    # at which a LunarLander episode is conventionally considered solved.
    # Counting individual steps whose reward equals exactly 200 would
    # effectively always report zero.
    times_won = sum(1 for rewards in all_rewards if sum(rewards) >= 200)

    # Save the model every 20 iterations
    if (iteration + 1) % 20 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print('Time for iteration {} is {} sec, times won: {}'.format(
        iteration+1, time.time()-start, times_won
    ))

# 9. Space Invaders with a TF-Agents DQN

In [4]:
# Step limit per episode, passed to suite_atari.load() below.
max_episode_steps = 50000

# Load Space Invaders through the TF-Agents Atari suite with the
# AtariPreprocessing and FrameStack4 wrappers applied.
# NOTE(review): this rebinds `env`, which the LunarLander section also uses.
env = suite_atari.load(
    "SpaceInvaders-v4",
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessing, FrameStack4]
)
# Wrap the Python environment in a TFPyEnvironment for use by the TF-Agents agent and drivers.
tf_env = TFPyEnvironment(env)

In [5]:
# Cast observations to float32 and divide by 255 before they reach the network
# (presumably the observations are 8-bit pixel values — confirm).
preprocessing_layer = layers.Lambda(
    lambda obs: tf.cast(obs, np.float32) / 255.
)
# One tuple per convolutional layer; presumably (filters, kernel size, stride)
# as expected by QNetwork's conv_layer_params — confirm against the TF-Agents docs.
conv_layer_params = [
    (32, (8, 8), 4),
    (64, (4, 4), 2),
    (64, (3, 3), 1)
]
# A single entry of 512 for the fully connected part of the network.
fc_layer_params = [512]
# Q-network built from the environment's observation and action specs.
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params
)

In [6]:
# Counter handed to the agent as train_step_counter; it also drives the epsilon schedule below.
train_step = tf.Variable(0)
# Number of environment steps collected per training iteration (used by collect_driver).
update_period = 4
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras optimizers.
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)
# PolynomialDecay is reused here as the epsilon-greedy schedule.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    # Written as an exact integer: the float floor division `1.0 // 0.00001`
    # evaluates to 99999.0, not 100000, because 0.00001 is not exactly
    # representable in binary floating point.
    decay_steps=100_000,
    end_learning_rate=0.01  # final ε
)
agent = DqnAgent(tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=0.9,  # discount factor
    train_step_counter=train_step,
    # Callable so that epsilon is re-evaluated from the current train_step each time.
    epsilon_greedy=lambda: epsilon_fn(train_step)
)
agent.initialize()



In [8]:
# Replay buffer whose element spec and batch size come from the agent and the environment.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    tf_env.batch_size,
    # max_length=1000000  (larger alternative, disabled)
    max_length=100000,
)

In [9]:
# Observer callback for the collect driver: each collected batch is added to the replay buffer.
replay_buffer_observer = replay_buffer.add_batch

In [10]:
# Training metrics: passed to the collect driver as observers and logged by train_agent.
train_metrics = [
    metric_class()
    for metric_class in (
        tf_metrics.NumberOfEpisodes,
        tf_metrics.EnvironmentSteps,
        tf_metrics.AverageReturnMetric,
        tf_metrics.AverageEpisodeLengthMetric,
    )
]

In [11]:
# Driver that runs the agent's collect policy in tf_env, feeding every step to
# the replay buffer and to the training metrics.
collect_driver = DynamicStepDriver(
    env=tf_env,
    policy=agent.collect_policy,
    observers=[replay_buffer_observer, *train_metrics],
    num_steps=update_period,  # collect 4 steps for each training iteration
)

In [12]:
class ShowProgress():
    """Observer that prints a "<count>/<total>" progress line while a driver runs.

    Only trajectories that are not boundaries increase the count. The line is
    rewritten in place (carriage return, no newline) whenever the count is a
    multiple of 100 at the time of the call.
    """

    def __init__(self, total):
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        is_boundary = trajectory.is_boundary()
        if not is_boundary:
            self.counter += 1
        if self.counter % 100 != 0:
            return
        print("\r{}/{}".format(self.counter, self.total), end="")


In [16]:
# Pre-fill the replay buffer by running a random policy for 20000 steps,
# with ShowProgress reporting how far along the collection is.
initial_collect_policy = RandomTFPolicy(
    time_step_spec=tf_env.time_step_spec(),
    action_spec=tf_env.action_spec(),
)
init_driver = DynamicStepDriver(
    env=tf_env,
    policy=initial_collect_policy,
    num_steps=20000,
    observers=[replay_buffer.add_batch, ShowProgress(20000)],
)
final_time_step, final_policy_state = init_driver.run()

20000/20000

In [17]:
# Training dataset sampled from the replay buffer: batches of 64 samples, each
# spanning 2 consecutive steps, read with 3 parallel calls and prefetched 3 deep.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=64,
    num_steps=2,
).prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [18]:
# Replace both methods with versions wrapped by tf_agents.utils.common.function.
# NOTE(review): re-running this cell wraps the already-wrapped methods again.
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

In [20]:
# One checkpoint directory per run: strftime fills the % fields with the current date and time.
# NOTE(review): rebinds checkpoint_dir / checkpoint_prefix / checkpoint from the LunarLander section.
checkpoint_dir = time.strftime(f"{drive_path}/models/spaceinvaders_%Y_%m_%d-%H_%M_%S")
os.makedirs(checkpoint_dir, exist_ok=True)
# Prefix handed to checkpoint.save() inside train_agent.
checkpoint_prefix = os.path.join(checkpoint_dir, "spaceinvaders")
# Checkpoint object that tracks the DQN agent.
checkpoint = tf.train.Checkpoint(agent)

In [21]:
def train_agent(n_iterations, log_interval=100, checkpoint_interval=100):
    """Alternate between collecting experience and training the agent.

    Each iteration runs ``collect_driver`` once, draws the next batch from
    ``dataset``, performs one ``agent.train`` step and prints the loss on a
    single, continuously overwritten line.

    Args:
        n_iterations: number of collect/train iterations to run.
        log_interval: ``train_metrics`` are logged whenever the iteration
            index is a multiple of this value (including iteration 0).
        checkpoint_interval: a checkpoint is saved whenever the iteration
            index is a multiple of this value (including iteration 0). Every
            save writes a new numbered checkpoint under ``checkpoint_prefix``,
            so raise this for long runs to limit how many are written.
    """
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(n_iterations):
        # Continue collecting from where the previous iteration stopped.
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, _ = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f}".format(iteration, train_loss.loss.numpy()), end="")
        if iteration % log_interval == 0:
            log_metrics(train_metrics)
        if iteration % checkpoint_interval == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

In [22]:
# Run 100000 collect/train iterations (long-running cell).
train_agent(100000)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


0 loss:0.91701