In [None]:
!pip install swig
!pip install box2d-py
!pip install gymnasium[box2d]
!pip install tf_agents

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.environments.wrappers import ActionRepeat
import gymnasium as gym
from gymnasium.wrappers import (
    GrayScaleObservation,
    FrameStack,
    ResizeObservation,
    TimeLimit
)
import os
import time

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount("/content/gdrive")
# Absolute path under the mount point, so every later save/restore resolves to the same
# folder even if the notebook's working directory is not /content.
drive_path = "/content/gdrive/MyDrive/MachineLearning/HandsOnMachineLearning/chapter18"

In [4]:
# Ensure the folder that holds this chapter's checkpoints exists (no error if it already does).
os.makedirs(f"{drive_path}/models", exist_ok=True)

# 8. Policy gradients on LunarLander-v2

In [13]:
# Create the environment for exercise 8; the policy network defined below
# assumes 8 observation features and 4 discrete actions.
env = gym.make("LunarLander-v2")

In [14]:
# Policy network, assembled layer by layer: 8 input features, three 16-unit ReLU
# layers, then a softmax over the 4 actions.
model = keras.models.Sequential()
model.add(layers.Dense(16, activation="relu", input_shape=[8]))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(4, activation="softmax"))

In [15]:
# Row i is the one-hot vector for action i; play_one_step indexes this table with the
# sampled action to build its training target.
onehot_encodings = tf.one_hot([0, 1, 2, 3], 4)

In [16]:
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, apply it, and return the loss gradients.

    Args:
        env: Environment whose ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        obs: Current observation (a single, unbatched array).
        model: Policy network mapping a batch of observations to probabilities
            over the 4 actions.
        loss_fn: Callable ``loss_fn(y_true, y_pred)`` used to score the sampled
            action against the predicted probabilities.

    Returns:
        Tuple ``(obs, reward, done, grads)``: the next observation, the step
        reward, whether the episode is over (terminated OR truncated), and the
        gradients of the loss with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        probabilities = model(obs[np.newaxis])
        # Drop the batch axis and renormalise in float64: np.random.choice checks that
        # `p` sums to 1, and a float32 softmax presumably may miss that by rounding error.
        probabilities = tf.cast(probabilities[-1], tf.float64)
        probabilities /= np.sum(probabilities)
        action = np.random.choice(4, p=probabilities)
        # Treat the sampled action as if it were the correct label.
        y_target = onehot_encodings[action]
        loss = tf.reduce_mean(loss_fn(y_target, probabilities))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, terminated, truncated, info = env.step(action)
    # An episode is over when it terminates or when the environment truncates it
    # (e.g. a time limit); callers must stop stepping in both cases.
    done = bool(terminated or truncated)
    return obs, reward, done, grads

In [17]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Run several episodes and collect the per-step rewards and gradients.

    Each episode starts from a fresh ``env.reset()`` and runs until
    ``play_one_step`` reports it is done or ``n_max_steps`` steps were taken.

    Returns:
        Tuple ``(all_rewards, all_grads)``: two lists with one entry per
        episode, each entry being the list of rewards (respectively gradient
        lists) for the steps of that episode, in order.
    """
    rewards_per_episode, grads_per_episode = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()[0]  # reset() returns a sequence; the observation comes first
        steps_taken = 0
        while steps_taken < n_max_steps:
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            steps_taken += 1
            if done:
                break
        rewards_per_episode.append(episode_rewards)
        grads_per_episode.append(episode_grads)
    return rewards_per_episode, grads_per_episode

In [18]:
def discount_rewards(rewards, discount_factor):
    """Return the discounted sum of future rewards for every step of one episode.

    ``result[t] = rewards[t] + discount_factor * result[t + 1]``, computed from
    the last step backwards.

    Args:
        rewards: Sequence of per-step rewards (ints or floats); not modified.
        discount_factor: Multiplier applied to each step further in the future.

    Returns:
        A new float64 NumPy array with the same length as ``rewards``.
    """
    # Force a float dtype: with integer rewards NumPy would otherwise build an integer
    # array and silently truncate every discounted value on assignment.
    discounted = np.array(rewards, dtype=np.float64)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted


def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount each episode's rewards, then standardise them across all episodes.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_factor: Discount passed to ``discount_rewards``.

    Returns:
        A list with one array per episode, holding the discounted rewards
        shifted by the mean and divided by the standard deviation computed
        over every step of every episode.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_factor)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    # If every discounted reward is identical the std is 0; dividing by it would give
    # NaN/inf advantages. Fall back to plain centring (all zeros) in that case.
    scale = reward_std if reward_std > 0 else 1.0
    return [(discounted_rewards - reward_mean) / scale
            for discounted_rewards in all_discounted_rewards]

In [19]:
# Only the run name goes through strftime, so a '%' anywhere in drive_path cannot be
# misread as a format directive.
run_name = time.strftime("lunarlander_%Y_%m_%d-%H_%M_%S")
checkpoint_dir = f"{drive_path}/models/{run_name}"
os.makedirs(checkpoint_dir, exist_ok=True)
# Checkpoints saved with this prefix are written inside checkpoint_dir.
checkpoint_prefix = os.path.join(checkpoint_dir, "lunarlander")
checkpoint = tf.train.Checkpoint(model)

In [None]:
# Resume from an earlier run. restore() expects a checkpoint prefix (as produced by
# checkpoint.save), not the run directory itself, so resolve the most recent checkpoint
# recorded in that directory first.
previous_run_dir = f"{drive_path}/models/lunarlander_2024_06_28-12_08_31"
checkpoint.restore(tf.train.latest_checkpoint(previous_run_dir))

In [None]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.99

optimizer = keras.optimizers.Adam(learning_rate=0.01)
# The policy is a 4-way softmax and play_one_step builds one-hot targets, so the loss
# must be the categorical cross-entropy (-log of the sampled action's probability).
# Binary cross-entropy would score the 4 outputs as independent yes/no predictions.
loss_fn = keras.losses.categorical_crossentropy

for iteration in range(n_iterations):
    start = time.time()

    # 1. Collect experience with the current policy.
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn
    )
    # 2. Turn raw rewards into normalised, discounted action advantages.
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_factor)
    # 3. For each trainable variable, average the per-step gradients weighted by the
    #    advantage of the action taken at that step, then apply them.
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

    # An episode counts as won when its total (undiscounted) reward reaches 200;
    # a single step's reward is practically never exactly 200.
    times_won = sum(1 for rewards in all_rewards if sum(rewards) >= 200)

    # Save the model every 20 iterations
    if (iteration + 1) % 20 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print('Time for iteration {} is {} sec, times won: {}'.format(
        iteration+1, time.time()-start, times_won
    ))

# 9. Setting up SpaceInvaders-v4 with TF-Agents

In [None]:
# Upper bound on the number of steps per episode, passed to suite_gym.load below.
max_episode_steps = 27000

# NOTE(review): this rebinds `env`, replacing the LunarLander environment created earlier.
# NOTE(review): FrameStack is imported from gymnasium.wrappers, whereas suite_gym
# presumably builds the environment with the legacy `gym` package; confirm the wrapper
# accepts that environment type.
env = suite_gym.load(
    "SpaceInvaders-v4",
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[lambda env: FrameStack(env, 4)],  # wraps the raw gym environment
    env_wrappers=[lambda env: ActionRepeat(env, times=4)]  # wraps the TF-Agents environment
)