In [38]:
import tensorflow as tf
import keras
from keras import layers
import gymnasium as gym
import ale_py
from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing
from gymnasium.wrappers.frame_stack import FrameStack
import numpy as np

gym.register_envs(ale_py)

In [None]:
# Build the Breakout environment. RecordVideo (applied below) encodes the
# frames returned by env.render(), so the base environment has to be created
# with an image-returning render mode.
env = gym.make("BreakoutNoFrameskip-v4", render_mode="rgb_array")
# Default Atari preprocessing, then stack the 4 most recent frames.
# NOTE(review): the stacked observation is presumably (4, 84, 84) with the
# frame axis first -- confirm against the Q-network input shape.
env = AtariPreprocessing(env)
env = FrameStack(env, 4)

# Size of the discrete action space; the Q-network emits one value per action.
num_actions = env.action_space.n


def trigger(episode_id):
    """Return True for the episodes that should be recorded (every 20th).

    The previous `t % 20` was truthy for every episode *except* multiples of
    20, which recorded 19 out of 20 episodes.
    """
    return episode_id % 20 == 0


env = gym.wrappers.RecordVideo(
    env,
    video_folder="./videos",
    episode_trigger=trigger,
    disable_logger=True,
)

In [None]:
def create_q_model():
    """Build the convolutional Q-network.

    The network takes channels-last inputs of shape (84, 84, 4), applies three
    ReLU convolutions followed by a 512-unit ReLU dense layer, and ends in a
    linear dense layer with one output per action (`num_actions`, read from
    the enclosing notebook scope).

    Returns:
        A freshly initialised `keras.Sequential` model.
    """
    # (filters, kernel_size, strides) for each convolutional layer, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    stack = [keras.Input(shape=(84, 84, 4))]
    for filters, kernel, stride in conv_specs:
        stack.append(
            layers.Conv2D(filters, kernel_size=kernel, strides=stride, activation="relu")
        )
    stack.append(layers.Flatten())
    stack.append(layers.Dense(512, activation="relu"))
    # Linear head: raw Q-value estimates, one per action.
    stack.append(layers.Dense(num_actions, activation="linear"))

    return keras.Sequential(stack)

In [41]:
# Two independently initialised copies of the Q-network: `model` is the one
# being trained, `model_target` is the copy the training loop periodically
# overwrites with `model`'s weights.
model, model_target = create_q_model(), create_q_model()

# Adam with per-gradient norm clipping.
optimizer = keras.optimizers.Adam(
    learning_rate=.00025,
    clipnorm=1.0,
)

  super().__init__(**kwargs)


In [42]:
# Replay buffer, stored as parallel lists: index i across these five lists
# describes one transition (state, action, reward, next state, done flag).
action_history, state_history, state_next_history = [], [], []
rewards_history, done_history = [], []

# Total reward of each recently finished episode.
episode_reward_history = []

# Progress counters and statistics updated by the training loop.
running_reward = episode_count = frame_count = 0

In [43]:
# Discount factor applied to the bootstrapped future reward
gamma = 0.99
# Epsilon-greedy schedule: epsilon starts at `epsilon`, is reduced every frame
# and is clamped at `epsilon_min`
epsilon = 1.0
epsilon_min = 0.1
epsilon_max = 1.0
# Total amount epsilon is reduced over the exploration phase
epsilon_interval = epsilon_max - epsilon_min
# Number of transitions sampled from the replay buffer per gradient step
batch_size = 32

# Frame counts and buffer sizes are counts, so they are written as integers
# (with digit separators) rather than float literals such as 1e7.

# Maximum episodes to run, set to 0 to disable the episode limit
max_episodes = 0
# Max frames to run, set to 0 to disable the frame limit
max_frames = 10_000_000
# Number of initial frames during which every action is random
epsilon_random_frames = 50_000
# Number of frames over which epsilon is reduced by `epsilon_interval`
epsilon_greedy_frames = 1_000_000
# Max length of replay buffer
max_memory_length = 100_000
# Upper bound used for the per-episode step loop; the loop simply stops when
# it is exhausted (no truncation flag is set or stored)
max_steps_per_episode = 10_000
# Run one gradient update of the action network every this many frames
update_after_actions = 4
# Copy the action network's weights into the target network every this many frames
update_target_network = 10_000
# Use Huber loss for stability (specifically for Adam)
loss_function = keras.losses.Huber()

In [44]:
# DQN training loop.
#
# Each outer iteration plays one episode. Every frame the agent picks an
# action epsilon-greedily, stores the transition in the replay buffer, and
# (every `update_after_actions` frames) trains `model` on a random batch using
# targets computed by `model_target`. The loop ends when the running reward
# exceeds 40 or a configured episode/frame limit is reached.
#
# Observations are stored exactly as the environment returns them (frame axis
# first) and are converted to the channels-last layout the Q-network expects
# only when they are fed to a network.
while True:
    observation, _ = env.reset()
    state = np.array(observation)
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy: always random during the warm-up frames, afterwards
        # random with probability epsilon.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action
            action = env.action_space.sample()
        else:
            # (frames, H, W) -> (H, W, frames), then add the batch axis.
            state_tensor = keras.ops.convert_to_tensor(np.moveaxis(state, 0, -1))
            state_tensor = keras.ops.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            # Take best action; a plain int keeps the replay buffer free of
            # backend tensors and is what env.step expects.
            action = int(np.argmax(keras.ops.convert_to_numpy(action_probs[0])))

        # Decay the probability of taking a random action.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        state_next, reward, terminated, truncated, _ = env.step(action)
        state_next = np.array(state_next)
        # Only true termination is stored as `done`: it switches the training
        # target below to the terminal value. Truncation merely ends the
        # episode.
        done = terminated

        episode_reward += reward

        # Save the transition in the replay buffer.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Sample transition indices uniformly with replacement. Passing
            # the length avoids materialising an array over the whole buffer.
            indices = np.random.choice(len(done_history), size=batch_size)

            # (batch, frames, H, W) -> (batch, H, W, frames)
            state_sample = np.moveaxis(
                np.array([state_history[i] for i in indices]), 1, -1
            )
            state_next_sample = np.moveaxis(
                np.array([state_next_history[i] for i in indices]), 1, -1
            )
            rewards_sample = keras.ops.convert_to_tensor(
                [rewards_history[i] for i in indices], dtype="float32"
            )
            action_sample = [action_history[i] for i in indices]
            done_sample = keras.ops.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Target-network estimate for the next states. A direct call is
            # used instead of predict() because this is a single small batch.
            future_rewards = model_target(state_next_sample, training=False)

            # Q target = reward + discount factor * best next-state value
            updated_q_values = rewards_sample + gamma * keras.ops.amax(
                future_rewards, axis=1
            )

            # If final frame, set the target value to -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # One-hot mask selecting the Q-value of the action actually taken
            masks = keras.ops.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Q-values predicted by the network being trained
                q_values = model(state_sample)
                q_action = keras.ops.sum(keras.ops.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            # Sync the target network, log progress and checkpoint the model.
            model_target.set_weights(model.get_weights())
            # Guard against an empty history so logging cannot raise.
            best_recent = (
                np.max(episode_reward_history)
                if episode_reward_history
                else float("nan")
            )
            print(f"Best score of last 100: {best_recent}, running reward: {running_reward} at episode {episode_count}, frame {frame_count}")
            model.save(f"breakout_qmodel_{episode_count}.keras")

        # Drop the oldest transition once the buffer exceeds its limit.
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        # Stop stepping as soon as the environment reports the episode ended,
        # whether by termination or truncation.
        if terminated or truncated:
            break

    # Keep the rewards of the last 100 episodes for the running average.
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1
    if running_reward > 40:
        print(f"Solved at episode {episode_count}!")
        model.save("breakout_qmodel_solved.keras")
        break

    if (max_episodes > 0 and episode_count >= max_episodes):
        print(f"Stopped at episode {episode_count}!")
        break
    if (max_frames > 0 and frame_count >= max_frames):
        print(f"Stopped at frame {frame_count}!")
        break

Best score of last 100: 5.0, running reward: 1.3584905660377358 at episode 53, frame 10000
Best score of last 100: 5.0, running reward: 1.22 at episode 107, frame 20000
Best score of last 100: 10.0, running reward: 1.28 at episode 161, frame 30000
Best score of last 100: 10.0, running reward: 1.11 at episode 218, frame 40000
Best score of last 100: 8.0, running reward: 1.06 at episode 274, frame 50000
Best score of last 100: 8.0, running reward: 1.2 at episode 327, frame 60000
Best score of last 100: 8.0, running reward: 1.34 at episode 380, frame 70000
Best score of last 100: 6.0, running reward: 1.31 at episode 435, frame 80000
Best score of last 100: 5.0, running reward: 1.26 at episode 489, frame 90000
Best score of last 100: 4.0, running reward: 1.06 at episode 547, frame 100000
Best score of last 100: 8.0, running reward: 1.14 at episode 600, frame 110000
Best score of last 100: 8.0, running reward: 1.53 at episode 649, frame 120000
Best score of last 100: 7.0, running reward: 1.

Traceback (most recent call last):
  File "C:\Users\user\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_8392\705780656.py", line 21, in <module>
    state_next, reward, done, _, _ = env.step(action)
                                     ^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\gymnasium\wrappers\frame_stack.py", line 179, in step
    observation, reward, terminated, truncated, info = self.env.step(action)
                                                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\gymnasium\wrappers\atari_preprocessing.py", line 184, in step
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\gymnasium\wrappers\atari_preprocessing.py", line 219, in _get_obs
Keyboard