In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
from keras import layers
from gymnasium.wrappers import FrameStack
from gymnasium.experimental.wrappers import GrayscaleObservationV0
import gymnasium as gym
import numpy as np
import tensorflow as tf

In [2]:
env = gym.make('ALE/Frogger-v5') # Human render mode slows things down A LOT

In [3]:
env = GrayscaleObservationV0(env) # Remove RGB channels (make gray) in order to decrease amount of data to process
env.observation_space.shape

(210, 160)

In [4]:
seed = 42 # Allows us to repeat the same patterns of game play
env = FrameStack(env, 4) # Get 4 frames from game at a time
frames, width, height = env.observation_space.shape
env.seed(seed)

  logger.warn(


(3444837047, 2669555309)

In [5]:
# This network learns an approximation of the Q-table, which is a mapping between the states and actions that an agent will take. 
# For every state we'll have FIVE actions that can be taken (NOOP, Up, Down, Left, Right). 
# The environment provides the state, and the action is chosen by selecting the largest of the five Q-values predicted in the output layer of the CNN.

num_actions = 5

def create_CNN():
    inputShape = env.observation_space.shape
    model = Sequential()
    
    model.add(Conv2D(32, kernel_size=(8, 8), strides=(4,4), activation='relu', input_shape=[]))
    model.add(Conv2D(32, kernel_size=(8, 8), strides=(4,4), activation='relu', input_shape=[]))
    model.add(Conv2D(32, kernel_size=(8, 8), strides=(4,4), activation='relu', input_shape=[]))
    
    model.add(Conv2D(32, kernel_size=(8, 8), strides=(4,4), activation='relu', input_shape=inputShape))
    return None

In [6]:
# The first neural net makes the predictions for Q-values, which are used to take an action.
cnn1 = create_CNN()

# A second cnn is used to predict future rewards. The weights of the second cnn get updated every 10000 steps.
cnn2 = create_CNN()

  super().__init__(**kwargs)
  super().__init__(


In [7]:
# Hyperparamaters DQN Algorithm

gamma = 0.99  # Discount factor in Bellman's equuation
epsilon = 1  # Epsilon greedy parameter for Q learning algorithm
max_steps_per_episode = 75 #Deepmind trained for "a total of 50 million frames (~38 days of game play)"
max_episodes = 1000  # Number of episodes you let the AI train. Keep above 1!
epsilon_min = 0.1  # Smallest epsilon value possible
epsilon_max = 1.0  # Largest epsilon value possible
epsilon_interval = (epsilon_max - epsilon_min)  # Rate we reduce chance of random action being taken (eventually, we don't want to take many random actions)

# Somre more important variables
batch_size = 32  # Size of sample taken from "replay buffer"
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# Experience replay buffers
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0
# Number of frames to take random action and observe output
epsilon_random_frames = 50000
# Number of frames for exploration
epsilon_greedy_frames = 1000000.0
# Maximum replay length
# Note: A Deepmind paper suggests 1000000, however this can cause memory issues
max_memory_length = 100000
# Train the model after 4 actions
update_after_actions = 4
# How often to update cnn2
update_cnn2 = 10000
# Using huber loss to check for convergance of Qs
loss_function = keras.losses.Huber()

In [8]:
# DQN Algorithm
while True:
    observation, _ = env.reset()
    state = np.array(observation)
    episode_reward = 0
    totalSteps = 0
    
    for timestep in range(1, max_steps_per_episode):
        frame_count += 1
        # Use epsilon-greedy policy to explore or exploit
            # If current frame_count is less than 50,000 or epsilon is greater than a random number between 0 and 1
        print("Frame Count: ", frame_count)
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            # Predict Q-values from environment state
            state_tensor = keras.ops.convert_to_tensor(state) # Turn state (frame in game) into a Tensor Object (think matrix)
            state_tensor = keras.ops.expand_dims(state_tensor, 0) # Add to the current batch
            action_probs = cnn1(state_tensor, training=False)
            print(action_probs)
            # Take best action
            action = keras.ops.argmax(action_probs[0]).numpy()
        
        # Decrease probability of taking random action
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)
        
        # Take action in environment
        print("Action: ", action)
        state_next, reward, done, _, _ = env.step(action)
        state_next = np.array(state_next)
        
        # Sum rewards across entire episode
        totalSteps += 1
        episode_reward += reward

        # Save actions and states in "replay buffer"
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next
        
        # Update every fourth frame AND once batch size is greater than 32
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Get indices of samples for replay buffers
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Using list comprehension to sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = keras.ops.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Build the updated Q-values for the sampled future states using cnn2
            future_rewards = cnn2.predict(state_next_sample)
            # Q value = reward + discount factor * expected future reward (Bellman's Equation)
            updated_q_values = rewards_sample + gamma * keras.ops.amax(future_rewards, axis=1)

            # If final frame set the last value to -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # Create a 'mask' (matrix of 0s and 1s) so we only calculate loss on the updated Q-values
            masks = keras.ops.one_hot(action_sample, num_actions)
            
            # Train the cnn1 using updated Q-values
            with tf.GradientTape() as tape:
                q_values = cnn1(state_sample)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = keras.ops.sum(keras.ops.multiply(q_values, masks), axis=1)
                # Calculate loss between new Q-value and old Q-value
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation - how cnn1 is updated
            grads = tape.gradient(loss, cnn1.trainable_variables)
            optimizer.apply_gradients(zip(grads, cnn1.trainable_variables))
        
        # Time to update cnn2?
        if frame_count % update_cnn2 == 0:
            # update cnn2 with new weights
            cnn2.set_weights(cnn1.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
            #print(template.format(running_reward, episode_count, frame_count))

        # Limit the state and reward history
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            print("DONE")
            break

    # Update running reward to check condition for solving
    s = episode_reward
    x = totalSteps
    actualScore = abs(10*(s/(x-(1.75*s))))
    
    if actualScore > s:
        actualScore = s
    
    episode_reward_history.append(actualScore)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    
    running_reward = np.mean(episode_reward_history)
    print("Average Reward Across All Episodes: ", running_reward)
    episode_count += 1

    if running_reward > 40:  # Condition to consider game "learned"
        print("Learned at episode {}!".format(episode_count))
        break

    if (max_episodes > 0 and episode_count >= max_episodes):  # Maximum number of episodes reached
        print("Stopped at episode {}!".format(episode_count))
        break

Frame Count:  1
Action:  1
Frame Count:  2
Action:  1
Frame Count:  3
Action:  0
Frame Count:  4
Action:  3
Frame Count:  5
Action:  1
Frame Count:  6
Action:  1
Frame Count:  7
Action:  0
Frame Count:  8
Action:  4
Frame Count:  9
Action:  3
Frame Count:  10
Action:  2
Frame Count:  11
Action:  0
Frame Count:  12
Action:  3
Frame Count:  13
Action:  2
Frame Count:  14
Action:  4
Frame Count:  15
Action:  4
Frame Count:  16
Action:  2
Frame Count:  17
Action:  0
Frame Count:  18
Action:  4
Frame Count:  19
Action:  0
Frame Count:  20
Action:  2
Frame Count:  21
Action:  1
Frame Count:  22
Action:  4
Frame Count:  23
Action:  0
Frame Count:  24
Action:  4
Frame Count:  25
Action:  0
Frame Count:  26
Action:  4
Frame Count:  27
Action:  1
Frame Count:  28
Action:  0
Frame Count:  29
Action:  4
Frame Count:  30
Action:  2
Frame Count:  31
Action:  4
Frame Count:  32
Action:  0
Frame Count:  33
Action:  3
Frame Count:  34
Action:  4
Frame Count:  35
Action:  3
Frame Count:  36
Action:  3


KeyboardInterrupt: 

In [9]:
#cnn1.save_weights("cnn1v2.weights.h5") # Save the weights of each CNN
#cnn2.save_weights("cnn2v2.weights.h5")

In [11]:
model = create_CNN()
model.load_weights('cnn1.weights.h5')
env = gym.make('ALE/Frogger-v5', render_mode="human")
env = GrayscaleObservationV0(env)
env = FrameStack(env, 4) # Get 4 frames from game at a time
frames, width, height = env.observation_space.shape
env.reset()
game_over = False

In [12]:
state, reward, game_over, x, _ = env.step(0)
while not game_over:
    
    #Predict action using the trained model
    q_values = model.predict(np.expand_dims(state, axis=0))
    action = np.argmax(q_values)

    # Take action in the environment
    next_state, reward, game_over, x, _ = env.step(action)

    # Update current state
    state = next_state

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15