Deep Q-Learning Lunar Lander

In [18]:
import time
from collections import deque, namedtuple

import gymnasium as gym
import numpy as np
import PIL.Image
import tensorflow as tf
import utils3
from PIL import Image

from pyvirtualdisplay import Display
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

In [19]:
# Instead of using Display, we can set the rendering mode directly in the environment
# when creating it. Remove this line:
# Display(visible=0, size=(840, 480)).start()

# Set the random seed for TensorFlow
tf.random.set_seed(utils3.SEED)

In [20]:
MEMORY_SIZE = 100_000     # size of memory buffer
GAMMA = 0.995             # discount factor
ALPHA = 1e-3              # learning rate
NUM_STEPS_FOR_UPDATE = 4  # perform a learning update every C time steps

Lunar Lander Environment

Load Env

In [22]:
# When creating the environment, you can specify render_mode='rgb_array' 
# or disable rendering if you don't need visualization
# Create environment with specified render mode
env = gym.make('LunarLander-v3', render_mode='rgb_array')

# Reset the environment
observation = env.reset()[0]  # [0] because reset now returns (observation, info)

# Render
frame = env.render()  # No mode parameter needed here
image = Image.fromarray(frame)

In [23]:
state_size = env.observation_space.shape
num_actions = env.action_space.n

print('State Shape:', state_size)
print('Number of actions:', num_actions)

State Shape: (8,)
Number of actions: 4


In [24]:
# Reset the environment and get the initial state.
current_state = env.reset()

In [None]:
# Select an action
action = 0

# Run a single time step of the environment's dynamics with the given action.
next_state, reward, done, _ = env.step(action)

# Display table with values.
utils3.display_table(current_state, action, next_state, reward, done)

# Replace the `current_state` with the state after the action is taken
current_state = next_state

Exercise 1

In [39]:
# UNQ_C1
# GRADED CELL

# Create the Q-Network
q_network = Sequential([
    ### START CODE HERE ###
    Input(shape=state_size),
    Dense(units=64, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=num_actions, activation='linear'),


    ### END CODE HERE ###
    ])
# Create the target Q^-Network
target_q_network = Sequential([
    Input(shape=state_size),
    Dense(units=64, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=num_actions, activation='linear'),



    ### END CODE HERE ###
    ])

### START CODE HERE ###
optimizer = Adam(learning_rate=ALPHA)
### END CODE HERE ###

In [None]:
# UNIT TEST
from public_tests5 import *

test_network(q_network)
test_network(target_q_network)
test_optimizer(optimizer, ALPHA)

In [41]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu, linear

def create_q_network():
    state_size = 8  # From the test requirements
    num_actions = 4  # From the test requirements
    
    model = Sequential([
        Dense(64, activation='relu', input_shape=(state_size,)),
        Dense(64, activation='relu'),
        Dense(num_actions, activation='linear')
    ])
    
    return model

# Create the networks
q_network = create_q_network()
target_q_network = create_q_network()

# Define optimizer

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Exercise 2

In [42]:
# UNQ_C2
# GRADED FUNCTION: calculate_loss

def compute_loss(experiences, gamma, q_network, target_q_network):
    """
    Calculates the loss.

    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
      q_network: (tf.keras.Sequential) Keras model for predicting the q_values
      target_q_network: (tf.keras.Sequential) Keras model for predicting the targets

    Returns:
      loss: (TensorFlow Tensor(shape=(0,), dtype=int32)) the Mean-Squared Error between
            the y targets and the Q(s,a) values.
    """

    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, done_vals = experiences

    # Compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)

    # Set y = R if episode terminates, otherwise set y = R + γ max Q^(s,a).
    ### START CODE HERE ###
    y_targets = rewards + (gamma * max_qsa * (1 - done_vals))
    ### END CODE HERE ###

    # Get the q_values and reshape to match y_targets
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))

    # Compute the loss
    ### START CODE HERE ###
    loss = MSE(y_targets, q_values)
    ### END CODE HERE ###

    return loss

In [43]:
# UNIT TEST
test_compute_loss(compute_loss)

[92mAll tests passed!
