# Lunar Lander With Deep Q-Learning

In this lab, we'll do some cool stuff!

In [1]:
import numpy as np

import io
import base64
from IPython import display

import gym
from gym import wrappers

from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [18]:
# Same as before, just allowing us to display the video from OpenAI Gym 
def imbed_round_video(video_env):
    video = io.open('./gym-videos/openaigym.video.%s.video000000.mp4' % video_env.file_infix, 'r+b').read()
    encoded = base64.b64encode(video)
    return display.HTML(data='''
        <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
    .format(encoded.decode('ascii')))

In [4]:
# With deep q learning we replace the Q-Table with 
# a deep neural network. This allows our model greater
# flexibility in choosing actions, and can better account
# for complex interactions between various state variables.
# It also eliminates the need for state-space discritization. 

# Feature engineering on the state-space may still be useful
# but it is no longer a requirement to get the algorithm working.

# We'll be reusing this model for this lab. The input shape (8,) 
# was chosen because that's the shape of the Lunar Lander environment's
# observations / state-space 
def fresh_model():
    model = Sequential()
    model.add(Dense(units=16, activation='relu', input_shape=(8,)))
    model.add(Dense(units=8, activation='relu'))
    
    # This is our output layer, 4 is chosen because that's 
    # how many actions we have access to in Lunar Lander
    # We're using a linear activation function, which reflects 
    # The fact that the desired predictions are the reward values
    # for each action, given the state as input. 
    model.add(Dense(units=4, activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    
    return model
    

In [24]:
# So, lets try discritizing the lunar lander game and using Q-Learning!
environment = gym.make('LunarLander-v2')

# Instead of a Q-Table we're using our model
q_model = fresh_model()

# Some global parameters for Q-Learning
learning_rate = 0.1 
discount_factor = 0.95
exploration_rate = 0.3
training_episodes = 10000

# lets also track the average reward every so often
avg_reward = 0

for current_episode_num in range(training_episodes):
    state = environment.reset()

    done = False
    while not done:    
        # Now we have our model make a prediction, instead of
        # looking something up in the q-table. And we need these
        # values even if we explore randomly.
        
        # Some keras wonkeyness requires the np.array, and [0] to do a single 
        # prediction as opposed to a batch of predictions
        action_values = q_model.predict(np.array([state,]))[0]
        action = np.argmax(action_values)

        # We still have to explore the state space with DQN
        explore = np.random.random() < exploration_rate
        if explore:
            action = environment.action_space.sample()

        # Take the action, note we are discritizing again
        next_state, reward, done, _ = environment.step(action)
        
        prev_q_value = action_values[action]
        
        # Again, a little Keras uglyness to manage doing a single prediction
        discounted_future_reward = discount_factor * np.max(q_model.predict(np.array([next_state,]))[0])

        # Update the action values with our new information
        action_values[action] = (
            prev_q_value + (learning_rate * (reward + discounted_future_reward - prev_q_value))

        )
        
        q_model.fit(np.array([state,]), np.array([action_values,]), epochs=1, verbose=False)
        
        
    # Every time we finish an episode, log the final reward:
    avg_reward += reward
    if current_episode_num % 500 == 0:
        print("Finished episode: ", current_episode_num)
        print("  Avg. Reward=", avg_reward / 500, "\n")
        avg_reward = 0
    
print("finished!")

Finished episode:  0
  Avg. Reward= -0.2 

Finished episode:  500
  Avg. Reward= -100.0 

Finished episode:  1000
  Avg. Reward= -100.0 

Finished episode:  1500
  Avg. Reward= -100.0 

Finished episode:  2000
  Avg. Reward= -100.0 

Finished episode:  2500
  Avg. Reward= -100.0 

Finished episode:  3000
  Avg. Reward= -99.80382165821129 

Finished episode:  3500
  Avg. Reward= -100.0 

Finished episode:  4000
  Avg. Reward= -100.0 

Finished episode:  4500
  Avg. Reward= -100.0 

Finished episode:  5000
  Avg. Reward= -100.0 

Finished episode:  5500
  Avg. Reward= -100.0 

Finished episode:  6000
  Avg. Reward= -100.0 

Finished episode:  6500
  Avg. Reward= -100.0 

Finished episode:  7000
  Avg. Reward= -100.0 

Finished episode:  7500
  Avg. Reward= -100.0 

Finished episode:  8000
  Avg. Reward= -100.0 

Finished episode:  8500
  Avg. Reward= -100.0 

Finished episode:  9000
  Avg. Reward= -100.0 

Finished episode:  9500
  Avg. Reward= -100.0 

finished!


In [26]:
# Embed 20 attempts:
for _ in range(3):
    orig_environment = gym.make('LunarLander-v2')
    environment = wrappers.Monitor(orig_environment, "gym-videos/", force=True)

    # Lets visualize a single playthrough.
    state = environment.reset()

    done = False
    while not done:
        action_values = q_model.predict(np.array([state,]))[0]
        action = np.argmax(action_values)
        state, reward, done, _ = environment.step(action)

        # If the game finished before our max number of rounds, break out
        if done: break

    print("Final Reward: ", reward)
    environment.close()
    orig_environment.close()

    display.display(imbed_round_video(environment))

Final Reward:  -100


Final Reward:  -100


Final Reward:  -100
