# UCI COMPSCI 175, Spring 2023, Project 4: Implementing a Dueling Double Deep Q-Network (D3QN) into *Super Mario Bros.*

# NOTE TO GRADERS: 

## When hitting "Run All", the Notebook should load the model trained over 390 games, then render one game of the model playing World 1-1. Replay videos can also be found in the "videos" subdirectory of the submitted .zip file.

## If running this Notebook locally, e.g. in a Conda environment, the following libraries will need to be installed:
### - gym_super_mario_bros
### - keras
### - numpy
### - absl-py (Note: this may not be necessary depending on the keras version used)

## If, however, this is being run in Google Colab, the `!pip install` statement below (currently commented out, so uncomment it first) should be the only installation needed. The models and videos subdirectories from the .zip file will also need to be uploaded to the runtime.

In [12]:
# %%capture
# !pip install gym_super_mario_bros

In [13]:
# Directories for replay videos and saved model weights.
# MODIFY THESE IF RUNNING IN GOOGLE COLAB
video_dir = "./videos/"
output_dir = "./models/"

## 1.) Import Necessary Libraries

In [14]:
# Gym imports
import gym
from gym import wrappers
from gym.wrappers import RecordVideo

# Super Mario Bros. Gym imports
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import RIGHT_ONLY

# Q-Network imports
from collections import deque
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense, Conv2D, Flatten

# Other libraries
import random
import numpy as np

## 2.) Create D3QN Class

In [15]:
class D3QN:
    """
    Our implementation of a Dueling Double Deep Q-Network that synchronizes with
    the gym-super-mario-bros Gym environment.

    The agent owns an online network (used to choose actions and trained by
    `update`), a target network (used to evaluate the next-state value), and
    a bounded replay buffer of past transitions.

    NOTE(review): `build_network` produces a single sequential stream ending
    in one linear layer; no separate value/advantage heads are built here.
    """

    # Action indices that get special treatment below. The reward shaping in
    # `update` describes index 3 as sprinting and index 4 as a running jump.
    SPRINT_ACTION = 3
    JUMP_ACTION = 4

    # Upper bound on the consecutive-jump counter used by the jump boost
    MAX_CHAINED_JUMPS = 5

    def __init__(self, states, actions):
        """
        Args:
            states: Observation shape; only the first two entries (height,
                width) are used to size the network input.
            actions: Number of discrete actions the environment accepts.
        """
        # Basic DQN parameters, derived from HuggingFace
        self.states = states
        self.actions = actions
        self.memory_buffer = deque(maxlen=2000) # Deque seems to work better than previous iteration's custom class
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00001 # Keep the decay rate very small to avoid epsilon issue like in first implementation
        self.lr = 0.001 # Smallish learning rate: longer training times, projected better rewards

        self.network = self.build_network()
        self.target_network = self.build_network() # Double DQN, so create a target network too
        self.update_target_network()

        # To incentivize the AI to jump more, instead of constantly running into pipes,
        # introduce a "jump preference" system to prefer jumping over running
        # in most cases.
        self.jump_multiplier = 1.5
        self.chained_jumps = 0

    def build_network(self):
        """
        Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras `Sequential` model taking inputs of shape
            (states[0], states[1], 1) and producing `actions + 1` linear
            outputs.
        """
        # Use three convolutional Keras layers and two Dense layers as the main network
        network = Sequential()

        # Previous implementation involved creating a custom Network class,
        # but these Keras layers seem to work fine, could potentially be
        # improved upon in later iterations
        # Used this documentation for reference: https://keras.io/api/layers/convolution_layers/convolution2d/
        network.add(Conv2D(32,
                           kernel_size = (8, 8),
                           strides = (4, 4),
                           activation = "relu",
                           input_shape = (self.states[0], self.states[1], 1)))
        network.add(Conv2D(64, kernel_size=(4, 4), strides=(2, 2), activation="relu"))
        network.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1), activation="relu"))
        network.add(Flatten())

        # Add Dense layers
        # Used this documentation for reference: https://keras.io/api/layers/core_layers/dense/
        network.add(Dense(512, activation="relu"))
        # NOTE(review): the output layer has one more unit than there are
        # actions, which is why `act` has to clamp its argmax. The width is
        # left unchanged because weight files saved with this layout would
        # presumably no longer load if it were altered -- confirm before changing.
        network.add(Dense(self.actions + 1, activation="linear"))

        # Used this documentation for reference: https://keras.io/api/optimizers/
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias
        network.compile(loss="mse", optimizer=Adam(learning_rate=self.lr)) # Incorporate optimizer for better results
        return network

    def recall(self, state, action, reward, next_state, done):
        """Add a new entry to the DQN's memory buffer for later consultation"""
        self.memory_buffer.append((state, action, reward, next_state, done))

    def update_target_network(self):
        """
        Copy the online network's weights into the target network.

        Within this class it is only called from `__init__` and `load`.
        """
        self.target_network.set_weights(self.network.get_weights())

    def act(self, state):
        """
        Choose an action for `state` using an epsilon-greedy policy.

        With probability `epsilon` a uniformly random action is returned and
        epsilon is decayed (decay only happens on these exploratory steps).
        Otherwise the online network's outputs are used, with the jump
        action's value scaled by `jump_multiplier ** chained_jumps`.

        Args:
            state: Batched observation passed straight to the network's
                `predict`.

        Returns:
            An action index in the range [0, self.actions - 1].
        """
        if np.random.rand() <= self.epsilon: # Greedy epsilon
            # Decay epsilon, never dropping below the configured floor
            self.epsilon = max(self.epsilon - self.epsilon_decay, self.epsilon_min)

            # Randomly pick an action across all possible actions in RIGHT_ONLY;
            # randrange never returns self.actions, so no clamp is needed here
            return random.randrange(self.actions)

        # Get the network's value for each output unit.
        # verbose=0 suppresses the per-call progress line Keras prints by default.
        act_values = self.network.predict(state, verbose=0)

        # Scale the running-jump value to increase its pick likelihood.
        # NOTE(review): this only raises the value when it is positive; a
        # negative value is pushed further down instead.
        act_values[0][self.JUMP_ACTION] *= self.jump_multiplier ** self.chained_jumps
        action = np.argmax(act_values[0])

        # Update chained_jumps based on the selected action
        if action == self.JUMP_ACTION:
            self.chained_jumps = min(self.chained_jumps + 1, self.MAX_CHAINED_JUMPS)
        else:
            self.chained_jumps = 0

        # The output layer has actions + 1 units, so argmax can land on an
        # index the environment does not know (this is what caused the
        # KeyError). Map that extra unit onto the last valid action.
        if action >= self.actions:
            action = self.actions - 1

        return action

    def update(self, batch_max):
        """
        Train the online network on a random sample of stored transitions.

        Each sampled transition is fitted individually. The target for the
        taken action is the shaped reward for terminal steps; otherwise it is
        the shaped reward plus `gamma` times the target network's value for
        the action the online network ranks highest in the next state.

        Args:
            batch_max: Maximum number of transitions to sample. If the buffer
                holds fewer, every stored transition is used; an empty buffer
                makes this a no-op.
        """
        batch_size = min(batch_max, len(self.memory_buffer))
        if batch_size <= 0:
            return

        # Get a random sample of past environment steps
        random_batch = random.sample(self.memory_buffer, batch_size)

        for state, action, reward, next_state, done in random_batch:
            target = self.network.predict(state, verbose=0)

            if action == self.SPRINT_ACTION:
                reward += 0.1  # Increase the reward for sprinting
            elif action == self.JUMP_ACTION:
                reward += 0.2 # Greatly increase the reward for running jumps

            # Update network prediction
            if done:
                target[0][action] = reward
            else:
                # Double DQN: the online network selects the next action and
                # the target network supplies its value.
                # TODO: Verify with Nathan if this seems sound
                best_next = np.argmax(self.network.predict(next_state, verbose=0)[0])
                next_value = self.target_network.predict(next_state, verbose=0)[0][best_next]
                target[0][action] = reward + self.gamma * next_value

            # fit() was already silent with verbose=0; the step progress lines
            # came from the predict() calls, which are now silenced as well.
            self.network.fit(state, target, epochs=1, verbose=0)

    # Save system
    def save(self, directory, episode):
        """
        Write the online network's weights to
        `directory + "SuperMarioBros_Gym_D3QN_{episode}.h5"`.

        `directory` is concatenated as-is, so it must end with a path separator.
        """
        # Using the pickle library caused several issues when loading,
        # so using the .h5 file format instead
        output_file = f"SuperMarioBros_Gym_D3QN_{episode}.h5"
        self.network.save_weights(directory + output_file)

    def load(self, directory, episode):
        """
        Load the weights written by `save` for `episode` into the online
        network, then copy them into the target network.

        `directory` is concatenated as-is, so it must end with a path separator.
        """
        # Using the pickle library caused several issues when loading,
        # so using the .h5 file format instead
        output_file = f"SuperMarioBros_Gym_D3QN_{episode}.h5"
        self.network.load_weights(directory + output_file)
        self.update_target_network()

## 3.) Create training parameters and environment

In [16]:
def env_maker():
    """
    Build the preprocessed World 1-1 environment.

    Returns:
        The 'SuperMarioBros-1-1-v0' environment restricted to the RIGHT_ONLY
        move set, with grayscale observations resized to 84x84.
    """
    # Base gym-super-mario-bros env, limited to the RIGHT_ONLY actions
    mario_env = JoypadSpace(gym.make('SuperMarioBros-1-1-v0'), RIGHT_ONLY)

    # RGB info isn't important to the model, so frames are converted to grayscale
    grayscale_env = wrappers.GrayScaleObservation(mario_env)

    # Shrinking the frames reduces file sizes when saving models
    # Inspired by Dueling DQN implementation from https://blog.paperspace.com/building-double-deep-q-network-super-mario-bros/
    return wrappers.ResizeObservation(grayscale_env, shape=(84, 84))

In [25]:
# Training-run configuration
save_iterations = 10  # A checkpoint is written whenever the game number is a multiple of this
batch_max = 64        # Replay transitions sampled per agent.update() call
trained_games = 390   # How many games have already been trained
games_to_train = 1    # How many games to train on

In [26]:
# Shared environment; its spaces determine the agent's input shape and action count
env = env_maker()
actions = env.action_space.n
states = env.observation_space.shape

## 4.) Initialize agent & load saved model if set to True

In [27]:
# Create the D3QN agent sized to the environment's observation and action spaces
agent = D3QN(states=states, actions=actions)

In [28]:
def _read_reward_log(path):
    """
    Read a reward log containing one float per line.

    Blank lines (for example a trailing newline) are skipped. If the file
    does not exist, a notice is printed and an empty list is returned so the
    rest of the notebook can still run.
    """
    try:
        with open(path, "r") as f:
            return [float(line) for line in f if line.strip()]
    except FileNotFoundError:
        print(f"No reward log found at {path}; starting with an empty history")
        return []


# Reward history. Defined unconditionally so the training cell can append to
# it even when no saved model is loaded.
reward_list = []  # Mean of the last 100 game rewards, recorded after each game
rewards = []      # Total reward of each game

loading = True
if loading and trained_games > 0:
    # Load the saved model
    agent.load(output_dir, trained_games) # The 390 seemed to perform a bit better than the 400 one
    agent.epsilon = 0.1 # TODO: Tweak epsilon value based on how many games were already trained upon

    reward_list = _read_reward_log("./rewards.txt")
    rewards = _read_reward_log("./rewards_plain.txt")

## 5.) Train model if set to True, saving model every so often

In [29]:
training = False
if training and games_to_train > 0:
    # Continue the reward history read by the loading cell; if that cell did
    # not define it, start from an empty history instead of failing.
    try:
        rewards, reward_list
    except NameError:
        rewards, reward_list = [], []

    # Training loop
    for game in range(games_to_train):
        # Game number counted across all training sessions
        game_number = game + trained_games + 1

        state = env.reset()
        # Add a leading batch dimension for the network
        state = np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]])
        done = False
        total_reward = 0

        while not done:
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            next_state = np.reshape(next_state, [1, next_state.shape[0], next_state.shape[1], next_state.shape[2]])

            agent.recall(state, action, reward, next_state, done) # Add to memory_buffer
            state = next_state

        # Learn once per finished game, as soon as enough transitions are stored
        if len(agent.memory_buffer) > batch_max:
            agent.update(batch_max)

        rewards.append(total_reward)
        average_reward = np.mean(rewards[-100:])
        reward_list.append(average_reward)
        print(f"Game: {game_number}| Reward: {total_reward} | Average Reward: {average_reward}")

        if game_number % save_iterations == 0: # Save every X iterations
            reward_list_str = [str(i) for i in reward_list]
            rewards_str = [str(i) for i in rewards]

            with open("./rewards.txt", "w") as f: f.write("\n".join(reward_list_str))
            with open("./rewards_plain.txt", "w") as f: f.write("\n".join(rewards_str))

            # D3QN exposes `save`; it has no `save_model` method
            agent.save(output_dir, game_number)
            print(f"Finished saving agent for Game #{game_number}")

    env.close()







Game: 391| Reward: 629.0 | Average Reward: 731.61


## 6.) Record replay videos of trained agent

In [22]:
def video_recorder(env_name, model, video_dir, num_episodes=0):
    """
    Play games with `model` while recording replay videos.

    Games are played until one finishes with a total reward of at least 3000
    or the episode limit is reached. A KeyboardInterrupt stops recording
    early. The environment is always closed on exit.

    Args:
        env_name: Currently unused; `env_maker()` always builds
            'SuperMarioBros-1-1-v0'. Kept so existing calls keep working.
        model: Agent whose `act(state)` method picks each action.
        video_dir: Directory handed to the `RecordVideo` wrapper.
        num_episodes: Maximum number of games to play; values <= 0 fall
            back to 50.
    """
    # Add RecordVideo wrapper to save replay videos
    env = RecordVideo(env_maker(), video_dir)

    max_num_episodes = num_episodes if num_episodes > 0 else 50
    episode_counter = 0
    total_reward = 0

    try:
        # Stop recording either when the agent beats the level, or if
        # max_num_episodes number of games have been played
        while total_reward < 3000 and episode_counter < max_num_episodes:
            state = env.reset()
            state = np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]])
            done = False

            total_reward = 0
            while not done:
                env.render()
                # Use the agent passed in rather than a notebook-level global
                action = model.act(state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                state = np.reshape(next_state, [1, next_state.shape[0], next_state.shape[1], next_state.shape[2]])

            episode_counter += 1
            print(f"Game #{episode_counter}: Reward - {total_reward}")
    except KeyboardInterrupt: # Cut off recording early
        pass
    finally:
        # Close the environment even when a game ends with an error
        env.close()

# Record a single game of the loaded agent
video_recorder(
    env_name='SuperMarioBros-1-1-v0',
    model=agent,
    video_dir=video_dir,
    num_episodes=1,
)







Game #1: Reward - 1156.0
