# UCI COMPSCI 175, Spring 2023, Project 4: Implementing a Dueling Double Deep Q-Network (D3QN) for *Super Mario Bros.*

# NOTE TO GRADERS: When hitting "Run All", the Notebook should load the model trained over 390 games, then render one game of the model playing World 1-1. Replay videos can also be found in the "videos" subdirectory of the submitted .zip file.

## 1.) Import Necessary Libraries

In [11]:
# Gym imports
import gym
from gym import wrappers
from gym.wrappers import Monitor

# Super Mario Bros. Gym imports
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import RIGHT_ONLY

# Q-Network imports
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense, Conv2D, Flatten
from collections import deque

# Other libraries
import numpy as np
import random

## 2.) Create D3QN Class

In [12]:
class D3QN:
    """
    Deep Q-learning agent for the gym-super-mario-bros Gym environment.

    Two identically built Keras networks are kept: ``model`` (trained and used
    to pick actions) and ``target_network`` (used to score the action that
    ``model`` selects for the next state, as in Double DQN). On top of the
    learned values the agent applies hand-written shaping that favours
    sprinting and running jumps.
    """

    # Action indices that receive extra shaping: 3 is treated as sprinting
    # and 4 as the running jump (see the reward bonuses in update()).
    SPRINT_ACTION = 3
    JUMP_ACTION = 4
    # Upper bound for the chained_jumps counter used by the jump preference
    MAX_CHAINED_JUMPS = 5

    def __init__(self, state_shape, action_size):
        """
        Create the agent and both of its networks.

        state_shape: observation shape; only its first two entries are used,
            as the height and width of the single-channel network input.
        action_size: number of discrete actions the environment accepts.
        """
        # Basic DQN parameters, derived from HuggingFace
        self.state_shape = state_shape
        self.action_size = action_size
        self.memory_buffer = deque(maxlen=2000) # Deque seems to work better than previous iteration's custom class
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00001 # Keep the decay rate very small to avoid epsilon issue like in first implementation
        self.learning_rate = 0.001 # Smallish learning rate: longer training times, projected better rewards

        self.model = self.build_network()
        self.target_network = self.build_network() # Double DQN, so create a target network too
        self.update_target_network()

        # To incentivize the AI to jump more, instead of constantly running into pipes,
        # introduce a "jump preference" system to prefer jumping over running
        # in most cases.
        self.jump_multiplier = 1.5
        self.chained_jumps = 0

    @staticmethod
    def _weights_path(directory, episode):
        """Return the .h5 weights path for ``episode`` inside ``directory``."""
        import os

        return os.path.join(directory, f"SuperMarioBros_Gym_D3QN_{episode}.h5")

    def build_network(self):
        """
        Build and compile one value network.

        The network is three convolutional layers followed by two Dense
        layers, compiled with MSE loss and the Adam optimizer.

        NOTE: the output layer has ``action_size + 1`` units. It is left as is
        because weights saved from this architecture can only be loaded into a
        network of the same shape; act() folds the extra index back onto the
        running-jump action.
        """
        model = Sequential()

        # Previous implementation involved creating a custom Network class,
        # but these Keras layers seem to work fine, could potentially be
        # improved upon in later iterations
        # Used this documentation for reference: https://keras.io/api/layers/convolution_layers/convolution2d/
        model.add(Conv2D(32, kernel_size=(8, 8), strides=(4, 4), activation='relu',
                         input_shape=(self.state_shape[0], self.state_shape[1], 1)))
        model.add(Conv2D(64, kernel_size=(4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())

        # Add Dense layers
        # Used this documentation for reference: https://keras.io/api/layers/core_layers/dense/
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size + 1, activation='linear'))

        # Used this documentation for reference: https://keras.io/api/optimizers/
        # `learning_rate` is the current name of the deprecated `lr` argument
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Add a new entry to the DQN's memory buffer for later consultation"""
        self.memory_buffer.append((state, action, reward, next_state, done))

    def update_target_network(self):
        """Copy the current weights of ``model`` into ``target_network``"""
        self.target_network.set_weights(self.model.get_weights())

    def act(self, state):
        """
        Pick an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned and epsilon
        is decayed (never below ``epsilon_min``). Otherwise the action with
        the highest predicted value is returned, after the running-jump value
        has been scaled by ``jump_multiplier ** chained_jumps``.

        Returns an int action index no greater than JUMP_ACTION.
        """
        if np.random.rand() <= self.epsilon: # Epsilon-greedy exploration
            # Epsilon only decays on exploratory steps
            self.epsilon = max(self.epsilon - self.epsilon_decay, self.epsilon_min)

            # Randomly pick an action; anything past the running jump is
            # folded onto it so the environment never sees an unknown index.
            return min(random.randrange(self.action_size), self.JUMP_ACTION)

        # Predicted value of every network output for this state.
        # verbose=0 silences the per-call progress line printed by predict().
        act_values = self.model.predict(state, verbose=0)

        # Scale the running-jump value to increase its pick likelihood.
        # NOTE: the boost is multiplicative, so it only raises the value when
        # it is positive; a negative value is pushed further down.
        act_values[0][self.JUMP_ACTION] *= self.jump_multiplier ** self.chained_jumps

        # The network has one more output than there are actions; picking that
        # last output used to cause a KeyError, so it is mapped to the running
        # jump. This is done before updating chained_jumps so the counter
        # reflects the action that is actually played.
        action = min(int(np.argmax(act_values[0])), self.JUMP_ACTION)

        # Update chained_jumps based on the selected action
        if action == self.JUMP_ACTION:
            self.chained_jumps = min(self.chained_jumps + 1, self.MAX_CHAINED_JUMPS)
        else:
            self.chained_jumps = 0

        return action

    def update(self, batch_max):
        """
        Train ``model`` on a random sample of remembered steps.

        batch_max: maximum number of steps to sample; fewer are used when the
            memory buffer holds fewer, and nothing happens when it is empty.

        Each sampled step is fitted on its own (one ``fit`` call per step).
        """
        batch_size = min(batch_max, len(self.memory_buffer))
        if batch_size <= 0:
            return

        # Get a random sample of past environment steps
        random_batch = random.sample(self.memory_buffer, batch_size)

        for state, action, reward, next_state, done in random_batch:
            target = self.model.predict(state, verbose=0)

            if action == self.SPRINT_ACTION:
                reward += 0.1  # Increase the reward for sprinting
            elif action == self.JUMP_ACTION:
                reward += 0.2 # Greatly increase the reward for running jumps

            # Update model prediction
            if done:
                target[0][action] = reward
            else:
                # Double DQN: the online network chooses the next action,
                # the target network supplies its value.
                best_next = np.argmax(self.model.predict(next_state, verbose=0)[0])
                next_value = self.target_network.predict(next_state, verbose=0)[0][best_next]
                target[0][action] = reward + self.gamma * next_value

            self.model.fit(state, target, epochs=1, verbose=0)

    # Save system
    def save(self, directory, episode):
        """Write the weights of ``model`` to ``directory``, creating it if needed."""
        import os

        # Using the pickle library caused several issues when loading,
        # so using the .h5 file format instead
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.model.save_weights(self._weights_path(directory, episode))

    def load(self, directory, episode):
        """Load weights saved by save() into ``model`` and sync the target network."""
        # Using the pickle library caused several issues when loading,
        # so using the .h5 file format instead
        self.model.load_weights(self._weights_path(directory, episode))
        self.update_target_network()

## 3.) Create training parameters and environment

In [13]:
def env_maker():
    """Return the World 1-1 environment with the action and observation wrappers applied."""
    # Base gym-super-mario-bros level, restricted to the RIGHT_ONLY action set
    base_env = gym.make('SuperMarioBros-1-1-v0')
    right_only_env = JoypadSpace(base_env, RIGHT_ONLY)

    # Colour is not important to the model, so frames are converted to grayscale
    gray_env = wrappers.GrayScaleObservation(right_only_env)

    # Shrink every frame to 84x84
    # Inspired by Dueling DQN implementation from https://blog.paperspace.com/building-double-deep-q-network-super-mario-bros/
    return wrappers.ResizeObservation(gray_env, shape=(84, 84))

In [14]:
# Training configuration used by the cells below
n_episodes = 200 # How many games to train on
output_dir = './models/' # Location of the .h5 weight files (passed to agent.load)
batch_max = 64 # Number of remembered steps requested per agent.update() call

In [15]:
# Build the environment once and read the sizes the agent is constructed from
env = env_maker()
state_shape = env.observation_space.shape # build_network uses its first two entries as the input height/width
action_size = env.action_space.n # Number of discrete actions exposed by the wrapped environment

## 4.) Initialize agent & load saved model if `loading` is set to True

In [16]:
# Initialize D3QN agent (its constructor builds both the online and the target network)
agent = D3QN(state_shape, action_size)

In [17]:
def _read_reward_log(path):
    """Return the floats stored one per line in ``path``, skipping blank lines."""
    with open(path, "r") as f:
        # Blank lines (e.g. a trailing newline) would make float() raise
        return [float(line) for line in f if line.strip()]


# Both histories always exist, so the training cell can append to them even
# when no saved model is loaded.
reward_list = [] # Running 100-game average reward, one entry per game
rewards = [] # Total reward of each game

loading = True
if loading:
    # Load the saved model
    agent.load(output_dir, 390) # The 390 seemed to perform a bit better than the 400 one
    agent.epsilon = 0.1 # TODO: Tweak epsilon value based on how many games were already trained upon

    # Restore the reward histories written by the training cell
    reward_list = _read_reward_log("./rewards.txt")
    rewards = _read_reward_log("./rewards_plain.txt")

## 5.) Train model if `training` is set to True, saving the model every 10 games

In [18]:
training = False
if training:
    # Training loop
    # reward_list and rewards must already exist here; when training from
    # scratch without the loading cell, uncomment the two lines below.
    # reward_list = []
    # rewards = []

    # Number given to the first game of this run; presumably it continues an
    # earlier 200-game run -- adjust when resuming from a different checkpoint.
    first_game_number = 201
    save_every = 10 # Write the reward logs and model weights every N games

    for episode in range(n_episodes):
        game_number = first_game_number + episode

        state = env.reset()
        # Add a leading batch axis so the state can be fed to the network as is
        state = np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]])
        done = False
        total_reward = 0

        while not done:
            # env.render()  # Uncomment to watch the agent while it trains
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            next_state = np.reshape(next_state, [1, next_state.shape[0], next_state.shape[1], next_state.shape[2]])
            agent.remember(state, action, reward, next_state, done)
            state = next_state

        # One learning pass per finished game, once enough steps are remembered
        if len(agent.memory_buffer) > batch_max:
            agent.update(batch_max)

        rewards.append(total_reward)
        average_reward = np.mean(rewards[-100:])
        reward_list.append(average_reward)
        print(f"Episode: {game_number}, Total Reward: {total_reward}, Average Reward: {average_reward}, Epsilon: {agent.epsilon}")

        if game_number % save_every == 0:
            with open("./rewards.txt", "w") as f:
                f.write("\n".join(str(value) for value in reward_list))

            with open("./rewards_plain.txt", "w") as f:
                f.write("\n".join(str(value) for value in rewards))

            # D3QN exposes save(); the previous save_model() call raised AttributeError
            agent.save(output_dir, game_number)
            print(f"Saved agent for Game #{game_number}")

    env.close()

## 6.) Record replay videos of trained agent

In [19]:
def video_recorder(env_name, model, video_dir, num_episodes=0):
    """
    Play games with ``model`` while recording every one of them.

    env_name: currently unused; the environment always comes from
        env_maker(), which hard-codes the level.
    model: agent whose ``act(state)`` method chooses each action.
    video_dir: directory handed to the Monitor wrapper for its output.
    num_episodes: maximum number of games to play; 0 or less means 50.

    Recording stops early once a game reaches a total reward of 3000 (used
    as the sign that the level was beaten) or on KeyboardInterrupt. The
    environment is always closed before returning.
    """
    # A request for exactly one game must be honoured, so compare against 0
    max_num_episodes = num_episodes if num_episodes > 0 else 50

    # Add Monitor wrapper to save replay videos of every episode
    env = Monitor(env_maker(), video_dir, video_callable=lambda episode_id: True, force=True)

    try:
        episode_counter = 0
        total_reward = 0

        # Stop recording either when the agent beats the level, or if
        # max_num_episodes number of games have been played
        while total_reward < 3000 and episode_counter < max_num_episodes:
            state = env.reset()
            # Add a leading batch axis so the state can be fed to the network
            state = np.reshape(state, [1, state.shape[0], state.shape[1], state.shape[2]])
            done = False

            total_reward = 0
            while not done:
                env.render()
                action = model.act(state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                state = np.reshape(next_state, [1, next_state.shape[0], next_state.shape[1], next_state.shape[2]])

            episode_counter += 1
            print(f"Game #{episode_counter}: Reward - {total_reward}")
    except KeyboardInterrupt: # Cut off recording early
        pass
    finally:
        # Close the recorder even when a game fails part-way through
        env.close()

# Directory handed to the Monitor wrapper for the replay files
video_dir = "./videos/"
# Run the recorder with the loaded agent; the last argument is num_episodes
video_recorder('SuperMarioBros-1-1-v0', agent, video_dir, 1)



Exception ignored in: <function Monitor.__del__ at 0x000001A4E8360700>
Traceback (most recent call last):
  File "C:\Users\rande\AppData\Roaming\Python\Python310\site-packages\gym\wrappers\monitor.py", line 289, in __del__
    self.close()
  File "C:\Users\rande\AppData\Roaming\Python\Python310\site-packages\gym\wrappers\monitor.py", line 178, in close
    super(Monitor, self).close()
  File "C:\Users\rande\AppData\Roaming\Python\Python310\site-packages\gym\core.py", line 298, in close
    return self.env.close()
  File "C:\Users\rande\AppData\Roaming\Python\Python310\site-packages\gym\core.py", line 298, in close
    return self.env.close()
  File "C:\Users\rande\AppData\Roaming\Python\Python310\site-packages\gym\core.py", line 298, in close
    return self.env.close()
  [Previous line repeated 2 more times]
  File "E:\Anaconda\Installation\envs\NES\lib\site-packages\nes_py\nes_env.py", line 346, in close
    raise ValueError('env has already been closed.')
ValueError: env has already