In [167]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
import time
import random
import numpy as np
from collections import deque
from matplotlib import pyplot as plt
from tensorflow.keras import layers, models
from tensorflow.keras.losses import Huber

nb_actions = len(RIGHT_ONLY)

In [168]:
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
import tensorflow as tf2

class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        return tf2.image.rgb_to_grayscale(observation)

class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = shape
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        return tf2.squeeze(tf2.image.resize(observation, size=self.shape))


In [171]:
def copy_weights(source, destination):
    destination.set_weights(source.get_weights())
    
def create_model():
    Q_star = models.Sequential()
    Q_star.add(layers.Input((84, 84, 4)))
    Q_star.add(layers.Conv2D(filters=32, kernel_size=8, strides=4, activation='relu'))
    Q_star.add(layers.Conv2D(filters=64, kernel_size=4, strides=2, activation='relu'))
    Q_star.add(layers.Conv2D(filters=64, kernel_size=3, strides=1, activation='relu'))
    Q_star.add(layers.Flatten())
    #Q_star.add(layers.Dense(512, activation='relu'))
    Q_star.add(layers.Dense(nb_actions))
    
    Q_star.compile(optimizer='adam',
        loss=Huber())

    return Q_star

class DQNAgent:
    """ DQN agent """
    def __init__(self, states, actions, max_memory, double_q):
        self.states = states
        self.actions = actions
        self.build_model()
        self.memory = deque(maxlen=max_memory)
        self.eps = 1
        self.eps_decay = 0.99999975 # linear decrease epsilon
        self.eps_min = 0.1
        self.gamma = 0.90
        self.batch_size = 32
        self.burnin = 500 # after this number of frames the agent starts to replace random actions by policy
        self.copy = 10000 # copy online into target every 10000 steps
        self.step = 0 # step counter
        self.learn_each = 3
        self.learn_step = 0
        self.save_each = 500000
        self.double_q = double_q

    def build_model(self):
        """ Model builder function """
        self.output = create_model()
        self.output_target = create_model()
        self.copy_model()

    def copy_model(self):
        """ Copy weights to target network """
        copy_weights(self.output, self.output_target)

    def add(self, experience):
        """ Add observation to experience """
        self.memory.append(experience)

    def predict(self, model, state):
        """ Prediction """
        if model == 'online':
            return self.output.predict(np.array([np.array(state, dtype=np.uint8).T]))
        if model == 'target':
            return self.output_target.predict(np.array(state))

    def run(self, state):
        """ Perform action """
        if np.random.rand() < self.eps:
            # Random action
            action = np.random.randint(low=0, high=self.actions)
        else:
            # Policy action
            q = self.predict('online', state)
            action = np.argmax(q)
        # Decrease eps
        if self.step >= self.burnin:
            self.eps *= self.eps_decay
            self.eps = max(self.eps_min, self.eps)
        # Increment step
        self.step += 1
        return action

    def learn(self):
        """ Gradient descent """
        # Sync target network
        if self.step % self.copy == 0:
            self.copy_model()
        # Checkpoint model
        # TODO : save model
        # Break if burn-in
        if self.step < self.burnin:
            return
        # Break if no training
        if self.learn_step < self.learn_each:
            self.learn_step += 1
            return
        # Sample batch from memory
        batch = random.sample(agent.memory, agent.batch_size)
        state, next_state, action, reward, done = map(np.array, zip(*batch))
        features = np.array([A.T for A in next_state])
        # Compute estimated reward for each state un the batch (building targets to learn with)
        predictions = agent.output_target.predict(features)
        targets = reward + (1 - done) * agent.gamma * np.amax(predictions, axis=1)
        agent.output.fit(x=features, y=targets, batch_size=agent.batch_size, epochs=1, verbose=0)
        # Reset learn step
        self.learn_step = 0

In [172]:
import time
import numpy as np
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY

# Build env (first level, right only)
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, RIGHT_ONLY)
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=(84,84))
env = FrameStack(env, num_stack=4)

# Parameters
states = (84, 84, 4)
actions = nb_actions

# Agent
agent = DQNAgent(states=states, actions=actions, max_memory=10000, double_q=True)

# Episodes
episodes = 10
rewards = []

# Timing
start = time.time()
step = 0

# Main loop
for e in range(episodes):

    # Reset env
    state = env.reset()

    # Reward
    total_reward = 0
    iter = 0

    # Play
    while True:

        # Show env (disabled)
        env.render()

        # Run agent
        action = agent.run(state=state)

        # Perform action
        next_state, reward, done, info = env.step(action=action)

        # Remember transition
        agent.add(experience=(state, next_state, action, reward, done))

        # Update agent
        agent.learn()

        # Total reward
        total_reward += reward

        # Update state
        state = next_state

        # Increment
        iter += 1

        # If done break loop
        if done or info['flag_get']:
            print(done)
            break

    # Rewards
    rewards.append(total_reward / iter)

    # Print
    if e % 1 == 0:
        print('Episode {e} - +'
              'Frame {f} - +'
              'Frames/sec {fs} - +'
              'Epsilon {eps} - +'
              'Mean Reward {r}'.format(e=e,
                                       f=agent.step,
                                       fs=np.round((agent.step - step) / (time.time() - start)),
                                       eps=np.round(agent.eps, 4),
                                       r=np.mean(rewards[-100:])))
        start = time.time()
        step = agent.step

env.close()

True
Episode 0 - +Frame 36 - +Frames/sec 88.0 - +Epsilon 1 - +Mean Reward 6.527777777777778
True
Episode 1 - +Frame 191 - +Frames/sec 101.0 - +Epsilon 1 - +Mean Reward 5.3187275985663085
True
Episode 2 - +Frame 342 - +Frames/sec 103.0 - +Epsilon 1 - +Mean Reward 5.85045416063361
True
Episode 3 - +Frame 428 - +Frames/sec 106.0 - +Epsilon 1 - +Mean Reward 6.178538294893812
True
Episode 4 - +Frame 583 - +Frames/sec 51.0 - +Epsilon 1.0 - +Mean Reward 5.99057257139892
True
Episode 5 - +Frame 960 - +Frames/sec 39.0 - +Epsilon 0.9999 - +Mean Reward 5.331666355918199
True
Episode 6 - +Frame 1704 - +Frames/sec 41.0 - +Epsilon 0.9997 - +Mean Reward 4.747035063905307
True
Episode 7 - +Frame 2485 - +Frames/sec 38.0 - +Epsilon 0.9995 - +Mean Reward 4.263610866576554
True
Episode 8 - +Frame 2675 - +Frames/sec 40.0 - +Epsilon 0.9995 - +Mean Reward 4.226718431108984
True
Episode 9 - +Frame 2957 - +Frames/sec 40.0 - +Epsilon 0.9994 - +Mean Reward 4.070713254664752
