In [141]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

import numpy as np
import matplotlib.pyplot as plt

nb_actions = len(gym_super_mario_bros.actions.SIMPLE_MOVEMENT)

def extract_data_from_ram(ram):
    return np.hstack((
        ram[0x0003], 
        ram[0x000E], 
        ram[0x000F:0x0013], 
        ram[0x0016:0x001A],
        ram[0x001D],
        ram[0x001E:0x0023],
        ram[0x0033],
        ram[0x0045],
        ram[0x0046:0x004A],
        ram[0x004B],
        ram[0x0057],
        ram[0x0058:0x005C],
        ram[0x006D],
        ram[0x006E:0x0072],
        ram[0x0086],
        ram[0x0087:0x008B],
        ram[0x008C],
        ram[0x008D],
        ram[0x00D5],
        ram[0x009F],
        ram[0x00A0:0x00A4],
        ram[0x00B5],
        ram[0x00B6:0x00BA],
        ram[0x00BB],
        ram[0x00CE],
        ram[0x00CF:0x00D3],
        ram[0x03AD],
        ram[0x03AE:0x03B2],
        ram[0x03B3],
        ram[0x03B8],
        ram[0x03B9:0x03BD],
        ram[0x03BE],
        ram[0x03AF],
        ram[0x03BA],
        ram[0x0400],
        ram[0x04AC:0x04AF],
        ram[0x04B0:0x04C3],
        ram[0x04AC:0x04AF],
        ram[0x04C4:0x04C7],
        ram[0x04D0:0x04DF],
        ram[0x0705],
        ram[0x0754],
        ram[0x0756],
        ))

(118,)

In [142]:
# Initializing environment
env = gym_super_mario_bros.make('SuperMarioBros-v0')
ram = env.ram
env = JoypadSpace(env, gym_super_mario_bros.actions.SIMPLE_MOVEMENT)

# Initializing hyper-parameters
max_epsilon = 1 # Initial value for epsilon greedy exploration
min_epsilon = 0.01 # Final value for epsilon greedy exploration
epsilon_decay_steps = 500000
gamma = 0.99 # Coefficient d'actualisation
batch_size = 32
agent_history_length = 4
target_network_update_frequency = 500

learning_rate = 0.00025
gradient_momentum = 0.95
squared_gradient_momentum = 0.95
min_squared_gradient = 0.01

state_dim = 118
N_start = 5000
N = 10000 # Capacity of replay memory

# Initializing replay memory
from collections import deque

D = deque(maxlen=N)

In [143]:
# Initializing network
import tensorflow as tf
from tensorflow.keras.models import clone_model, Sequential
from tensorflow.keras.layers import Dense, Conv2D, Conv1D, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import Model

def build_model(name='Q'):
    Q_star = Sequential(name=name)
    Q_star.add(Input(shape=(state_dim,)))
    Q_star.add(Dense(state_dim // 8, activation='relu'))
    Q_star.add(Dense(state_dim // 16, activation='relu'))
    Q_star.add(Dense(state_dim // 32, activation='relu'))
    Q_star.add(Dense(nb_actions, activation='linear'))
    return Q_star

def copy_weights(source, destination):
    destination.set_weights(source.get_weights())

def get_initialized_networks():
    loss_object = tf.keras.losses.MeanSquaredError()
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate, momentum=gradient_momentum)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    Q_star = build_model('Q_star')
    Q_star_minus = build_model('Q_star_minus')

    Q_star.compile(optimizer=optimizer,
        loss=loss_object,
        metrics=['accuracy'])
    copy_weights(Q_star, Q_star_minus)
    return Q_star, Q_star_minus 

In [144]:
# Get initialized networks Q* and Q*-
Q_star, Q_star_minus = get_initialized_networks()

In [145]:
from collections import namedtuple

Transition = namedtuple('Transition', ["state", "action", "reward", "next_state", "done"])

# resetting environnement state
env.reset()
new_state = extract_data_from_ram(ram)

def add_to_replay(D, transition):
        D.append(transition)
    
D = deque()

# starting to build experience replay from random behavior
for i in range(N_start):
    action = env.action_space.sample()
    old_state = new_state
    state, reward, done, info = env.step(action)
    new_state = extract_data_from_ram(ram)
    D.append(Transition(old_state, action, reward, new_state, done))

In [157]:
import random

# Training the model now that our experience replay is partially filled
nb_episodes = 10000
episode_rewards = []
best_episode_reward = 0

epsilons = np.linspace(min_epsilon, max_epsilon, epsilon_decay_steps)
frame_counter = 0

for i in range(nb_episodes):
    env.reset()
    new_state = extract_data_from_ram(ram)
    loss = None
    done = False
    r_sum = 0
    mean_episode_reward = 0

    if episode_rewards:
        mean_episode_reward = np.mean(episode_rewards)
    if best_episode_reward < mean_episode_reward:
        best_episode_reward = mean_episode_reward

    while not done:
        # Get epsilon for this step => epsilon greedy policy
        epsilon = epsilons[-frame_counter if frame_counter < epsilon_decay_steps else 0]
        # Update target network Q_star_minus
        if frame_counter % target_network_update_frequency == 0:
            copy_weights(Q_star, Q_star_minus)
        print("\r Epsilon ({}) ReplayMemorySize : ({}) rSum: ({}) best_epi_reward: ({}) OptiStep ({}) @ Episode {}/{}, loss: {}".format(epsilon, len(D), mean_episode_reward, best_episode_reward, frame_counter, i + 1, nb_episodes, loss), end="")
        sys.stdout.flush()

        # Select action using epsilon greedy policy
        random_value = random.random()
        if random_value < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_star.predict(state.reshape(1, state_dim)))

        _, reward, done, info = env.step(action)
        old_state = new_state
        new_state = extract_data_from_ram(ram)
        r_sum += reward

        add_to_replay(D, Transition(old_state, action, reward, new_state, done))

        #TODO : learning

 Epsilon (0.01) ReplayMemorySize : (6137) rSum: (0) best_epi_reward: (0) OptiStep (0) @ Episode 1/10000, loss: NoneException ignored in: <function IteratorResourceDeleter.__del__ at 0x7febd86a2550>
Traceback (most recent call last):
  File "/home/seladus/.local/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 537, in __del__
    gen_dataset_ops.delete_iterator(
  File "/home/seladus/.local/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1137, in delete_iterator
    _result = pywrap_tfe.TFE_Py_FastPathExecute(
KeyboardInterrupt: 
 Epsilon (0.01) ReplayMemorySize : (6375) rSum: (0) best_epi_reward: (0) OptiStep (0) @ Episode 1/10000, loss: None

KeyboardInterrupt: 