In [3]:
import tensorflow as tf
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from IPython.display import clear_output
from tensorflow.keras.models import save_model, load_model
import time

In [4]:
# Build the Super Mario Bros gym environment and wrap it in JoypadSpace so the
# action space is limited to the RIGHT_ONLY button combinations.
env = JoypadSpace(gym_super_mario_bros.make("SuperMarioBros-v0"), RIGHT_ONLY)

In [6]:
# Smoke-test the environment: play uniformly random actions, render every
# frame, and show the latest `info` dict returned by the environment.

total_reward = 0
done = True  # start "done" so the first iteration performs the initial reset

try:
    for step in range(100000):
        # Reset first so an episode exists before anything is rendered, both
        # on the very first pass and after each finished episode.
        if done:
            state = env.reset()

        env.render()

        state, reward, done, info = env.step(env.action_space.sample())
        print(info)
        total_reward += reward
        # Replace the previous printout instead of flooding the cell output.
        clear_output(wait=True)
finally:
    # Runs even when the kernel is interrupted mid-loop (the usual way to stop
    # this long demo), so the environment is always closed.
    env.close()

KeyboardInterrupt: 

In [None]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # agent variables
        self.state_space = state_size
        self.action_space = action_size
        self.memory = deque(maxlen=5000)

        # exploration vs exploitation
        self.epsilon = 1
        self.max_exploration = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = .0001

        # NN
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()  # set weights of main net to target network

    def build_network(self):
        """Assemble and compile the convolutional Q-value network.

        The stack is three Conv2D layers (each followed by a ReLU activation),
        a Flatten, two ReLU Dense layers, and a linear Dense layer with one
        output per action. The model is compiled with MSE loss and an Adam
        optimizer created with its default arguments.

        Returns:
            The compiled ``Sequential`` model.
        """
        layer_stack = [
            # convolutional feature extractor
            Conv2D(64, (4,4), strides=4, padding='same', input_shape=self.state_space),
            Activation('relu'),
            Conv2D(64, (4,4), strides=2, padding='same'),
            Activation('relu'),
            Conv2D(64, (3, 3), strides=1, padding='same'),
            Activation('relu'),
            Flatten(),
            # fully connected head
            Dense(512, activation='relu'),
            Dense(256, activation='relu'),
            # linear output: one value per action
            Dense(self.action_space, activation='linear'),
        ]

        network = Sequential(layer_stack)
        network.compile(loss='mse', optimizer=Adam())
        return network

    def update_target_network(self):
        # avoid oscillation
        self.target_network.set_weights(self.main_network.get_weights())

    def act(self, state):

        # epsilon greedy if eps is large take random action else take prediction using main net
        if random.uniform(0, 1) < self.epsilon:
            return np.random.randint(self.action_space)  # env.action_space.sample

        Q_value = self.main_network.predict(state)

        return np.argmax(Q_value[0])

    def update_epsilon(self, episode):
        # decays epsilon
        self.epsilon = self.min_epsilon + (self.max_exploration - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)
