In [1]:
import tensorflow as tf
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from IPython.display import clear_output
from tensorflow.keras.models import save_model, load_model
import time

2022-03-03 05:13:56.711278: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2022-03-03 05:13:56.711351: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Build the Super Mario Bros gym environment and wrap it in JoypadSpace so the
# agent chooses only among the RIGHT_ONLY action set.
env = JoypadSpace(gym_super_mario_bros.make("SuperMarioBros-v0"), RIGHT_ONLY)

In [6]:
# Random-policy rollout: drive the environment with uniformly sampled actions,
# printing the `info` dict of the latest step and accumulating the reward.

total_reward, done = 0, True  # done=True forces a reset before the first step

for step in range(100_000):
    env.render()

    # Start a fresh episode whenever the previous one has finished.
    if done:
        state = env.reset()

    sampled_action = env.action_space.sample()
    state, reward, done, info = env.step(sampled_action)
    print(info)
    total_reward = total_reward + reward

    # Replace the previous step's printout instead of appending to it.
    clear_output(wait=True)

env.close()

KeyboardInterrupt: 

In [10]:
class DQNAgent:
    """Deep Q-Network agent with a replay buffer and a separate target network.

    The agent keeps two identically built convolutional networks: the main
    network, which is trained and used to pick actions, and the target
    network, which supplies the bootstrap value for the next state and is
    refreshed only when `update_target_network` is called.
    """

    def __init__(self, state_size, action_size):
        """Create the agent.

        Args:
            state_size: input shape handed to the first Conv2D layer.
            action_size: number of discrete actions (units of the output layer).
        """
        # agent variables
        self.state_space = state_size
        self.action_space = action_size
        self.memory = deque(maxlen=5000)  # replay buffer; oldest transitions drop out
        self.gamma = .8                   # discount factor

        # exploration vs exploitation
        self.epsilon = 1
        self.max_exploration = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = .0001

        # NN
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()  # set weights of main net to target network

    def build_network(self):
        """Build and compile the Q-network.

        Three ReLU convolutional layers, a flatten, two ReLU dense layers
        (512 and 256 units) and a linear output with one unit per action.
        Compiled with MSE loss and a default-configured Adam optimizer.

        Returns:
            The compiled Sequential model.
        """
        model = Sequential()
        model.add(Conv2D(64, (4,4), strides=4, padding='same', input_shape=self.state_space))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (4,4), strides=2, padding='same'))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())

        model.add(Dense(512, activation='relu'))
        model.add(Dense(256, activation='relu'))

        model.add(Dense(self.action_space, activation='linear'))

        model.compile(loss='mse', optimizer=Adam())

        return model

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        # avoid oscillation
        self.target_network.set_weights(self.main_network.get_weights())

    def act(self, state):
        """Pick an action with an epsilon-greedy policy.

        With probability `self.epsilon` a uniformly random action index is
        returned; otherwise the argmax of the main network's prediction for
        `state` (first row of the prediction) is returned.
        """
        # epsilon greedy: explore while epsilon is large, otherwise exploit the main net
        if random.uniform(0, 1) < self.epsilon:
            return np.random.randint(self.action_space)  # env.action_space.sample

        Q_value = self.main_network.predict(state)

        return np.argmax(Q_value[0])

    def update_epsilon(self, episode):
        """Exponentially decay epsilon from `max_exploration` towards `min_epsilon`.

        Args:
            episode: episode index used as the decay exponent's time variable.
        """
        # decays epsilon
        self.epsilon = self.min_epsilon + (self.max_exploration - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)

    def train(self, batch_size):
        """Fit the main network on a random minibatch drawn from replay memory.

        For every sampled transition the main network's current prediction is
        used as the regression target, with only the entry of the taken action
        replaced by the reward (terminal transition) or by
        `reward + gamma * max(target_network(next_state))`.

        Args:
            batch_size: number of transitions to sample. Clamped to the number
                of stored transitions; nothing happens if that is zero.
        """
        # random.shuffle() shuffles in place and returns None, so it cannot
        # produce a minibatch; random.sample() draws without replacement.
        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return

        # Sample from a list copy: indexing into the middle of a deque is O(n).
        minibatch = random.sample(list(self.memory), sample_size)

        # get variables from batch to find q-value
        for state, action, reward, next_state, done in minibatch:
            target = self.main_network.predict(state)

            if done:
                target[0][action] = reward
            else:
                # bootstrap from the target network to avoid oscillation
                target[0][action] = (reward + self.gamma * np.amax(self.target_network.predict(next_state)))

            self.main_network.fit(state, target, epochs=1, verbose=0)

    def store_transition(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) tuple to replay memory."""
        self.memory.append((state, action, reward, next_state, done))

In [8]:
# Network input shape: one single-channel (grayscale) 80x88 frame.
state_space = (80, 88, 1)

# Number of discrete actions exposed by the wrapped environment.
action_space = env.action_space.n

from PIL import Image

def preprocess_state(state):
    """Shrink a raw frame and convert it to grayscale.

    The frame is wrapped in a PIL image, resized to 88x80 (PIL sizes are
    (width, height)) and converted to mode 'L'.

    Args:
        state: frame array accepted by `Image.fromarray`.

    Returns:
        A numpy array built from the resized grayscale image.
    """
    frame = Image.fromarray(state).resize((88, 80)).convert('L')
    return np.array(frame)

In [6]:
# Training budget and replay minibatch size.
num_episodes = 1_000_000   # outer loop: number of episodes to run
num_timesteps = 400_000    # inner loop: maximum environment steps per episode
batch_size = 64            # transitions requested from replay memory per training call

In [9]:
# Instantiate the agent; this builds both the main and the target network.
dqn = DQNAgent(state_size=state_space, action_size=action_space)

2022-03-03 05:39:15.194963: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-03 05:39:15.195001: W tensorflow/stream_executor/cuda/cuda_driver.cc:312] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-03 05:39:15.195032: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (rex-HP-EliteBook-x360-1030-G2): /proc/driver/nvidia/version does not exist
2022-03-03 05:39:15.195275: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-03 05:39:15.232001: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Fre

In [None]:
# Main DQN training loop: play episodes, store every transition in replay
# memory, and fit the agent on a sampled minibatch after each step once the
# warm-up condition below is met.
print('Satrting training')

for i in range(num_episodes):
    Return = 0
    done = False
    time_step = 0

    # Each episode starts from a freshly reset, preprocessed frame with a
    # leading batch axis so it can be fed straight to the network.
    state = preprocess_state(env.reset())
    state = state.reshape(-1, 80, 88, 1)

    for t in range(num_timesteps):
        env.render()
        time_step += 1

        action = dqn.act(state)

        next_state, reward, done, info = env.step(action)

        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1, 80, 88, 1)

        dqn.store_transition(state, action, reward, next_state, done)

        state = next_state

        Return += reward
        print("Episode is {}\tTotal time Step: {}\tCurrent Reward: {}\tEpsilon is: {}".format(str(i), str(time_step), str(Return), str(dqn.epsilon)))

        clear_output(wait=True)

        # Train only after the first few episodes and once the buffer holds
        # more than one minibatch of transitions.
        if len(dqn.memory) > batch_size and i > 5:
            dqn.train(batch_size)

        # The episode is over: stop stepping. Without this the loop keeps
        # calling env.step() on a finished episode until num_timesteps runs
        # out; the environment has to be reset first.
        if done:
            break

    # Per-episode bookkeeping: decay exploration and sync the target network.
    dqn.update_epsilon(i)
    clear_output(wait=True)
    dqn.update_target_network()

    # save model


env.close()