In [1]:
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import cv2
import tensorflow

In [2]:
import os
import keras.layers
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.callbacks import TensorBoard
from keras.optimizers import adam_v2
from collections import deque
import numpy as np
import random
import base64
from tqdm import tqdm

In [5]:
import os
# Pin CUDA device ordering to PCI bus order and set an empty visible-device
# list for this process.
# NOTE(review): the recorded cell outputs still log a GPU plugin optimizer being
# enabled, so this may not be disabling the accelerator on this machine - confirm.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="",
)

In [6]:
from gym.wrappers import FrameStack, GrayScaleObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from matplotlib import pyplot as plt

In [7]:
# Build the environment as one wrapper chain (innermost first):
#   1. the base 'SuperMarioBros-v0' environment,
#   2. JoypadSpace restricting the controller to the SIMPLE_MOVEMENT action list,
#   3. GrayScaleObservation with keep_dim=True so the channel axis is retained.
env = GrayScaleObservation(
    JoypadSpace(
        gym_super_mario_bros.make('SuperMarioBros-v0'),
        SIMPLE_MOVEMENT,
    ),
    keep_dim=True,
)


In [8]:
# Observation shape reported by the wrapped environment; later passed to Agent
# as the network's input shape.
input_shape = env.observation_space.shape

In [9]:
# Number of discrete actions exposed by the wrapped environment; later passed
# to Agent as the size of the Q-value output layer.
action_size = env.action_space.n

In [60]:
# --- Replay buffer / training schedule -------------------------------------
REPLAY_MEMORY_SIZE = 1001       # maxlen of the agent's replay deque
MIN_REPLAY_MEMORY_SIZE = 1000   # Agent.train returns early until this many transitions are stored
MINIBATCH_SIZE = 64             # transitions sampled from the replay deque per Agent.train call
TARGET_STEP = 5                 # target network is synced once its counter exceeds this value
EPISODES = 2000                 # number of episodes run by StartTrain
gamma = 0.99                    # discount applied to the max future Q-value in Agent.train

# --- Checkpoint directory ---------------------------------------------------
output_dir = 'model-output/superMario'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# --- Exploration schedule ---------------------------------------------------
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.9995          # multiplicative decay applied to epsilon
MIN_EPSILON = 0.001             # floor below which epsilon is not decayed
MIN_REWARD = -200               # NOTE(review): not referenced anywhere in the visible notebook


In [66]:
class Agent:
    """Deep Q-learning agent with an online network and a target network.

    Transitions are stored in a bounded replay deque. ``train`` samples a
    minibatch from it, regresses the online network towards one-step
    bootstrapped targets computed with the target network, and periodically
    copies the online weights into the target network.

    Frames are divided by 255 everywhere they are fed to a network (both when
    predicting and when fitting), so stored transitions should hold raw frames.
    """

    def __init__(self, input_shape, action_space_size):
        """Create both networks and an empty replay buffer.

        Args:
            input_shape: shape of a single observation, used as the model's
                input shape.
            action_space_size: number of discrete actions, i.e. the width of
                the Q-value output layer.
        """
        self.input_shape = input_shape
        self.action_space_size = action_space_size
        self.target_update_counter = 0
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        self.model = self.create_model()
        self.target_model = self.create_model()

        # Start the target network as an exact copy of the online network.
        self.target_model.set_weights(self.model.get_weights())

    def create_model(self):
        """Build and compile the Q-network.

        Two Conv2D -> ReLU -> MaxPooling2D -> Dropout stages, then Flatten,
        a 32-unit linear Dense layer and a linear Dense layer with one output
        per action. Compiled with MSE loss and Adam (learning rate 0.001).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(keras.layers.Input(shape=self.input_shape))
        model.add(Conv2D(256, (5, 5)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Conv2D(128, (3, 3)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Flatten())
        model.add(Dense(32, activation='linear'))
        model.add(Dense(self.action_space_size, activation="linear"))
        model.compile(loss="mse", optimizer=adam_v2.Adam(learning_rate=0.001))

        return model

    def update_replay_memory(self, transition):
        """Append one transition to the replay deque.

        Args:
            transition: tuple ``(state, action, reward, new_state, done)``;
                ``train`` unpacks stored items in exactly this order.
        """
        self.replay_memory.append(transition)

    def get_qs(self, state):
        """Return the online network's Q-values for a single observation.

        Args:
            state: one raw (un-normalised) observation; anything
                ``np.asarray`` can convert.

        Returns:
            1-D array with one Q-value per action.
        """
        frame = np.asarray(state) / 255
        # Take the shape from the converted array so non-ndarray inputs work too.
        batch = frame.reshape((-1,) + frame.shape)
        return self.model.predict(batch, verbose=0)[0]

    def load(self, name):
        """Load online-network weights from ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save online-network weights to ``name``."""
        self.model.save_weights(name)

    def train(self, terminal_state, step):
        """Run one minibatch update of the online network.

        Does nothing until the replay deque holds at least
        ``MIN_REPLAY_MEMORY_SIZE`` transitions.

        Args:
            terminal_state: whether the transition just taken ended the
                episode; each terminal call advances the target-sync counter.
            step: current step index within the episode (accepted for
                interface compatibility; not used).
        """
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Normalised frames: used for BOTH prediction and fitting so the
        # network always sees inputs on the same [0, 1] scale.
        curr_states = np.array([transition[0] / 255 for transition in minibatch])
        new_states = np.array([transition[3] / 255 for transition in minibatch])

        # Start from the current predictions so that only the taken action's
        # Q-value contributes to the loss for each sample.
        targets = np.array(self.model.predict(curr_states, verbose=0))
        future_qs_vals = self.target_model.predict(new_states, verbose=0)

        for idx, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                # No bootstrapping past the end of an episode.
                new_q = reward
            else:
                new_q = reward + gamma * np.max(future_qs_vals[idx])
            targets[idx][action] = new_q

        self.model.fit(curr_states, targets, batch_size=8, verbose=0, shuffle=False)

        # The counter advances once per terminal training call; the target
        # network is refreshed when it exceeds TARGET_STEP.
        if terminal_state:
            self.target_update_counter += 1
        if self.target_update_counter > TARGET_STEP:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

In [59]:
# Roll out a few episodes with the current epsilon-greedy policy, without any
# learning, and report the cumulative reward of each.
# NOTE(review): this cell needs `agent` (created in a later cell) and `epsilon`
# to already exist in the kernel; it will fail on a fresh top-to-bottom run.
EVAL_EPISODES = 10
for ep in range(EVAL_EPISODES):
    state = env.reset()
    cul = 0  # cumulative reward for this episode
    done = False
    while not done:
        env.render()
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(state))
        else:
            action = np.random.randint(0, agent.action_space_size)

        new_state, reward, done, _ = env.step(action)
        cul += reward
        state = new_state

    # Report this loop's own counters (`ep`, `cul`); the original referenced
    # `episode` / `episode_reward`, which are not defined in this cell.
    print("episode: {}/{}, score: {:.2f}".format(ep, EVAL_EPISODES, cul))
                




2022-07-04 16:20:20.154117: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 

In [67]:
def StartTrain():
    """Train the global ``agent`` on the global ``env`` for ``EPISODES`` episodes.

    Each step: pick an epsilon-greedy action, step the environment, store the
    transition in the agent's replay memory, run one training update, and decay
    the global ``epsilon``. The reward of the terminal step is replaced by -20.

    After every 50th episode (including episode 0) the online-network weights
    are saved once to ``<output_dir>/weights_<episode:04d>.hdf5``.

    Side effects: mutates the module-level ``epsilon``, renders the
    environment, prints one line per episode and writes checkpoint files.
    """
    global epsilon

    for episode in range(EPISODES):
        episode_reward = 0
        step = 1
        curr_state = env.reset()
        done = False

        while not done:
            env.render()
            if np.random.random() > epsilon:
                action = np.argmax(agent.get_qs(curr_state))
            else:
                action = np.random.randint(0, agent.action_space_size)

            new_state, reward, done, _ = env.step(action)

            # Terminal steps get a fixed penalty instead of the env's reward.
            reward = reward if not done else -20

            episode_reward += reward

            agent.update_replay_memory((curr_state, action, reward, new_state, done))
            agent.train(done, step)

            curr_state = new_state
            step += 1

            # Epsilon is decayed once per environment step, as in the original.
            # NOTE(review): at 0.9995 per step epsilon reaches MIN_EPSILON after
            # roughly 14k steps - confirm per-step (not per-episode) is intended.
            if epsilon > MIN_EPSILON:
                epsilon *= EPSILON_DECAY
                epsilon = max(MIN_EPSILON, epsilon)

        print("episode: {}/{}, score: {:.2f}".format(episode, EPISODES, episode_reward))

        # Checkpoint once per qualifying episode (not on every step), and put
        # the file inside output_dir rather than next to it.
        if episode % 50 == 0:
            agent.save(os.path.join(output_dir, "weights_{:04d}.hdf5".format(episode)))
                
                
                

In [68]:
# Re-derive the network dimensions from the wrapped environment right before
# constructing the agent (same expressions as the earlier shape/action cells).
input_shape = env.observation_space.shape
action_size = env.action_space.n

In [69]:
# Global agent instance; StartTrain and the rollout cell read it by name.
agent = Agent(input_shape,action_size)

In [70]:
# Sanity check: a freshly constructed agent has an empty replay buffer.
len(agent.replay_memory)

0

In [71]:

# Run training and always release the environment afterwards. The recorded
# output of this cell ends in a KeyboardInterrupt, in which case a plain
# `env.close()` on the following line would never have been reached.
try:
    StartTrain()
finally:
    env.close()



2022-07-04 16:26:46.057907: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-07-04 16:27:38.066440: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-07-04 16:27:40.380412: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 