In [1]:
from collections import deque

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import cv2 as cv

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.optimizers import Adam

import numpy as np
import pandas as pd
import random
import pickle
import time
import os
import base64
from io import BytesIO
from PIL import Image
from marshmallow import Schema, fields, post_load

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Locations default to the original author's checkout. Set CROSSYROAD_GAME_URL and/or
# CROSSYROAD_OBJ_PATH in the environment to run the notebook elsewhere without editing
# this cell.
GAME_URL = os.environ.get(
    'CROSSYROAD_GAME_URL',
    'file:///Users/shuweizhang/Documents/Studies_Local/527/CrossyRoad/web/index.html')

# save_obj()/load_obj() build file names by plain string concatenation, so OBJ_PATH must
# end with a path separator; os.path.join(path, '') appends one only when it is missing.
OBJ_PATH = os.path.join(
    os.environ.get('CROSSYROAD_OBJ_PATH',
                   '/Users/shuweizhang/Documents/Studies_Local/527/CrossyRoad/objects/'),
    '')

# Network weights written by build_model() and Task.train_nn()
MODEL_FILE_PATH = OBJ_PATH + 'model.h5'

# CSV logs (only the loss and q-value files are used by the code visible in this notebook)
LOSS_FILE_PATH = OBJ_PATH + 'loss_df.csv'
ACTIONS_FILE_PATH = OBJ_PATH + 'actions_df.csv'
Q_VALUES_FILE_PATH = OBJ_PATH + 'q_values.csv'
SCORES_FILE_PATH = OBJ_PATH + 'scores_df.csv'

# Set the browser size
WINDOW_SIZE_W = 1200
WINDOW_SIZE_H = 600

In [3]:
# Read by build_model() and Task.train_nn():
#   ACTIONS       - width of the network's output layer and range of random actions
#   IMG_*         - input_shape of the first convolution
#   LEARNING_RATE - Adam step size used when the model is first compiled
ACTIONS = 4
IMG_ROWS = IMG_COLS = 80
IMG_CHANNELS = 4
LEARNING_RATE = 1e-4

# Not referenced by the code visible in this notebook: the training loop reads the
# equivalent fields from Agent_Setting instead (whose defaults for explore and
# replay_memory differ from the values below).
INITIAL_EPSILON = 0.2
FINAL_EPSILON = 0.001
OBSERVE = 10000
EXPLORE = 100000
REPLAY_MEMORY = 50000
BATCH = 32
GAMMA = 0.99

In [6]:
def save_obj(obj, name):
    """Pickle `obj` into the file OBJ_PATH + name + '.pkl'.

    Args:
        obj: any picklable object.
        name: file stem (without extension) appended directly to OBJ_PATH.
    """
    target_path = OBJ_PATH + name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Return the object unpickled from OBJ_PATH + name + '.pkl'.

    Args:
        name: file stem (without extension) appended directly to OBJ_PATH.

    Raises:
        OSError: if the file cannot be opened (propagated from open()).
    """
    source_path = OBJ_PATH + name + '.pkl'
    # NOTE(review): pickle.load executes arbitrary code from the file; only point
    # OBJ_PATH at directories whose contents you trust.
    with open(source_path, 'rb') as stream:
        restored = pickle.load(stream)
    return restored
    
def init_cache():
    """Create the pickled training cache (epsilon, time, memory) if it does not exist yet.

    The presence of OBJ_PATH + 'time.pkl' is used as the marker that a cache has
    already been written. The check must look in OBJ_PATH, the directory save_obj()
    writes to: testing the bare name 'time.pkl' against the working directory makes
    the condition (almost) always true, which wipes the saved epsilon, step counter
    and replay memory on every call, including the call at the top of
    Task.train_nn().
    """
    if not os.path.isfile(OBJ_PATH + 'time.pkl'):
        # NOTE(review): the seed value 0.4 differs from both INITIAL_EPSILON and the
        # Agent_Setting default (0.2); kept as-is, confirm which one is intended.
        initial_epsilon = 0.4
        save_obj(initial_epsilon, 'epsilon')
        t = 0
        save_obj(t, 'time')
        memo = deque()
        save_obj(memo, "memory")
        
# Bootstrap the pickled training cache as soon as this cell runs; Task.train_nn() calls init_cache() again on entry.
init_cache()

In [16]:
class Game:
    """Selenium wrapper around the browser game served at GAME_URL.

    The page is driven through DOM elements looked up by id: 'forward', 'backward',
    'left', 'right' (movement), 'counter' (score text) and 'retry' (shown after a
    crash and clicked to restart).

    Attributes:
        driver: the Chrome WebDriver session.
        element: the 'retry' element located right after the page is loaded.
    """

    def __init__(self, on_cloud=True):
        """Start Chrome, size the window and load the game page.

        Args:
            on_cloud: accepted for interface compatibility; not consulted here.
        """
        chrome_options = Options()
        # Flags for sandboxed / headless hosts, intentionally left disabled:
        #   chrome_options.add_argument('--no-sandbox')
        #   chrome_options.add_argument('--headless')
        #   chrome_options.add_argument('--disable-dev-shm-usage')
        # `options=` replaces the deprecated `chrome_options=` keyword.
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.set_window_size(WINDOW_SIZE_W, WINDOW_SIZE_H)
        self.driver.get(GAME_URL)
        self._paused = False
        self.element = self._by_id('retry')

    def _by_id(self, element_id):
        """Locate one element by DOM id via the locator API (By.ID)."""
        return self.driver.find_element(By.ID, element_id)

    def forward(self):
        """Send the UP key to the 'forward' element."""
        self._by_id("forward").send_keys(Keys.UP)

    def backward(self):
        """Send the DOWN key to the 'backward' element."""
        self._by_id("backward").send_keys(Keys.DOWN)

    def left(self):
        """Send the LEFT key to the 'left' element."""
        self._by_id('left').send_keys(Keys.LEFT)

    def right(self):
        """Send the RIGHT key to the 'right' element."""
        self._by_id('right').send_keys(Keys.RIGHT)

    def get_score(self):
        """Return the text of the 'counter' element parsed as an int.

        Raises:
            ValueError: if the counter text is not an integer literal.
        """
        text_score = self._by_id('counter').text
        return int(text_score)

    def get_crashed(self):
        """Return True when the 'retry' element is currently displayed."""
        return self._by_id('retry').is_displayed()

    def restart_game(self):
        """Wait up to 5 seconds for 'retry' to become clickable, then click it."""
        WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.ID, "retry"))).click()

    def end(self):
        """Shut the browser session down.

        quit() is used instead of close(): close() only closes the current window
        and leaves the driver process running.
        """
        self.driver.quit()

    def pause_or_resume(self):
        """Send ENTER to the page and flip the locally tracked paused flag."""
        ActionChains(self.driver).send_keys(Keys.ENTER).perform()
        self._paused = not self._paused

    def pause(self):
        """Toggle only if the game is not already tracked as paused."""
        if not self._paused:
            self.pause_or_resume()

    def resume(self):
        """Toggle only if the game is currently tracked as paused."""
        if self._paused:
            self.pause_or_resume()

    def get_paused(self):
        """Return the locally tracked paused flag (not read back from the page)."""
        return self._paused

    def end_game(self):
        """Alias of end(), kept for existing callers."""
        self.end()

In [17]:
class Agent_Setting:
    """Plain container for one agent's reward, preprocessing and training settings.

    Args:
        agent_id: identifier stored as-is.
        score_dep: when False, Game_State.get_reward returns move_list[action]
            instead of a score-based reward.
        delta_score: when True, the score-based reward uses the score change
            (current score minus the score before the action).
        move_list: per-action rewards used when score_dep is False.
        reward_weights: per-action multipliers applied to the score-based reward.
        dead_punishment: reward returned when the agent has crashed.
        resize_w, resize_h: target size passed to cv.resize in processing_img.
        canny_th1, canny_th2: thresholds passed to cv.Canny in processing_img.
        learning_rate: Adam learning rate used when Task.train_nn recompiles.
        initial_epsilon, final_epsilon: end points used to compute the per-step
            epsilon decrement.
        observe: step count below which actions are chosen at random and no
            training happens.
        explore: number of steps the epsilon decrement is spread over.
        replay_memory: presumably the replay-buffer capacity - confirm against
            the training loop.
        batch: minibatch size sampled from the replay memory.
        gamma: discount factor applied to the next state's max Q value.
    """

    def __init__(self,
                 agent_id,
                 score_dep=True,
                 delta_score=True,
                 move_list=[0.1, 0.1, 0.1, 0.1],
                 reward_weights=[1, 1, 1, 1],
                 dead_punishment=-10,
                 resize_w=80,
                 resize_h=80,
                 canny_th1=100,
                 canny_th2=200,
                 learning_rate=1e-4,
                 initial_epsilon=0.2,
                 final_epsilon=0.001,
                 observe=10000,
                 explore=10000,
                 replay_memory=5000,
                 batch=32,
                 gamma=0.99):
        self.agent_id = agent_id
        self.score_dep = score_dep
        self.delta_score = delta_score
        # The list defaults above are created once and shared by every call, so
        # store copies: otherwise editing one instance's list would silently change
        # the defaults seen by all other (and future) instances.
        self.move_list = list(move_list)
        self.reward_weights = list(reward_weights)
        self.dead_punishment = dead_punishment
        self.resize_w = resize_w
        self.resize_h = resize_h
        self.canny_th1 = canny_th1
        self.canny_th2 = canny_th2
        self.learning_rate = learning_rate
        self.initial_epsilon = initial_epsilon
        self.final_epsilon = final_epsilon
        self.observe = observe
        self.explore = explore
        self.replay_memory = replay_memory
        self.batch = batch
        self.gamma = gamma

In [18]:
class Game_State:
    """Couples a Game with reward settings: applies actions, grabs frames, computes rewards.

    Attributes:
        game: object exposing the Game interface (movement, score, crash, restart, driver).
        reward_setting: object exposing the Agent_Setting fields read below.
    """

    def __init__(self, game, reward_setting):
        self.game = game
        self.reward_setting = reward_setting

    def processing_img(self):
        """Return the current frame as a resized Canny edge map.

        The frame is read as base64 text from the page element with id 'imgURL',
        decoded with PIL, converted to grayscale, cropped to rows 200:400 and
        columns 650:850, resized to (resize_w, resize_h) and passed through
        cv.Canny with the configured thresholds.
        """
        img_64 = self.game.driver.find_element_by_id('imgURL').text
        screen = np.array(Image.open(BytesIO(base64.b64decode(img_64))))
        # assumes the decoded screenshot has 4 channels (RGBA) - TODO confirm
        img = cv.cvtColor(screen, cv.COLOR_RGBA2GRAY)
        img = img[200:400, 650:850]
        img = cv.resize(img, (self.reward_setting.resize_w, self.reward_setting.resize_h))
        img = cv.Canny(img,
                       threshold1=self.reward_setting.canny_th1,
                       threshold2=self.reward_setting.canny_th2)

        return img

    def get_reward(self, action, is_dead, old_score, new_score=None):
        """Compute the reward for `action`.

        Args:
            action: action index, or -1 for "no action" (reward 0).
            is_dead: whether the agent crashed; yields dead_punishment.
            old_score: score read before the action was applied.
            new_score: score read after the action. When omitted, the live score
                is fetched from the game (the original behaviour).

        Returns:
            0 for action -1; dead_punishment when dead; move_list[action] when
            score_dep is False; otherwise the score (or score delta when
            delta_score is set) multiplied by reward_weights[action].
        """
        if action == -1:
            return 0

        if is_dead:
            return self.reward_setting.dead_punishment

        if not self.reward_setting.score_dep:
            return self.reward_setting.move_list[action]

        score = self.game.get_score() if new_score is None else new_score

        if self.reward_setting.delta_score:
            score -= old_score

        return score * self.reward_setting.reward_weights[action]

    def get_state(self, action):
        """Apply `action`, then return (image, reward, is_dead, new_score).

        Actions 0/1/2/3 map to forward/left/right/backward; any other value
        (e.g. -1) sends no input. If the agent crashed the game is restarted
        before returning.
        """
        old_score = self.game.get_score()

        if action == 0:
            self.game.forward()
        elif action == 1:
            self.game.left()
        elif action == 2:
            self.game.right()
        elif action == 3:
            self.game.backward()

        image = self.processing_img()
        is_dead = self.game.get_crashed()
        new_score = self.game.get_score()
        # Reuse the score just read: a second live read could return a different
        # value, making the reward disagree with the new_score returned below
        # (and it costs an extra WebDriver round-trip per step).
        reward = self.get_reward(action, is_dead, old_score, new_score)

        if is_dead:
            self.game.restart_game()

        return image, reward, is_dead, new_score

In [19]:
def build_model():
    """Build and compile the convolutional Q-network.

    Architecture: three Conv2D -> MaxPooling2D -> ReLU stages (32 filters 8x8
    stride 4, 64 filters 4x4 stride 2, 64 filters 3x3 stride 1, all 'same'
    padding), then Flatten, Dense(512) + ReLU and a linear Dense(ACTIONS) output.
    The model is compiled with MSE loss and Adam(LEARNING_RATE).

    Side effect: when MODEL_FILE_PATH does not exist yet, the freshly initialised
    weights are written there.

    Returns:
        The compiled keras Sequential model.
    """
    model = Sequential()

    # Keras channels-last inputs are (rows, cols, channels); the stacked frames built
    # in Task.train_nn follow that layout. (IMG_ROWS == IMG_COLS today, so the
    # previous cols-first ordering happened to work.)
    model.add(Conv2D(32, (8, 8), padding='same', strides=(4, 4), input_shape=(IMG_ROWS, IMG_COLS, IMG_CHANNELS)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))

    model.add(Conv2D(64, (4, 4), strides=(2, 2), padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))

    model.add(Conv2D(64, (3, 3), strides=(1, 1), padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Activation('relu'))

    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))

    # One linear output per action (Q value estimates).
    model.add(Dense(ACTIONS))

    # `learning_rate` is the keyword Task.train_nn already uses; `lr` is its legacy alias.
    adam = Adam(learning_rate=LEARNING_RATE)
    model.compile(loss='mse', optimizer=adam)

    if not os.path.isfile(MODEL_FILE_PATH):
        model.save_weights(MODEL_FILE_PATH)
    return model

In [23]:
class Task:
    """Owns the game session, the Q-network and the metric logs, and runs training.

    Attributes:
        loss_df: one-column ('loss') frame, reloaded from LOSS_FILE_PATH when it exists.
        q_values_df: one-column ('qvalues') frame, reloaded from Q_VALUES_FILE_PATH
            when it exists.
        game_state: Game_State built from a new Game() and Agent_Setting(1).
        model: network returned by build_model().
    """

    def __init__(self):
        self.loss_df = pd.read_csv(LOSS_FILE_PATH) if os.path.isfile(LOSS_FILE_PATH) else pd.DataFrame(columns=['loss'])
        self.q_values_df = pd.read_csv(Q_VALUES_FILE_PATH) if os.path.isfile(Q_VALUES_FILE_PATH) else pd.DataFrame(
            columns=['qvalues'])
        self.game_state = Game_State(Game(), Agent_Setting(1))
        self.model = build_model()

    def _train_on_minibatch(self, memo):
        """Run one Q-learning update on a random minibatch drawn from `memo`.

        Returns:
            (loss, q_next): the value returned by train_on_batch and the network
            output for the last sampled next-state (logged as Q_MAX by the caller).
        """
        setting = self.game_state.reward_setting
        minibatch = random.sample(memo, setting.batch)
        inputs = np.zeros((setting.batch,) + minibatch[0][0].shape[1:])
        targets = np.zeros((setting.batch, ACTIONS))
        q_next = 0

        # Dedicated loop names: the sampled transitions must never overwrite the
        # live state/action variables of the training loop.
        for i, (s_j, a_j, r_j, s_j1, dead_j) in enumerate(minibatch):
            inputs[i:i + 1] = s_j
            targets[i] = self.model.predict(s_j)[0]
            q_next = self.model.predict(s_j1)

            if dead_j:
                # Terminal transition: no bootstrapped future value.
                targets[i, a_j] = r_j
            else:
                targets[i, a_j] = r_j + setting.gamma * np.max(q_next)

        loss = self.model.train_on_batch(inputs, targets)
        return loss, q_next

    def train_nn(self):
        """Run the training loop; it never returns (interrupt the kernel to stop).

        Every step: choose an action (random while t < observe or with probability
        epsilon, greedy otherwise), apply it, store the transition in the replay
        memory, and - once t > observe and enough transitions exist - train on a
        minibatch. Every 200 steps the weights, replay memory, step counter,
        epsilon and metric CSVs are written to disk.
        """
        init_cache()
        setting = self.game_state.reward_setting
        game = self.game_state.game

        memo = load_obj('memory')
        do_nothing = -1
        t = load_obj('time')
        epsilon = load_obj('epsilon')

        if os.path.isfile(MODEL_FILE_PATH):
            self.model.load_weights(MODEL_FILE_PATH)
            adam = Adam(learning_rate=setting.learning_rate)
            self.model.compile(loss='mse', optimizer=adam)

        # Initial state: the first frame repeated on all four channels.
        image_t, _, is_dead, game_score = self.game_state.get_state(do_nothing)
        state_t = np.stack((image_t, image_t, image_t, image_t), axis=2)
        state_t = state_t.reshape(1, state_t.shape[0], state_t.shape[1], state_t.shape[2])

        while True:
            loss = 0
            Q_sa = 0

            game.pause()

            # Epsilon-greedy selection. Epsilon was previously annealed and saved but
            # never consulted, so the policy was purely greedy after the observe phase.
            if t < setting.observe or random.random() <= epsilon:
                action_t = random.randrange(ACTIONS)
            else:
                action_t = int(np.argmax(self.model.predict(state_t)))

            game.resume()

            if epsilon > setting.final_epsilon and t > setting.observe:
                epsilon -= (setting.initial_epsilon - setting.final_epsilon) / setting.explore

            image_t1, reward_t, is_dead, game_score = self.game_state.get_state(action_t)
            image_t1 = image_t1.reshape(1, image_t1.shape[0], image_t1.shape[1], 1)
            # Newest frame first, followed by the three most recent previous frames.
            state_t1 = np.append(image_t1, state_t[:, :, :, :3], axis=3)

            # Record the transition. Without this the replay memory stays empty and
            # random.sample() below fails as soon as training starts.
            memo.append((state_t, action_t, reward_t, state_t1, is_dead))
            while len(memo) > setting.replay_memory:
                memo.popleft()

            game.pause()

            # The length guard covers resuming from a cache whose step counter is
            # already past `observe` while its stored memory is still (nearly) empty.
            if t > setting.observe and len(memo) >= setting.batch:
                loss, Q_sa = self._train_on_minibatch(memo)
                self.loss_df.loc[len(self.loss_df)] = loss
                self.q_values_df.loc[len(self.q_values_df)] = np.max(Q_sa)

            state_t = state_t1
            t += 1

            if t % 200 == 0:
                print('Now we save model')
                self.model.save_weights(MODEL_FILE_PATH, overwrite=True)
                save_obj(memo, 'memory')
                save_obj(t, 'time')
                save_obj(epsilon, 'epsilon')
                self.loss_df.to_csv(LOSS_FILE_PATH, index=False)
                self.q_values_df.to_csv(Q_VALUES_FILE_PATH, index=False)
                print('Finished Saving')

            state = 'train'
            if t <= setting.observe:
                state = 'observe'
            elif setting.observe < t <= setting.observe + setting.explore:
                state = 'explore'

            print("TIMESTEP", t,
                  "/ STATE", state,
                  "/ EPSILON", epsilon,
                  "/ ACTION", action_t,
                  "/ REWARD", reward_t,
                  "/ SCORE", game_score,
                  "/ Q_MAX ", np.max(Q_sa),
                  "/ Loss ", loss,
                  "/ Is Dead", is_dead)

            game.resume()

In [24]:
# Constructing Task opens a Chrome session (through Game()) and builds the network (through build_model()).
task = Task()

  import sys


In [25]:
# train_nn() loops with `while True`, so this cell runs until the kernel is interrupted.
task.train_nn()

TIMESTEP 1 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD 1 / SCORE 1 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 2 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD 2 / SCORE 6 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 3 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD 0 / SCORE 6 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 4 / STATE observe / EPSILON 0.4 / ACTION 3 / REWARD -1 / SCORE 5 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 5 / STATE observe / EPSILON 0.4 / ACTION 1 / REWARD -10 / SCORE 5 / Q_MAX  0 / Loss  0 / Is Dead True
TIMESTEP 6 / STATE observe / EPSILON 0.4 / ACTION 2 / REWARD 0 / SCORE 0 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 7 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD 1 / SCORE 1 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 8 / STATE observe / EPSILON 0.4 / ACTION 2 / REWARD 0 / SCORE 3 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 9 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD -10 / SCORE 3 / Q_MAX  0 / Loss  0 / Is Dead True
TIMESTE

TIMESTEP 75 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD -10 / SCORE 6 / Q_MAX  0 / Loss  0 / Is Dead True
TIMESTEP 76 / STATE observe / EPSILON 0.4 / ACTION 1 / REWARD 0 / SCORE 0 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 77 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD 1 / SCORE 1 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 78 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD -10 / SCORE 4 / Q_MAX  0 / Loss  0 / Is Dead True
TIMESTEP 79 / STATE observe / EPSILON 0.4 / ACTION 1 / REWARD 0 / SCORE 0 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 80 / STATE observe / EPSILON 0.4 / ACTION 2 / REWARD 0 / SCORE 0 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 81 / STATE observe / EPSILON 0.4 / ACTION 0 / REWARD 1 / SCORE 1 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 82 / STATE observe / EPSILON 0.4 / ACTION 2 / REWARD 0 / SCORE 1 / Q_MAX  0 / Loss  0 / Is Dead False
TIMESTEP 83 / STATE observe / EPSILON 0.4 / ACTION 1 / REWARD -10 / SCORE 1 / Q_MAX  0 / Loss  0 / Is Dead Tru

KeyboardInterrupt: 