In [1]:
#import the libraries
import os
import timeit
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
from keras import optimizers
from keras.layers import Dense, Conv2D, Flatten
from keras.models import Sequential
from keras.models import load_model
from ple import PLE
from ple.games.flappybird import FlappyBird
from skimage.color import rgb2gray
from skimage.transform import resize

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


couldn't import doomish
Couldn't import doom


In [7]:
#defining reward model
def clip_reward(rew):
    """Translate the raw game reward into the shaped training reward.

    Returns 1 for any positive reward, -3 for any negative reward and 0
    otherwise.
    """
    if rew > 0:
        return 1
    if rew < 0:
        return -3
    return 0
    
def greedy_action(convnet, x):
    """Return the index of the largest value the network predicts for state x.

    x is wrapped in a batch of one before being passed to convnet.predict.
    """
    batch = np.array([x])  # add a leading batch axis of size 1
    q_values = convnet.predict(batch)
    return np.argmax(q_values)

def process_screen(x):
    """Crop, grayscale and downsample a raw RGB screen into an 80x80 frame.

    The screen is cut to indices 50..269 on its first axis and 0..319 on its
    second axis, converted with rgb2gray, resized to (80, 80) and multiplied
    by 256.
    """
    cropped = x[50:270, :320]
    gray = rgb2gray(cropped)
    small = resize(gray, (80, 80))
    # NOTE(review): the replay memory in this notebook stores frames as uint8;
    # if rgb2gray yields values up to 1.0, 256*1.0 does not fit in uint8.
    # Confirm whether a scale of 255 was intended before changing it, since the
    # saved network may have been trained with this scaling.
    return 256 * small

def epsilon(step):
    """Exploration rate as a function of the global step counter.

    Decreases linearly from 1.0 by 9e-7 per step while step < 1e6, and is the
    constant 0.0001 from step 1e6 onwards.
    """
    still_annealing = step < 1e6
    return 1. - step*9e-7 if still_annealing else .0001


In [8]:
# A class for the replay memory
class MemoryBuffer:
    """Fixed-size circular replay memory of single-frame transitions.

    Each slot holds a starting screen, the action taken, the reward received,
    the resulting screen and a flag telling whether the resulting state is
    terminal. Four-frame stacks are rebuilt on demand from consecutive slots
    rather than being stored.

    Screens and actions are stored as uint8 and rewards as int8, so values are
    cast to those dtypes on assignment.
    """

    def __init__(self, length, screen_shape, action_shape):
        """Allocate storage for `length` transitions.

        length: number of slots in the circular buffer.
        screen_shape: tuple, shape of one screen (e.g. (80, 80)).
        action_shape: tuple, shape of one stored action (e.g. (1,)).
        """
        self.length = length
        self.screen_shape = screen_shape
        self.action_shape = action_shape
        shape = (length,) + screen_shape
        self.screens_x = np.zeros(shape, dtype=np.uint8) # starting states
        self.screens_y = np.zeros(shape, dtype=np.uint8) # resulting states
        shape = (length,) + action_shape
        self.actions = np.zeros(shape, dtype=np.uint8) # actions
        self.rewards = np.zeros((length,1), dtype=np.int8) # rewards
        # Builtin bool: the np.bool alias was removed from NumPy.
        self.terminals = np.zeros((length,1), dtype=bool) # true if resulting state is terminal
        # Slot 0's predecessor is slot -1; marking it terminal keeps frame
        # stacking from walking into unwritten slots before the buffer wraps.
        self.terminals[-1] = True
        self.index = 0 # points one position past the last inserted element
        self.size = 0 # current size of the buffer

    def append(self, screenx, a, r, screeny, d):
        """Store one transition at the write position, overwriting the oldest
        slot once the buffer is full.

        screenx: starting screen; a: action; r: reward;
        screeny: resulting screen; d: True if the resulting state is terminal.
        """
        self.screens_x[self.index] = screenx
        self.actions[self.index] = a
        self.rewards[self.index] = r
        self.screens_y[self.index] = screeny
        self.terminals[self.index] = d
        self.index = (self.index+1) % self.length
        self.size = min(self.size+1, self.length)

    def _stacked_frames(self, screens, index):
        """Stack the frame at `index` with up to three preceding frames.

        Walks backwards one slot at a time and stops (repeating the earliest
        frame reached) when the previous slot ended an episode, or when the
        current slot is the write position. Once the buffer has wrapped, the
        write position holds the oldest transition and the slot before it
        holds the newest one, which is not its predecessor in time.

        Returns an array of shape screen_shape + (4,), oldest frame first.
        """
        frames = deque(maxlen=4)
        pos = index % self.length
        for _ in range(4):
            frames.appendleft(screens[pos])
            prev = (pos-1) % self.length
            if pos != self.index and not self.terminals[prev, 0]:
                pos = prev
        return np.stack(frames, axis=-1)

    def stacked_frames_x(self, index):
        """Return the 4-frame stack of starting screens ending at `index`."""
        return self._stacked_frames(self.screens_x, index)

    def stacked_frames_y(self, index):
        """Return the 4-frame stack of resulting screens ending at `index`."""
        return self._stacked_frames(self.screens_y, index)

    def minibatch(self, size):
        """Sample `size` distinct stored transitions uniformly at random.

        Returns (x, actions, rewards, y, terminals) where x and y are float
        arrays of shape (size,) + screen_shape + (4,) holding the stacked
        starting and resulting frames.

        Raises ValueError (from np.random.choice) if `size` exceeds the number
        of stored transitions.
        """
        indices = np.random.choice(self.size, size=size, replace=False)
        x = np.zeros((size,)+self.screen_shape+(4,))
        y = np.zeros((size,)+self.screen_shape+(4,))
        for i in range(size):
            x[i] = self.stacked_frames_x(indices[i])
            y[i] = self.stacked_frames_y(indices[i])
        return x, self.actions[indices], self.rewards[indices], y, self.terminals[indices]

In [9]:
# Build or load the Q-network and its target copy.
MODEL_PATH = 'dqn_3.h5'


def build_dqn():
    """Create the untrained convolutional Q-network.

    Input is a stack of four 80x80 frames; output is one linear unit per
    action (2 actions).
    """
    model = Sequential()
    #1st layer
    model.add(Conv2D(filters=16, kernel_size=(8,8), strides=4, activation="relu", input_shape=(80, 80,4)))
    #2nd layer
    model.add(Conv2D(filters=32, kernel_size=(4,4), strides=2, activation="relu"))
    model.add(Flatten())
    #3rd layer
    model.add(Dense(units=256, activation="relu"))
    #output layer
    model.add(Dense(units=2, activation="linear"))
    return model


# Resume from the saved network when one exists; otherwise start from scratch
# so the notebook also runs on a fresh checkout.
if os.path.exists(MODEL_PATH):
    dqn = load_model(MODEL_PATH)
else:
    dqn = build_dqn()

adam = optimizers.Adam(lr = 1e-4)
dqn.compile(loss = "mean_squared_error", optimizer = adam)
dqn.save(MODEL_PATH)
# The target network starts as an exact copy of the online network.
dqn_target = load_model(MODEL_PATH)

In [18]:
# Simulation state and hyper-parameters.
# Run this cell only once per session: re-running it resets the step counter
# and throws away the replay memory collected so far.
gamma = 0.99            # discount factor used in the Q-learning target
mini_batch_size = 32    # transitions sampled per training update
w_transfer = 5000       # steps between saving the network and refreshing the target network
step = 0                # global step counter, carried across runs of the play/train cell
replay_memory_size = 10000
replay_memory = MemoryBuffer(
    length=replay_memory_size,
    screen_shape=(80, 80),
    action_shape=(1,),
)

In [19]:
# Play `nb_games` games, training the network after every step.
# Relies on state defined in earlier cells: dqn, dqn_target, replay_memory,
# step, w_transfer, mini_batch_size and gamma. `step` keeps increasing across
# runs of this cell.
test = 0 # multiplies epsilon: 0 -> only greedy actions are taken
nb_games = 5 # number of games to be played

# Flappy configuration
game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen = True)
p.init()
# possible actions for the player
actions = p.getActionSet()

# per-game scores: raw game reward and shaped (clipped) reward
cumulated = np.zeros((nb_games))
cumulated_art = np.zeros((nb_games))

start = timeit.default_timer()
# main loop
for i in range(nb_games):
    # show the number of games played, once per 100 games
    # (not once per frame of every 100th game)
    if (i+1) % 100 == 0:
        print('Jeu',i+1)

    # start a new game with a stack made of four copies of the first frame
    p.reset_game()
    screen_x = process_screen(p.getScreenRGB())
    stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
    x = np.stack(stacked_x, axis=-1)

    while not p.game_over(): # while alive
        step = step+1

        # action selection: random with probability test*epsilon, else greedy
        if np.random.rand() < test*epsilon(step):
            a = np.random.randint(2)
        else:
            a = greedy_action(dqn, x)
        # game reward
        reward = p.act(actions[a])
        # shaped reward used for training
        r = clip_reward(reward)
        # next screen
        screen_y = process_screen(p.getScreenRGB())
        d = p.game_over() # True if this step ended the game
        replay_memory.append(screen_x, a, r, screen_y, d)

        # train on one mini-batch; the size check protects against sampling
        # more transitions than the replay memory currently holds
        if step > mini_batch_size and replay_memory.size >= mini_batch_size:
            X,A,R,Y,D = replay_memory.minibatch(mini_batch_size)
            QY = dqn_target.predict(Y)
            QYmax = QY.max(1).reshape((mini_batch_size,1))
            # Q-learning target: no bootstrapping from terminal states
            update = R + gamma * (1-D) * QYmax
            QX = dqn.predict(X)
            # only the Q-value of the action actually taken is changed
            QX[np.arange(mini_batch_size), A.ravel()] = update.ravel()
            dqn.train_on_batch(x=X, y=QX)

        # checkpoint the network and transfer its weights to the target network
        if step > 1 and step % w_transfer == 0:
            print('saving')
            dqn.save('dqn_3.h5')
            print("Saving done")
            # copy the weights in memory instead of re-loading the file,
            # which would rebuild the whole model on every transfer
            dqn_target.set_weights(dqn.get_weights())

        # score
        cumulated[i] = cumulated[i] + reward
        cumulated_art[i] =  cumulated_art[i] + r
        # keep going: the resulting screen becomes the next starting screen
        screen_x = screen_y
        stacked_x.append(screen_x)
        x = np.stack(stacked_x, axis=-1)

# total time played
stop = timeit.default_timer()
temps =stop - start

print ('temps [s]',(temps))

print('saving')
dqn.save('dqn_3.h5')
print('Saving done')
print('fini ^^')

KeyboardInterrupt: 