In [None]:
from ple.games.flappybird import FlappyBird
from ple import PLE

import numpy as np
#from FlappyAgent import FlappyPolicy

import matplotlib.pyplot as plt
#%matplotlib inline

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop, sgd, Adam
from keras.layers.recurrent import LSTM
import numpy as np
import random
import h5py
from IPython.display import clear_output
from collections import deque

In [None]:
# graphics="fixed" (the default) gives a black background with constant bird and
# pipe colors; graphics="fancy" gives the full background with random bird and
# pipe colors.
game = FlappyBird(graphics="fixed")

# force_fps=True is the setting to use while learning. Set it to False only to
# watch the agent act in real time (display purposes, not training).
p = PLE(
    game,
    fps=30,
    frame_skip=1,
    num_steps=1,
    force_fps=True,
    display_screen=True,
)

<div class="alert alert-info">
Declare functions
</div>

In [None]:
def convstate(state):
    """
    Build the 3-feature network input from a raw game-state dictionary.

    Features, in order:
      0. vertical offset ``next_pipe_bottom_y - player_y``
      1. ``next_pipe_dist_to_player``
      2. ``player_vel``

    Each feature is shifted and scaled so that the intervals [-40, 210],
    [-420, 420] and [-10, 10] respectively map onto [-1, 1]; inputs outside
    those intervals simply land outside [-1, 1].

    Returns a float array of shape (1, 3).
    """
    raw = np.array(
        [
            state['next_pipe_bottom_y'] - state['player_y'],
            state['next_pipe_dist_to_player'],
            state['player_vel'],
        ],
        dtype=float,
    )

    # Per-feature offset (interval midpoint) and scale (interval half-width)
    offset = np.array([(210 - 40) / 2, (420 - 420) / 2, (10 - 10) / 2], dtype=float)
    scale = np.array([(210 + 40) / 2, (420 + 420) / 2, (10 + 10) / 2], dtype=float)

    normalised = (raw - offset) / scale
    return normalised.reshape((1, 3))

In [None]:
def epsilon_greedy(s):
    """
    Select an action index (0 or 1) for state ``s`` with an epsilon-greedy rule.

    Reads the module-level ``epsilon`` and ``model``. When a uniform draw is
    at most ``epsilon`` the action is random, picking 0 with probability 0.9
    and 1 with probability 0.1. Otherwise the action with the largest value in
    ``model.predict(s)`` is returned.
    """
    explore = np.random.rand() <= epsilon
    if not explore:
        # Greedy branch: best action according to the current network
        return np.argmax(model.predict(s))

    # Exploration branch: random action, biased toward action 0
    return np.random.choice([0, 1], p=[0.9, .1])

In [None]:
class ReplayMemory:
    """
    Fixed-capacity ring buffer of transitions stored as rows of a 2-D array.

    self.memory holds one transition per row: the old state, the action, the
    reward, the new state and whether it is a final status, concatenated in
    an array. Once the buffer is full, new rows overwrite the oldest ones.

    Parameters
    ----------
    size : int
        Maximum number of transitions kept.
    width : int, optional
        Number of values per transition. Defaults to 9, which matches a
        3-value state (3 + 1 + 1 + 3 + 1).
    """
    def __init__(self, size, width=9):
        self.size = size
        self.width = width
        self.index = 0          # row that the next insert will write
        self.currentsize = 0    # number of rows filled so far (at most size)
        self.memory = np.zeros((size, width))

    def insert(self, state):
        """Store one transition, overwriting the oldest row once full."""
        if self.currentsize < self.size:
            self.currentsize += 1
        self.memory[self.index, :] = state[:]
        # Advance the write cursor and wrap around at capacity
        self.index += 1
        self.index = self.index % self.size

    def sample(self, batchSize):
        """
        Return up to ``batchSize`` distinct stored transitions, drawn at random.

        The batch is capped at the number of rows filled so far.
        """
        batchSize = min(self.currentsize, batchSize)
        ind = np.random.choice(self.currentsize, size=batchSize, replace=False)
        return self.memory[ind, :]
    

<div class="alert alert-info">
Declare model
</div>

In [None]:
# Q-network: 3 state features in, two hidden ReLU layers of 100 units each,
# and a linear output with one value per action (2 actions).
# The Dropout(0.5) layers after each hidden activation are left disabled.
model = Sequential([
    Dense(100, kernel_initializer='lecun_uniform', input_shape=(3,)),
    Activation('relu'),
    # Dropout(0.5),
    Dense(100, kernel_initializer='lecun_uniform'),
    Activation('relu'),
    # Dropout(0.5),
    Dense(2, kernel_initializer='lecun_uniform'),
    Activation('linear'),
])

# Mean-squared-error loss with Adam; the RMSprop alternative is kept for reference:
# model.compile(loss='mse', optimizer="rmsprop")
adam = Adam(lr=1e-2)
model.compile(optimizer=adam, loss='mse')

model.summary()

<div class="alert alert-info">
Hyperparameters
</div>

In [None]:
# Number of games (episodes) played during training
nb_games = 1000
# Discount factor applied to the next state's best Q-value
gamma = 0.99
# Exploration probability for the epsilon-greedy policy
epsilon = 0.1
# Number of transitions sampled from the replay memory per update
batchSize = 32

# Replay buffers, each holding up to 10000 transitions
replay = ReplayMemory(size=10000)
replay_pos = ReplayMemory(size=10000)

<div class="alert alert-info">
Train network
</div>

In [None]:
# Per-game cumulated (unclipped) reward, used only for progress reporting
cumulated = np.zeros((nb_games))

# Start the game
p.init()
r = 0
step = 0  # environment steps taken so far, across all games

for i in range(nb_games):
    p.reset_game()

    # Progress report every 100 games: game index, epsilon, mean recent reward
    if i % 100 == 0:
        recent = cumulated[i-50:i]
        # At i == 0 no game has finished and the slice is empty; report nan
        # directly instead of triggering np.mean's empty-slice RuntimeWarning.
        recent_mean = np.mean(recent) if recent.size else float('nan')
        print(i, epsilon, recent_mean)

        # Decrease exploration ratio
        epsilon *= 0.98

    # 0) Retrieve initial state
    s = convstate(game.getGameState())

    while not p.game_over():

        # 1) Choose action epsilon-greedily
        a = epsilon_greedy(s)
        # 119 == ord('w'), presumably the flap key in PLE's FlappyBird action
        # set (confirm with p.getActionSet()); None sends no key press.
        action = 119 if a else None

        # Execute the action and accumulate the raw reward for reporting
        r = p.act(action)
        cumulated[i] += r

        # Clip the reward to [-1, 1]; the clipped value is what gets learned from
        clipped_r = max(min(r, 1), -1)

        ss = convstate(game.getGameState())

        # Transition row layout: [s (3), a, reward, ss (3), game_over flag]
        replay.insert(np.concatenate((s, [[a]], [[clipped_r]], ss, [[p.game_over()]]), axis=1))

        # 2) Update Q once enough steps have been taken
        if step > 1000:  # and step % 100 == 99:
            batch = replay.sample(batchSize)
            states = batch[:, 0:3]
            actions = batch[:, 3].astype(int)
            rewards = batch[:, 4]
            next_states = batch[:, 5:8]
            ending = batch[:, 8]

            # Start from the network's current predictions so that only the
            # Q-value of the action actually taken in each sampled transition
            # is moved toward its target.
            targets = model.predict(states)
            next_q_max = np.max(model.predict(next_states), axis=1)
            rows = np.arange(len(batch))
            # Terminal transitions (ending == 1) do not bootstrap from the next state
            targets[rows, actions] = rewards + gamma * next_q_max * (1 - ending)

            model.fit(states, targets, batch_size=1, epochs=1, verbose=0)

        # 3) Redeclare state
        s = ss

        step += 1
        