In [None]:
from ple.games.flappybird import FlappyBird
from ple import PLE

import numpy as np
#from FlappyAgent import FlappyPolicy

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# PLE environment setup.
# graphics="fixed" (the default) gives a black background with constant bird
# and pipe colors; graphics="fancy" gives the full background with a random
# bird color and random pipe color.
game = FlappyBird(graphics="fixed")

# force_fps=True is the setting to use for learning. If you want to see your
# agent act in real time, set force_fps to False -- but only for display
# purposes, not for learning.
p = PLE(
    game,
    fps=30,
    frame_skip=1,
    num_steps=1,
    force_fps=True,
    display_screen=True,
)

<div class="alert alert-info">
Declare functions
</div>

In [None]:
def convstate(state):
    """
    Discretise the raw game state into three integer features.

    Reads ``next_pipe_bottom_y``, ``player_y``, ``next_pipe_dist_to_player``
    and ``player_vel`` from ``state`` and returns a 3-tuple of ints:

    * bottom of the next pipe minus the player's y, rounded down to a multiple of 10
    * distance to the next pipe, rounded down to a multiple of 20
    * player velocity, rounded down to a multiple of 2
    """
    # (raw value, bucket width) pairs, in the order they are returned.
    features = (
        (state['next_pipe_bottom_y'] - state['player_y'], 10),
        (state['next_pipe_dist_to_player'], 20),
        (state['player_vel'], 2),
    )

    # value - value % width floors to the bucket's lower edge (also for
    # negative values, since Python's % takes the sign of the divisor).
    return tuple(int(value - value % width) for value, width in features)

In [None]:
def epsilon_greedy(key):
    """
    Pick an action (0 or 1) for the state identified by ``key``.

    Uses the module-level ``epsilon`` and ``Q``: when a uniform draw is at most
    ``epsilon`` a random action is returned, otherwise the index of the largest
    Q-value stored for ``key`` (0 when ``key`` is not in ``Q`` or on ties).
    """
    explore = np.random.rand() <= epsilon

    if not explore:
        # Greedy branch; an unknown key falls back to [0], whose argmax is 0.
        return np.argmax(Q.get(key, [0]))

    return np.random.choice([0, 1])

In [None]:
def update_trace(key, action):
    """
    Decay every eligibility trace, then mark ``(key, action)`` as just visited.

    Rebinds the module-level ``epsTrace`` to a new dict in which every stored
    value has been multiplied by ``gamma`` and then ``lamb``, adds a ``[0, 0]``
    entry for ``key`` if it has none, and sets that entry's ``action`` slot to 1.
    """
    global epsTrace

    decayed = {}
    for state_key, trace in epsTrace.items():
        decayed[state_key] = [e*gamma*lamb for e in trace]
    epsTrace = decayed

    if key not in epsTrace:
        epsTrace[key] = [0, 0]

    # Remember the current state/action pair
    epsTrace[key][action] = 1

In [None]:
def propagate(delta):
    """
    Apply the TD error ``delta`` to every state that has an eligibility trace.

    For each key in the module-level ``epsTrace`` and each of the two actions,
    ``Q[key][action]`` is increased by ``alpha * trace * delta``. Every traced
    key must already be present in ``Q``.
    """
    for state_key, trace in epsTrace.items():
        q_values = Q[state_key]
        for act in (0, 1):
            q_values[act] = q_values[act] + alpha*trace[act]*delta

<div class="alert alert-info">
Reinitialise variables (Q-table and metaparameters)
</div>

In [None]:
# Q-table: maps a state key to a two-element list of action values.
# Starts empty; entries are added by the training cell as states are visited.
Q = {}

In [None]:
# Metaparameters

# Number of games played by the training loop
nb_games = 18000

# Step size used when propagating the TD error (0.7 noted as an alternative)
alpha = 0.1
# Probability of taking a random action (0.4 noted as an alternative)
epsilon = 0.1

# Discount factor; also scales the trace decay together with lamb
gamma = 0.9
# Second trace-decay factor (traces are multiplied by gamma*lamb each step)
lamb = 1

<div class="alert alert-info">
Run training
</div>

In [None]:
# Training loop (Sarsa with eligibility traces).
#
# Uses names defined in earlier cells: p, game, Q, nb_games, alpha, epsilon,
# gamma, convstate, epsilon_greedy, update_trace and propagate.
# NOTE: this cell mutates Q, epsilon and alpha in place, so running it again
# continues from the decayed values rather than from the metaparameter cell.

# Kept only as an alias: this used to be a second hard-coded 18000 that the
# loop never read and that could silently drift away from nb_games.
nbgames = nb_games

# Value passed to p.act() when the chosen action is 1; None is passed for
# action 0. NOTE(review): presumably the key code PLE maps to "flap" --
# confirm against p.getActionSet().
FLAP_ACTION = 119

# Number of most recent games averaged in the progress print
LOG_WINDOW = 50


def _current_state_key():
    """Return the Q-table key ('s1|s2|s3') for the game's current state."""
    return '|'.join(str(v) for v in convstate(game.getGameState()))


# Some control variables
cumulated = np.zeros((nb_games))

# Start the game
p.init()
reward = 0

for i in range(nb_games):
    p.reset_game()
    epsTrace = dict()  # traces are per-episode

    # Control print
    if i%100 == 0:
        # Clamp the window start and guard the empty case: at i == 0 the
        # original slice [-50:0] was empty and np.mean returned nan with a
        # RuntimeWarning.
        recent = cumulated[max(0, i-LOG_WINDOW):i]
        recent_mean = np.mean(recent) if recent.size else 0.0
        print(i, epsilon, alpha, recent_mean)

        # Decrease exploration ratio (floored at 0.01) and the step size
        epsilon = max(epsilon * 0.95, 0.01)
        alpha *= 0.995

    # Periodic checkpoint of the Q-table and the per-game scores
    if i%1000 == 999:
        np.save('Qsarsa_more_%d' % i ,Q)
        np.save('cumulated_sarsa_more_%d' % i, cumulated)

    # 0) Retrieve initial state
    current_key = _current_state_key()
    Q.setdefault(current_key, [0,0])

    # Choose the first action epsilon-greedily
    a = epsilon_greedy(current_key)

    while(not p.game_over()):

        # Translate action
        action = FLAP_ACTION if a==1 else None

        # 1) Execute
        reward = p.act(action)
        cumulated[i] += reward

        next_key = _current_state_key()

        # 2) Choose new action epsilon-greedily
        aa = epsilon_greedy(next_key)

        # 3) Update Q value
        # Update trace
        update_trace(current_key, a)

        Q.setdefault(next_key, [0,0])

        # The value of a terminal state is zero, so do not bootstrap from
        # Q[next_key] once this step has ended the game. The discretised key
        # of the final state can be shared with ordinary states, so its
        # Q-values are generally non-zero and would bias the target.
        if p.game_over():
            target = reward
        else:
            target = reward + gamma*Q[next_key][aa]

        delta = target - Q[current_key][a]
        propagate(delta)

        # Move on to the next state/action pair
        a = aa
        current_key = next_key


<div class="alert alert-info">
Postprocess
</div>

In [None]:
# Mean evolution
#
# Plots, for two window sizes, the mean score of the `window` games preceding
# each game. Uses `cumulated` from the training cell.


def _trailing_means(scores, window):
    """
    Return, for every index ``end`` from ``window`` to ``len(scores) - 1``,
    the mean of ``scores[end-window:end]`` (the ``window`` entries before it).

    The result is empty when ``scores`` has no more than ``window`` entries.
    """
    return [np.mean(scores[end-window:end]) for end in range(window, len(scores))]


fig, ax = plt.subplots()

for window in (50, 500):
    vallist = _trailing_means(cumulated, window)
    # Plot against the real game index. Plotting the bare list would start
    # both curves at x=0 although the first value of each belongs to game
    # number `window`, shifting the two curves against each other.
    ax.plot(range(window, len(cumulated)), vallist,
            label='mean of previous %d games' % window)

ax.set_xlabel('game')
ax.set_ylabel('mean cumulated reward')
ax.legend();