In [17]:
from ple.games.flappybird import FlappyBird
from ple import PLE

import numpy as np
#from FlappyAgent import FlappyPolicy

import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

<div class="alert alert-info">
Declare functions
</div>

In [19]:
def convstate(state):
    """
    Calculate new state variables from game state
    """
    s1 = state['next_pipe_bottom_y'] - state['player_y']
    s2 = state['next_pipe_dist_to_player']
    s3 = state['player_vel']
    
    return int(s1-s1%10), int(s2-s2%20), int(s3-s3%2)

In [20]:
def epsilon_greedy(key):
    if(np.random.rand()<=epsilon): # random action
        return np.random.choice([0,1], p =[0.8,0.2])
    
    else: 
        return np.argmax(Q.get(key, [0]))

<div class="alert alert-info">
Reinit variables
</div>

In [21]:
Q = dict()

In [22]:
# Metaparameters
nb_games = 20000
alpha = 0.1 #0.7
epsilon = 0.1 #0.4
gamma = 0.9

<div class="alert alert-info">
Run training
</div>

In [23]:
# Some control variables
cumulated = np.zeros((nb_games))

# Start the game
p.init()
reward = 0

for i in range(nb_games):
    p.reset_game()
    
    # Control print
    if i%100 == 0:
        print(i, epsilon, alpha, np.mean(cumulated[i-50:i]))
        
        # Decrease exploration ratio
        epsilon = max(epsilon*0.8, 0.01)
        alpha *= 0.995
        
    if i%1000 == 999:
        np.save('Q_%d' % i ,Q)
        np.save('cumulated_%d' % i, cumulated)
    
    # 0) Retrieve initial state
    s1, s2, s3 = convstate(game.getGameState())
    current_key = str(s1)+'|'+str(s2)+'|'+str(s3)
    
    while(not p.game_over()):
        
        # 1) Choose action greedily
        a = epsilon_greedy(current_key)
        
        action = None
        if a==1:
            action = 119
      
    
        # Execute
        reward = p.act(action)
        cumulated[i] += reward
        
        ss1, ss2, ss3 = convstate(game.getGameState())
        next_key = str(ss1)+'|'+str(ss2)+'|'+str(ss3)
        

        
        # 2) Update Q value
        if Q.get(current_key) == None:
            Q[current_key] = [0,0]
            
        maxQ = max(Q.get(next_key, [0]))
            
        
        Q[current_key][a] = (1-alpha)*Q[current_key][a] + alpha*( reward + gamma*maxQ )
        
        # Update values and map key
        s1 = ss1
        s2 = ss2
        s3 = ss3
        current_key = next_key
        
        

        


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


0 0.1 0.1 nan
100 0.08000000000000002 0.0995 -5.0
200 0.06400000000000002 0.09900250000000001 -4.98
300 0.051200000000000016 0.0985074875 -4.98
400 0.04096000000000002 0.09801495006250001 -4.74
500 0.03276800000000001 0.09752487531218751 -4.74
600 0.026214400000000013 0.09703725093562657 -4.38
700 0.02097152000000001 0.09655206468094843 -3.96
800 0.016777216000000008 0.09606930435754368 -3.68
900 0.013421772800000007 0.09558895783575597 -3.56
1000 0.010737418240000006 0.09511101304657718 -3.96
1100 0.01 0.09463545798134429 -3.18
1200 0.01 0.09416228069143756 -1.8
1300 0.01 0.09369146928798038 -2.34
1400 0.01 0.09322301194154048 -2.46
1500 0.01 0.09275689688183278 -3.0
1600 0.01 0.09229311239742362 -2.14
1700 0.01 0.0918316468354365 -1.06
1800 0.01 0.09137248860125932 -1.2
1900 0.01 0.09091562615825302 -0.32
2000 0.01 0.09046104802746176 0.32
2100 0.01 0.09000874278732444 -1.48
2200 0.01 0.08955869907338782 -1.02
2300 0.01 0.08911090557802087 0.08
2400 0.01 0.08866535105013076 -1.58
250

In [None]:
Q

<div class="alert alert-info">
Postprocess
</div>

In [None]:
# Mean evolution
vallist = list()

for idx, val in enumerate(cumulated):
    if idx < 50:
        pass
    else: 
        vallist.append(np.mean( cumulated[idx-50:idx] ))

plt.plot(vallist)

vallist = list()

for idx, val in enumerate(cumulated):
    if idx < 500:
        pass
    else: 
        vallist.append(np.mean( cumulated[idx-500:idx] ))

plt.plot(vallist)

In [None]:
#c1 = cumulated
#c2 = cumulated
c3 = cumulated

In [None]:
# Mean evolution
vallist = list()

result = np.concatenate((c1,c2,c3))

for idx, val in enumerate(result):
    if idx < 50:
        pass
    else: 
        vallist.append(np.mean( result[idx-50:idx] ))

plt.plot(vallist)

vallist = list()

for idx, val in enumerate(result):
    if idx < 500:
        pass
    else: 
        vallist.append(np.mean( result[idx-500:idx] ))

plt.plot(vallist)