In [9]:
import numpy as np
import matplotlib.pyplot as plt
from ple import PLE
from ple.games.pixelcopter import Pixelcopter
import pandas as pd


class NaiveAgent():

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        return self.actions[np.random.randint(0, len(self.actions))]

In [179]:
class LearningAgent:
    def __init__(self):

        self.num_bins = 3
        
        self.boundaries = {
            'player_vel': [-1, 1],
            'player_dist_to_floor': [35, 40], 
            'player_dist_to_ceil': [10, 15], 
            'next_gate_dist_to_player': [5, 35]   
        }
            
        
        self.discrete_states = {
            'player_vel': np.linspace(self.boundaries['player_vel'][0], self.boundaries['player_vel'][1], 3),
            'player_dist_to_floor': np.linspace(self.boundaries['player_dist_to_floor'][0],
                                                self.boundaries['player_dist_to_floor'][1], 3),
            'player_dist_to_ceil': np.linspace(self.boundaries['player_dist_to_ceil'][0],
                                                self.boundaries['player_dist_to_ceil'][1], 3),
            'next_gate_dist_to_player': np.linspace(self.boundaries['player_dist_to_ceil'][0],
                                                self.boundaries['player_dist_to_ceil'][1], 3)
            }

        self.q = self.discretize_states()

    def discretize_states(self):
        q_table = {}
        for pv in range(len(self.discrete_states['player_vel'])+1):
            for df in range(len(self.discrete_states['player_dist_to_floor'])+1):
                for dc in range(len(self.discrete_states['player_dist_to_ceil'])+1):
                    for db in range(len(self.discrete_states['next_gate_dist_to_player'])+1):
                        for bl in range(2):
                            q_table[hash((pv,df,dc,db,bl))] = {199: 0, None: 0, 'counter199': 0, 'counterNone': 0} 
        
        return q_table 
    
    def build_state(self, observation):
        
        player_vel = int(np.digitize(observation['player_vel'], self.discrete_states['player_vel']))
        player_dist_to_floor = int(np.digitize(observation['player_dist_to_floor'], self.discrete_states['player_dist_to_floor']))
        player_dist_to_ceil = int(np.digitize(observation['player_dist_to_ceil'], self.discrete_states['player_dist_to_ceil']))
        next_gate_dist_to_player = int(np.digitize(observation['next_gate_dist_to_player'], self.discrete_states['next_gate_dist_to_player']))
        
        if observation['next_gate_block_bottom']>observation['next_gate_block_top']:
            next_gate_block_loc = 1
        else:
            next_gate_block_loc = 0
        
        state = hash((player_vel, player_dist_to_floor, player_dist_to_ceil, 
                     next_gate_dist_to_player, next_gate_block_loc))
                        
        return state
    

    def pickAction(self, observation):
        state = self.build_state(observation)
        
        #action = 199 if (dict1[8688455073536272099][199] > dict1[8688455073536272099][None]) else None
        action = [199, None][np.random.randint(0, 2)]
        self.q[state]['counter'+str(action)] += 1
        
        return action
    
    
    def update_values(self, observation, reward, observation_next, action):

        state = self.build_state(observation)
        state_next = self.build_state(observation_next)
        
        learning_rate = 1 / self.q[state]['counter'+str(action)]
        self.q[state][action] = self.q[state][action] + learning_rate * (reward + 
                            1*self.q[state_next][action] - self.q[state][action]) 
        return

In [180]:
agent.discrete_states

{'player_vel': array([-1.,  0.,  1.]),
 'player_dist_to_floor': array([35. , 37.5, 40. ]),
 'player_dist_to_ceil': array([10. , 12.5, 15. ]),
 'next_gate_dist_to_player': array([10. , 12.5, 15. ])}

In [186]:
n_episodes = 10

game = Pixelcopter(width=100, height=100)

p = PLE(game, fps=30, frame_skip=2, num_steps=1,
        force_fps=False, display_screen=False)

agent = LearningAgent()

In [None]:
results = pd.DataFrame()
actions = []
for episode_index in range(n_episodes):
    #print('Eposode %4d running...' % episode_index)
    p.reset_game()

        
    for timestep_index in range(100):
       
        observation = game.getGameState()
        action = agent.pickAction(observation)
        reward = p.act(action)
        
        if reward < 0:
            reward = -5
        
        observation_next = game.getGameState()
        agent.update_values(observation, reward, observation_next, action)

        observation['action'] = action
        observation['reward'] = reward
        
        results = results.append(pd.DataFrame(observation, index=[episode_index]))
        
        
        done = p.game_over() 
        if done:
            print('Episode %4d result: %3d timesteps' % (episode_index, timestep_index))
            break
 

Episode    0 result:   6 timesteps
Episode    1 result:   9 timesteps
Episode    2 result:  11 timesteps
Episode    3 result:   8 timesteps
Episode    4 result:  10 timesteps
Episode    5 result:   9 timesteps
Episode    6 result:  11 timesteps
Episode    7 result:  10 timesteps
Episode    8 result:  10 timesteps
Episode    9 result:  10 timesteps
Episode   10 result:   8 timesteps
Episode   11 result:  10 timesteps
Episode   12 result:  11 timesteps
Episode   13 result:   8 timesteps
Episode   14 result:  11 timesteps
Episode   15 result:  11 timesteps
Episode   16 result:   9 timesteps
Episode   17 result:   8 timesteps
Episode   18 result:  10 timesteps


In [185]:
results