In [5]:
%load_ext autoreload
%autoreload 2

import os, sys, random
path, _ = os.path.split(os.getcwd())
sys.path.append(path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

def sxs(images):
    plt.figure(figsize=(20, 20))
    for i in range(len(images)):
        images[i] = np.rot90(images[i])
        plt.subplot(1, len(images), i+1)
        plt.imshow(images[i], cmap='gray')

    plt.tight_layout()
    plt.show()

In [3]:
import numpy as np
import pandas as pd

In [8]:
# Set up environment

from game import AirHockey
env = AirHockey()

In [10]:
env.observe()

{'puck': (100, 100),
 'player': (350, 240),
 'opponent': (550, 240),
 'reward': -10}

In [20]:
env.update_state(action=(300, 300))

In [11]:
class Player(object):
    
    def __init__(self, env=None):
        """ Initialize a player for a game """
        
        if env is not None:
            self._env = env
        else:
            from game import AirHockey
            self._env = AirHockey()
            
    def move(self, action):
        " Move player "
        self._env.update_state(action)
        
    def location(self):
        return self._env.left_mallet.x, self._env.left_mallet.y
    
    def env(self):
        return self._env

In [12]:
player = Player(env)

In [35]:
action = np.zeros(shape=(env.table_midpoints[0], env.table_size[1]))

In [31]:
np.argwhere(action == 1)[0][0]

399

In [53]:
def make_action(position):
    """ Convert position into state space """
    
    action = np.zeros(shape=(env.table_midpoints[0], env.table_size[1]))
    action[position[0], position[1]] = 1
    return action

In [36]:
action[399, 200] = 1

In [41]:
position = (399, 200)

In [47]:
action = create_state(position)

In [51]:
np.unique(action)

array([0., 1.])

In [48]:
player.move(action=action)

In [49]:
player.location()

(399, 200)

In [37]:
player.env()

<game.AirHockey at 0x7f42cf70ce10>

### Explore basic Q Learner
https://github.com/srome/blackjacklearner/blob/master/app/qlearner.py

In [27]:
actions = ["U", "D", "L", "R"]

    
class Learner(Player):
        
    def __init__(self, env):
        super().__init__(env)
        self._Q = {}
        self._last_state = None
        self._last_action = None
        self._learning_rate = .7
        self._discount = .9
        self._epsilon = .9
        self._learning = True

    def reset_hand(self):
        self._hand = []
        self._last_state = None
        self._last_action = None

    def get_action(self, state):
        if state in self._Q and np.random.uniform(0,1) < self._epsilon:
            action = max(self._Q[state], key = self._Q[state].get)
        else:
            action = np.random.choice(actions)
            if state not in self._Q:
                self._Q[state] = {}
            self._Q[state][action] = 0

        self._last_state = state
        self._last_action = action

        return action

    def update(self,new_state,reward):
        if self._learning:
            old = self._Q[self._last_state][self._last_action]

            if new_state in self._Q:
                new = self._discount * self._Q[new_state][max(self._Q[new_state], key=self._Q[new_state].get)]
            else:
                new = 0

            self._Q[self._last_state][self._last_action] = (1-self._learning_rate)*old + self._learning_rate*(reward+new)
    
    def display_Q(self):
        return self._Q

In [33]:
from game import AirHockey
env = AirHockey()

In [37]:
env.observe()

{'puck': (100, 100), 'player': (335, 235), 'opponent': (843, 84), 'reward': -1}

In [34]:
learner = Learner(env)

In [65]:
learner.move((300, 80))

In [77]:
learner.location()

(350, 240)

In [133]:
env.reset()

In [35]:
action = str(np.random.choice(actions))
# type(action)
for _ in range(1000):
    # Update game state
    learner.move(action)

    # Observe state
    data = env.observe()

    # Determine next action
    action = learner.get_action(data.get("player"))

    # Update state
    learner.update(data.get("player"), data.get("reward"))

In [36]:
learner.display_Q()

{(350, 239): {'U': -1.9564299999999997},
 (350, 240): {'U': -4.404181903349599, 'D': -1.3509999999999998, 'R': -0.7},
 (350, 241): {'L': -6.63299137948386, 'D': -3.5300981655099997},
 (351, 241): {'R': -6.379560623100925, 'D': -2.5194798999999994, 'L': -0.7},
 (351, 240): {'U': -2.5194798999999994},
 (352, 241): {'L': -0.7},
 (353, 241): {'L': -0.7},
 (354, 241): {'L': -0.7},
 (355, 241): {'U': -3.5300981655099997},
 (355, 242): {'U': -3.0431163069999996,
  'D': -3.0431163069999996,
  'R': -3.0431163069999996},
 (355, 243): {'D': -3.9829912939243,
  'R': -3.0431163069999996,
  'L': -2.5194798999999994},
 (354, 242): {'U': -1.9564299999999997, 'L': -1.3509999999999998},
 (354, 243): {'R': -4.404181903349599,
  'L': -3.9829912939243,
  'U': -1.9564299999999997},
 (353, 243): {'L': -3.0431163069999996, 'U': -2.5194798999999994},
 (356, 243): {'R': -2.5194798999999994},
 (353, 244): {'D': -2.5194798999999994},
 (354, 244): {'U': -1.9564299999999997},
 (354, 245): {'L': -3.0431163069999996}