In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, random
# Make the parent of the working directory importable so project modules
# (e.g. `learners`, `environment`) resolve from inside this notebook.
path = os.path.dirname(os.getcwd())
# Guard the append: re-running this cell must not keep growing sys.path.
if path not in sys.path:
    sys.path.append(path)

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

def sxs(images):
    """Display images side by side in a single row.

    Each image is rotated by 90 degrees (``np.rot90``) before being drawn
    with a gray colormap.

    Args:
        images: iterable of 2-D (or image-like) arrays. The input is not
            modified; rotation happens on a private copy of the sequence.

    Note:
        Relies on ``np`` and ``plt`` being available in the notebook
        namespace at call time (``np`` is imported in a later cell).
    """
    # Rotate into a new list so the caller's sequence is left untouched and
    # repeated calls on the same list do not keep rotating the images.
    rotated = [np.rot90(image) for image in images]

    plt.figure(figsize=(20, 20))
    for column, image in enumerate(rotated, start=1):
        plt.subplot(1, len(rotated), column)
        plt.imshow(image, cmap='gray')

    plt.tight_layout()
    plt.show()

In [3]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop

import numpy as np
import pandas as pd

Using TensorFlow backend.


In [7]:
# Set up environment
# NOTE(review): `DQNLearner` imported here is shadowed by the class of the same
# name defined in a later cell of this notebook; which one is in effect depends
# on cell execution order. `Learner`, the base class used by that later
# definition, is not imported by any visible cell.
from learners import DQNLearner
from environment import AirHockey
# Environment instance shared by the cells below.
env = AirHockey()

In [16]:
class DQNLearner(Learner):
    """Q-learning agent backed by a small dense Keras network.

    The network maps a 2-element state to 4 output values
    (``Dense(2) -> relu -> Dense(10) -> relu -> Dense(4) -> linear``) and is
    trained with mean-squared error using RMSprop.

    NOTE(review): this class shadows the ``DQNLearner`` imported from
    ``learners`` earlier in the notebook, and ``Learner`` is not imported by
    any visible cell — confirm both are intended.
    """

    def __init__(self, env):
        """Build and compile the network.

        Args:
            env: environment instance, forwarded to ``Learner.__init__``.
        """
        super().__init__(env)
        self._learning = True
        # NOTE(review): not passed to the optimizer below, which therefore
        # runs with its default learning rate — confirm.
        self._learning_rate = .1
        self._discount = .1
        self._epsilon = .9

        # Create Model
        model = Sequential()

        model.add(Dense(2, kernel_initializer='lecun_uniform', input_shape=(2,)))
        model.add(Activation('relu'))

        model.add(Dense(10, kernel_initializer='lecun_uniform'))
        model.add(Activation('relu'))

        model.add(Dense(4, kernel_initializer='lecun_uniform'))
        model.add(Activation('linear'))

        rms = RMSprop()
        model.compile(loss='mse', optimizer=rms)

        self._model = model

    def make_action(self, position):
        """Convert position into state space.

        Args:
            position: pair of integer indices ``(row, col)``.

        Returns:
            Array of zeros with shape
            ``(table_midpoints[0], table_size[1])`` and a single 1 at
            ``position``.
        """
        # Use the learner's own environment rather than a notebook global.
        env = self.env()
        action = np.zeros(shape=(env.table_midpoints[0], env.table_size[1]))
        action[position[0], position[1]] = 1
        return action

    def _random_action(self):
        """Encode a uniformly random in-range position as an action."""
        env = self.env()
        pos = (random.randint(0, env.table_midpoints[0] - 1),
               random.randint(0, env.table_size[1] - 1))
        return self.make_action(pos)

    def get_action(self, state):
        """Choose the next action for ``state``.

        Args:
            state: unbatched state; wrapped into a batch of one for the model.

        Returns:
            ``state`` itself when the model is consulted and a predicted value
            exceeds 0.5, otherwise a random one-hot action grid.
        """
        rewards = self._model.predict(np.array([state]), batch_size=1)

        # NOTE(review): epsilon here is the probability of consulting the
        # model, the reverse of the usual epsilon-greedy convention — confirm.
        consult_model = np.random.uniform(0, 1) < self._epsilon

        # `rewards` holds several values, so it must be reduced before being
        # used as a condition (a bare `rewards > 0.5` raises ValueError).
        # Interpreted as "any predicted value exceeds 0.5" — TODO confirm.
        if consult_model and np.any(rewards > 0.5):
            return state
        return self._random_action()

    def update(self, new_state, reward):
        """Fit the network towards the target for the previous step.

        Does nothing when learning is disabled.

        Args:
            new_state: unbatched state observed after the last action.
            reward: reward observed for the last action.

        NOTE(review): ``_last_state``, ``_last_action`` and ``_last_target``
        are not assigned anywhere in this class; presumably the ``Learner``
        base maintains them — confirm.
        """
        if not self._learning:
            return

        # Batch the state the same way get_action() and fit() do.
        rewards = self._model.predict(np.array([new_state]), batch_size=1)

        # Scalar maximum of the first two outputs (previously the whole array
        # was selected when the first output was larger).
        # NOTE(review): the network has 4 outputs but only slots 0 and 1 are
        # compared and updated — confirm.
        q_first, q_second = rewards[0][0], rewards[0][1]
        maxQ = q_first if q_first > q_second else q_second
        new = self._discount * maxQ

        # array_equal yields a single bool even for array operands.
        if np.array_equal(self._last_action, new_state):
            self._last_target[0][0] = reward + new
        else:
            self._last_target[0][1] = reward + new

        # Update model (`epochs` is the Keras 2 name for `nb_epoch`).
        self._model.fit(np.array([self._last_state]),
                        self._last_target,
                        batch_size=1,
                        epochs=1,
                        verbose=0)

In [8]:
# Instantiate the learner for the shared environment.
# NOTE(review): this cell's execution count (8) precedes that of the in-notebook
# class definition (16), and the summary printed below lists four Dense layers
# while that class builds three — the output likely comes from the imported
# `learners.DQNLearner`. Confirm after a Restart & Run All.
learner = DQNLearner(env)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4)                 12        
_________________________________________________________________
activation_1 (Activation)    (None, 4)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                50        
_________________________________________________________________
activation_2 (Activation)    (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                330       
_________________________________________________________________
activation_3 (Activation)    (None, 30)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 30)                930       
__________

In [16]:
# Restore learner state from "test.h5".
# NOTE(review): `load` is not defined in this notebook; presumably it restores
# saved model weights — confirm against the `learners` module.
learner.load("test.h5")

In [18]:
# Reset the environment, then display the current observation
# (a dict with 'puck', 'player', 'opponent' and 'reward' entries, per the output below).
env.reset()
env.observe()

{'puck': (100, 100),
 'player': (350, 240),
 'opponent': (550, 240),
 'reward': -200}

In [20]:
# Seed the loop with a randomly chosen initial action.
# NOTE(review): `actions` is not defined in any visible cell, so this relies on
# leftover kernel state; define it explicitly so the notebook survives
# Restart & Run All.
action = str(np.random.choice(actions))
# type(action)
# Run 1000 move -> observe -> choose -> update steps.
for _ in range(1000):
    # Update game state
    learner.move(action)

    # Observe state
    data = env.observe()

    # Determine next action
    action = learner.get_action(data.get("player"))
    # NOTE(review): prints on every step and floods the cell output.
    print(action)
    # Update state
    learner.update(data.get("player"), data.get("reward"))

[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0.



[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
L
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
U
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
D
[[0. 0. 0. 0.]]
None
[[0. 0. 0. 0.]]
R
[[0. 0.