In [43]:
%run data_preprocessing.ipynb

In [199]:
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, Flatten, Input, Lambda, multiply
from keras.layers.convolutional import Conv2D, MaxPooling2D

from sklearn.preprocessing import MinMaxScaler

import itertools
import random

import copy

In [67]:
# states = Input(shape=(4, 80, 80), name='states')
# actions = Input(shape=(3,), name='actions')

# scaled = Lambda(lambda x:x/255)(states)

# conv1 = Conv2D(16, (8, 8), activation='relu', padding='same')(scaled)
# # pool1 = MaxPooling2D(pool_size=(4, 4), strides=(4, 4))(conv1)
# conv2 = Conv2D(32, (4, 4), activation='relu', padding='same')(conv1)
# # pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(conv2)

# flattened = Flatten()(conv2)
# dense = Dense(256, activation='relu')(flattened)
# output = Dense(3)(dense)
# filtered_output = multiply([output, actions])

# model = Model(inputs=[states, actions], outputs=filtered_output)
# model.compile(optimizer='rmsprop', loss='mse')

In [72]:
model = Sequential()

In [105]:
states, actions, rewards = get_random_data()

In [106]:
states, actions, rewards = np.array(states).reshape((len(states), 80, 80, 1)), \
                           np.array(actions), \
                           np.array(rewards).reshape((len(rewards), 1))

In [227]:
class RingBuffer:
    def __init__(self, capacity=50000):
        self.capacity = capacity
        self.memory = []
        self.position = 0
        
    def append(self, datum):
        if len(self.memory) < self.capacity:
            self.memory.append(datum)
        else: 
            self.memory[self.position] = datum
            
        self.position = (self.position + 1) % self.capacity
        
    def sample(self, batch_size):
        return random.sample(self.memory, min(batch_size, len(self.memory)))

In [228]:
buf = RingBuffer()

In [229]:
def get_epsilon():
    i = 0
    while(True):
        yield (1-0.9*(i/1000000)) if i < 1000000 else 0.1
        
        i += 1

In [334]:
ACTIONS = [2, 0, 5]
MAX_GRAYSCALE = 255.0
BATCH_SIZE = 32
UP = 2
NOOP = 0
DOWN = 5

In [324]:
model = Sequential()
model.add(Conv2D(16, (8, 8), input_shape=(80, 80, 4), activation='relu', padding='same'))
model.add(Conv2D(32, (4, 4), activation='relu', padding='same'))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(3))
model.compile(optimizer='rmsprop', loss='mse')

In [325]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_41 (Conv2D)           (None, 80, 80, 16)        4112      
_________________________________________________________________
conv2d_42 (Conv2D)           (None, 80, 80, 32)        8224      
_________________________________________________________________
flatten_16 (Flatten)         (None, 204800)            0         
_________________________________________________________________
dense_31 (Dense)             (None, 256)               52429056  
_________________________________________________________________
dense_32 (Dense)             (None, 3)                 771       
Total params: 52,442,163
Trainable params: 52,442,163
Non-trainable params: 0
_________________________________________________________________
None


In [353]:
def q_learn(env, model, gamma=0.9, *args):
    get_ep = get_epsilon()
    done = False
    memory = RingBuffer()
    s_t = process_data(env.reset(), init=True)
    loss = 0
    
    S_t = s_t.repeat(4, axis=3)
    
    for t in itertools.count():
        if np.random.random() <= next(get_ep):
            action_idx = np.random.randint(0, 3)
        else:
            action_idx = np.argmax(model.predict(state))
            
        s_t_prime, reward, done, info = env.step(ACTIONS[action_idx])
        s_t_prime = process_data(s_t_prime, init=True)
                
        S_t_prime = np.append(s_t_prime, S_t[:,:,:,:3], axis=3)
        
        memory.append((S_t, action_idx, reward, S_t_prime, done))
        
        env.render()
        
        if t > 400:
            batch = memory.sample(BATCH_SIZE)
            
            states_t, actions_t, rewards_t, states_t_prime, done_t = [], [], [], [], []
            for state, action, reward, state_prime, done in batch:
                states_t.append(state[0])
                actions_t.append(action)
                rewards_t.append(reward)
                states_t_prime.append(state_prime[0])
                done_t.append(done)
                
            states_t = np.array(states_t)
            states_t_prime = np.array(states_t_prime)
                
            Q_sa_prime = model.predict(states_t)
            Q_sa = model.predict(states_t_prime)
            
            Q_sa_prime[:, actions_t] = rewards_t + gamma*np.max(Q_sa, axis=1)*np.invert(done_t)
            
            loss += model.train_on_batch(states_t, Q_sa_prime)
            print(loss)
        
        S_t = S_t_prime

In [354]:
env = gym.make('Pong-v0')

In [355]:
q_learn(env, model)

2.506897211074829
4.699760675430298
5.927212238311768
7.36489725112915
7.8603794276714325
8.817308157682419
9.280901163816452
9.635805875062943
9.8429656624794
10.176226019859314
10.346973285079002
10.5488810390234
11.114807114005089
11.434024915099144
11.508469045162201
11.571938633918762
11.685678631067276
11.735869988799095
11.832798905670643
11.893268540501595
11.954121999442577
12.006131909787655
12.052553858608007
12.078924801200628
12.113744542002678
12.13564833253622
12.154784049838781
12.174413915723562
12.184259906411171
12.623053535819054
12.849636435508728
12.885903976857662
12.894569186493754
12.899832791648805


KeyboardInterrupt: 

In [336]:
def process_data(data, init=False):
    cropped = data[34:-16,:]
    if init:
        return (np.mean(cropped[::2,::2,:], axis=2)/255.0).reshape((1, 80, 80, 1))
    else: 
        return (np.mean(cropped[::2,::2,:], axis=2)/255.0).reshape((80, 80, 1))

In [335]:
def one_hot_encode(action):
    if action == UP: return [1, 0, 0]
    elif action == NOOP: return [0, 1, 0]
    else: return [0, 0, 1]

In [None]:
process_data(ob).repeat(4, axis=3).shape

In [None]:
def replay(model, memory, batch_size=32):
    batch = memory.sample(batch_size)
    
    states = [_[0] for _ in batch]
    s_primes = [_[3] for _ in batch]
    
    p = model.predict()