In [14]:
import gym
import numpy as np
import random
import time
from keras.models import Sequential, Model
from keras.layers import Convolution2D, Flatten, Dense, Input, merge
from keras.optimizers import Adam

%matplotlib inline
from matplotlib import pyplot as plt
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import deque

def image_preprocess(obs, normalize=True):
    """Turn a raw frame into a small grayscale uint8 image.

    The frame is passed through ``resize`` with the target shape
    ``(FRAME_WIDTH, FRAME_HEIGHT)``, converted with ``rgb2gray``, multiplied
    by a scale factor and cast to ``uint8``.

    Args:
        obs: frame accepted by ``skimage.transform.resize`` and
            ``skimage.color.rgb2gray`` (the training loops pass the
            observation returned by ``env.step``).
        normalize: when true the grayscale values are multiplied by 255
            before the cast; when false they are multiplied by 1.

    Returns:
        The processed frame as a ``uint8`` array.
    """
    # NOTE(review): with normalize=False the values are cast to uint8 unscaled;
    # if rgb2gray yields floats in [0, 1] that truncates almost everything to 0
    # -- confirm that this is intended.
    scale = 255 if normalize else 1

    grayscale = rgb2gray(resize(obs, (FRAME_WIDTH, FRAME_HEIGHT)))
    return np.uint8(grayscale * scale)

def print_obs(obs):
    """Display ``obs`` as an image in a freshly created matplotlib figure.

    The image is drawn with ``interpolation='nearest'``.
    """
    plt.figure()  # open a new figure so an earlier plot is not drawn over
    plt.imshow(obs, interpolation='nearest')

# Gym environment id used for training; the trailing comment names an alternative id.
ENVIRONMENTS = 'Enduro-v0'#Kangaroo-v0
FRAME_WIDTH = 84  # Resized frame width
FRAME_HEIGHT = 84  # Resized frame height
# Placeholder only: overwritten below with env.action_space.n once the environment exists.
NUM_ACTIONS = 18
NUM_FRAMES = 4 # The number of most recent frames experienced by the agent that are given as input to the Q network
# Presumably the start and end exploration rates for predict_action's epsilon;
# no annealing schedule between the two is visible in this notebook.
INITIAL_EPSILON = 1.0
FINAL_EPSILON = 0.1
# Number of episodes each training cell runs.
NUM_EPISODES = 20

# NOTE(review): build_model1/build_model2 compile Adam with a learning rate of 0.00025, not this value.
DQN_LEARNING_RATE = 0.0025 # PAPER2: the learning rate used
DQN_UPDATE_TARGET = 10000 # PAPER2: the frequency with which the target network is updated
# Discount factor applied to the maximum next-state Q-value in learn/learn2.
DQN_DECAY_RATE = 0.99

MEMORY_BATCH_SIZE = 32 # Number of training cases over which each (SGD) update is computed
# Capacity handed to ReplayExperience by the training cells.
MEMORY_SIZE = 10000
# Minimum number of stored transitions (memory.count) before the training cells start learning.
MEMORY_MIN_2_LEARN = 100

# Create the environment and replace the NUM_ACTIONS placeholder with its real action count.
env = gym.make(ENVIRONMENTS)
NUM_ACTIONS = env.action_space.n

env.reset()
# List every action index next to its meaning (Python 2 print statement).
for i, a in enumerate(env.unwrapped.get_action_meanings()):
    print i, a

0 NOOP
1 FIRE
2 RIGHT
3 LEFT
4 DOWN
5 DOWNRIGHT
6 DOWNLEFT
7 RIGHTFIRE
8 LEFTFIRE


In [2]:
class ReplayExperience:
    """Fixed-capacity buffer that stores past moves, states and rewards.

    Reference: Reinforcement Learning for Robots Using Neural Networks
    [http://www.dtic.mil/docs/citations/ADA261434]

    Every entry is the tuple ``(state_old, action, reward, state, done)``.
    Once ``size`` entries are stored, adding one more discards the oldest.

    Attributes:
        count: number of entries currently stored (never larger than ``size``).
        size: maximum number of entries kept.
        memory: ``collections.deque`` holding the entries, oldest first.
    """

    def __init__(self, experience_size):
        """Create an empty buffer.

        Args:
            experience_size: non-negative integer capacity. With a capacity
                of 0 every added entry is dropped immediately.
        """
        self.count = 0
        self.size = experience_size
        # maxlen makes the deque evict its oldest entry by itself on overflow.
        self.memory = deque(maxlen=experience_size)

    def add(self, state_old, action, reward, state, done):
        """Append one transition, evicting the oldest one when the buffer is full."""
        self.memory.append((state_old, action, reward, state, done))
        # Kept as a plain attribute because callers read ``memory.count`` directly.
        self.count = len(self.memory)

    def sample(self, batch_size):
        """Draw transitions uniformly at random, without replacement.

        Args:
            batch_size: number of transitions wanted. When fewer are stored,
                all stored transitions are returned (in random order).

        Returns:
            A 5-tuple of NumPy arrays
            ``(batch_state_old, batch_action, batch_reward, batch_state, batch_done)``,
            each with the sampled transitions along the first axis.

        Raises:
            ValueError: if the buffer is empty or ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive, got %r" % (batch_size,))
        if not self.memory:
            raise ValueError("cannot sample from an empty replay buffer")

        how_many = min(batch_size, len(self.memory))
        picked = random.sample(list(self.memory), how_many)

        # Transpose the list of transitions into one array per field.
        batch_state_old, batch_action, batch_reward, batch_state, batch_done = [
            np.array(field) for field in zip(*picked)
        ]
        return batch_state_old, batch_action, batch_reward, batch_state, batch_done

In [3]:
def build_model1(learning_rate=0.00025):
    """Build and compile the smaller convolutional Q-network.

    Layers, in order: an input of shape
    ``(FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)`` (frames on the last axis),
    a convolution with 16 filters of 8x8 and stride 4, a convolution with
    32 filters of 4x4 and stride 2, a 256-unit dense layer (all three with
    ReLU) and a final dense layer with ``NUM_ACTIONS`` outputs and no
    activation.

    Args:
        learning_rate: learning rate given to the Adam optimizer.

    Returns:
        The Keras ``Model`` compiled with mean-squared-error loss.
    """
    frames = Input(shape=(FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES))
    conv1 = Convolution2D(16, (8, 8), strides=(4, 4), activation='relu')(frames)
    conv2 = Convolution2D(32, (4, 4), strides=(2, 2), activation='relu')(conv1)
    hidden = Dense(256, activation='relu')(Flatten()(conv2))
    q_values = Dense(NUM_ACTIONS)(hidden)  # one output per action

    model = Model(inputs=[frames], outputs=[q_values])
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

    return model

def build_model2(learning_rate=0.00025):
    """Build and compile the larger convolutional Q-network.

    Layers, in order: an input of shape
    ``(FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)`` (frames on the last axis),
    convolutions with 32 filters of 8x8 / stride 4, 64 filters of 4x4 /
    stride 2 and 64 filters of 3x3 / stride 1, a 512-unit dense layer (all
    four with ReLU) and a final dense layer with ``NUM_ACTIONS`` outputs and
    no activation.

    Args:
        learning_rate: learning rate given to the Adam optimizer.

    Returns:
        The Keras ``Model`` compiled with mean-squared-error loss.
    """
    frames = Input(shape=(FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES))
    conv1 = Convolution2D(32, (8, 8), strides=(4, 4), activation='relu')(frames)
    conv2 = Convolution2D(64, (4, 4), strides=(2, 2), activation='relu')(conv1)
    conv3 = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(conv2)
    hidden = Dense(512, activation='relu')(Flatten()(conv3))
    q_values = Dense(NUM_ACTIONS)(hidden)  # one output per action

    model = Model(inputs=[frames], outputs=[q_values])
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

    return model

def predict_action(model, state, epsilon):
    """Choose an action with an epsilon-greedy rule.

    A uniform random number is drawn first; when it is ``<= epsilon`` a
    random action index is returned, otherwise the index of the largest
    value predicted by ``model`` for ``state``.

    Args:
        model: object whose ``predict`` returns one value per action.
        state: array that can be reshaped to
            ``(1, FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)``.
        epsilon: threshold compared against the random draw.

    Returns:
        ``(action, categorical_action)``: the chosen index and a float vector
        of length ``NUM_ACTIONS`` that is 1 at that index and 0 elsewhere.
    """
    explore = np.random.random() <= epsilon

    if explore:
        chosen = random.randrange(NUM_ACTIONS)
    else:
        net_input = state.reshape(1, FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)
        chosen = np.argmax(model.predict(net_input, batch_size = 1))

    one_hot = np.zeros(NUM_ACTIONS)
    one_hot[chosen] = 1

    return chosen, one_hot

In [4]:
# Build the Q-network. learn/learn2 read it as the module-level `model`, and
# the training cells pass it to predict_action.
model = build_model1()

In [10]:
def learn(b_state_old, b_action, b_reward, b_state, b_done):
    """Run one training step of the global ``model`` on a batch with one-hot actions.

    The regression target of each transition is the network's current
    prediction for ``b_state_old``, except at the position marked by the
    one-hot action, where it becomes::

        reward                                            (terminal)
        reward + DQN_DECAY_RATE * max(prediction(state))  (otherwise)

    Args:
        b_state_old: batch of states before the action; reshapeable to
            ``(batch, FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)``.
        b_action: batch of one-hot action vectors (rows with a single 1), as
            stored by the training cell that saves ``cat_action``. Not modified.
        b_reward: batch of rewards.
        b_state: batch of states after the action, same layout as ``b_state_old``.
        b_done: batch of terminal flags.

    Returns:
        Whatever ``model.train_on_batch`` returns (the loss), or ``None`` for
        an empty batch, in which case nothing is trained.
    """
    batch_size = len(b_state_old)
    if batch_size == 0:
        return None

    net_shape = (batch_size, FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)
    # One forward pass per state batch instead of two predict calls per sample.
    q_old = np.asarray(model.predict(np.asarray(b_state_old).reshape(net_shape)), dtype=np.float64)
    q_next = np.asarray(model.predict(np.asarray(b_state).reshape(net_shape)), dtype=np.float64)

    taken = np.asarray(b_action, dtype=np.float64)
    rewards = np.asarray(b_reward, dtype=np.float64)
    not_done = ~np.asarray(b_done, dtype=bool)

    # The immediate reward is part of the target for every transition; the
    # discounted future value is added only when the episode continues.
    returns = rewards + DQN_DECAY_RATE * q_next.max(axis=1) * not_done

    # Keep the prediction everywhere except at the taken action.
    targets = q_old * (1.0 - taken) + taken * returns[:, np.newaxis]

    return model.train_on_batch(b_state_old, targets)
    
def learn2(b_state_old, b_action, b_reward, b_state, b_done):
    """Run one training step of the global ``model`` on a batch with integer actions.

    The regression target of each transition is the network's current
    prediction for ``b_state_old``, except at the index of the taken action,
    where it becomes::

        reward                                            (terminal)
        reward + DQN_DECAY_RATE * max(prediction(state))  (otherwise)

    Args:
        b_state_old: array of states before the action; reshapeable to
            ``(batch, FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)``.
        b_action: batch of action indices, as stored by the training cell
            that saves ``action``.
        b_reward: batch of rewards.
        b_state: batch of states after the action, same layout as ``b_state_old``.
        b_done: batch of terminal flags.

    Returns:
        Whatever ``model.train_on_batch`` returns (the loss), or ``None`` for
        an empty batch, in which case nothing is trained.
    """
    batch_size = b_state_old.shape[0]
    if batch_size == 0:
        return None

    net_shape = (batch_size, FRAME_WIDTH, FRAME_HEIGHT, NUM_FRAMES)
    # One forward pass per state batch instead of two predict calls per sample.
    q_next = np.asarray(model.predict(np.asarray(b_state).reshape(net_shape)), dtype=np.float64)

    targets = np.zeros((batch_size, NUM_ACTIONS))
    targets[:] = model.predict(b_state_old.reshape(net_shape))

    rewards = np.asarray(b_reward, dtype=np.float64)
    not_done = ~np.asarray(b_done, dtype=bool)
    returns = rewards + DQN_DECAY_RATE * q_next.max(axis=1) * not_done

    # Each row gets its own update: row m is overwritten at the action taken
    # in transition m.
    rows = np.arange(batch_size)
    actions = np.asarray(b_action).astype(int)
    targets[rows, actions] = returns

    return model.train_on_batch(b_state_old, targets)
        
    
def _prepare_action():
    pass

In [17]:
# Training run with integer actions (updates the network through learn2).

# Reset and close the environment's render window before starting.
env.reset()
env.close()
env.render(close=True)

env.reset()
memory = ReplayExperience(MEMORY_SIZE)

for episode in range(NUM_EPISODES):
    env.reset()

    # Initial state: NUM_FRAMES preprocessed frames obtained by repeating action 0 (NOOP).
    reward, done, state = 0, False, []
    for frame_idx in range(NUM_FRAMES):
        _observation, _reward, _done, _info = env.step(0)
        reward += _reward
        state.append(image_preprocess(_observation))
        done = done or _done

    state_old = np.stack(state, axis=-1)

    while not done:
        start = time.time()
        # Render screen emulator
        env.render()

        # Choose an action from the current state (exploitation) or at random
        # (exploration); the exploration rate is fixed at FINAL_EPSILON.
        action, cat_action = predict_action(model, state_old, FINAL_EPSILON)

        # Repeat the action for NUM_FRAMES steps: the frames form the next
        # state and their rewards are summed.
        reward, done, state = 0, False, []
        for frame_idx in range(NUM_FRAMES):
            # Take more frames to remove flickering
            _observation, _reward, _done, _info = env.step(action)
            reward += _reward  # accumulate reward
            state.append(image_preprocess(_observation))  # add preprocessed frame to state
            done = done or _done
        state = np.stack(state, axis=-1)

        # Store the transition (integer action) in the replay buffer.
        memory.add(state_old, action, reward, state, done)

        # Learn once enough transitions are stored. The minibatch holds
        # MEMORY_BATCH_SIZE transitions; MEMORY_MIN_2_LEARN is only the
        # warm-up threshold.
        if memory.count >= MEMORY_MIN_2_LEARN:
            batch_state_old, batch_action, batch_reward, batch_state, batch_done = memory.sample(MEMORY_BATCH_SIZE)
            learn2(batch_state_old, batch_action, batch_reward, batch_state, batch_done)

        # The new state becomes the starting point of the next step.
        state_old = state
        end = time.time()
        # Seconds spent on this step (single-argument form works on Python 2 and 3).
        print(end - start)

env.reset()
env.close()
env.render(close=True)

0.183672904968
0.0195140838623
0.0168650150299
0.0186860561371
0.0178859233856
0.0179131031036
0.0184330940247
0.0170440673828
0.0179100036621
0.0182311534882
0.0179970264435
0.0178260803223
0.0291740894318
0.0312700271606
0.0177628993988
0.017077922821
0.0183980464935
0.017725944519
0.0174279212952
0.0169789791107
0.0176820755005
0.0179560184479
0.0178329944611
0.0176210403442
0.0281488895416
0.0339288711548
0.0160450935364
0.0172090530396
0.0177800655365
0.0183379650116
0.0181760787964
0.0185630321503
0.019257068634
0.0282380580902
0.0177710056305
0.041738986969
0.0243060588837
0.0185890197754
0.0183489322662
0.020005941391
0.0190658569336
0.0180721282959
0.0180490016937
0.0183129310608
0.0157239437103
0.0165162086487
0.0226731300354
0.0316481590271
0.0291588306427
0.02143907547
0.0179719924927
0.0188989639282
0.0155389308929
0.0185439586639
0.0168199539185
0.0173790454865
0.0180990695953
0.0219509601593
0.0287249088287
0.0262489318848
0.0189409255981
0.0214929580688
0.0202269554138


KeyboardInterrupt: 

In [19]:
# Training run with one-hot actions (updates the network through learn).

# Reset and close the environment's render window before starting.
env.reset()
env.close()
env.render(close=True)

env.reset()
memory = ReplayExperience(MEMORY_SIZE)

# A learning update is run on every TRAIN_EVERY_N_STEPS-th step of an episode.
TRAIN_EVERY_N_STEPS = 20

for episode in range(NUM_EPISODES):
    env.reset()

    # Initial state: NUM_FRAMES preprocessed frames obtained by repeating action 0 (NOOP).
    reward, done, state = 0, False, []
    for frame_idx in range(NUM_FRAMES):
        _observation, _reward, _done, _info = env.step(0)
        reward += _reward
        state.append(image_preprocess(_observation))
        done = done or _done

    state_old = np.stack(state, axis=-1)
    count = 0  # steps taken in this episode
    while not done:
        start = time.time()
        # Rendering is switched off in this run; call env.render() here to watch the emulator.

        # Choose an action from the current state (exploitation) or at random
        # (exploration); the exploration rate is fixed at FINAL_EPSILON.
        action, cat_action = predict_action(model, state_old, FINAL_EPSILON)

        # Repeat the action for NUM_FRAMES steps: the frames form the next
        # state and their rewards are summed.
        reward, done, state = 0, False, []
        for frame_idx in range(NUM_FRAMES):
            # Take more frames to remove flickering
            _observation, _reward, _done, _info = env.step(action)
            reward += _reward  # accumulate reward
            state.append(image_preprocess(_observation))  # add preprocessed frame to state
            done = done or _done
        state = np.stack(state, axis=-1)

        # Store the transition with the one-hot action, the format learn expects.
        memory.add(state_old, cat_action, reward, state, done)

        # Learn once enough transitions are stored. The minibatch holds
        # MEMORY_BATCH_SIZE transitions; MEMORY_MIN_2_LEARN is only the
        # warm-up threshold.
        if memory.count >= MEMORY_MIN_2_LEARN and count % TRAIN_EVERY_N_STEPS == 0:
            batch_state_old, batch_action, batch_reward, batch_state, batch_done = memory.sample(MEMORY_BATCH_SIZE)
            learn(batch_state_old, batch_action, batch_reward, batch_state, batch_done)

        # The new state becomes the starting point of the next step.
        state_old = state

        end = time.time()
        # Seconds spent on this step (single-argument form works on Python 2 and 3).
        print(end - start)
        count += 1

env.reset()
env.close()
env.render(close=True)

0.0214509963989
0.0181210041046
0.0220382213593
0.019073009491
0.0197479724884
0.0214629173279
0.0217709541321
0.022500038147
0.019492149353
0.0225529670715
0.0244069099426
0.0251159667969
0.023866891861
0.0168471336365
0.0164649486542
0.0167880058289
0.0159540176392
0.0172219276428
0.0160179138184
0.0164561271667
0.0161759853363
0.0157840251923
0.015594959259
0.0262100696564
0.0306479930878
0.016909122467
0.016658782959
0.0172579288483
0.0169188976288
0.0167601108551
0.0163400173187
0.0163781642914
0.0168759822845
0.0162291526794
0.0166199207306
0.0238780975342
0.0285890102386
0.0194401741028
0.0160748958588
0.0161769390106
0.0164461135864
0.0160510540009
0.0167541503906
0.0164959430695
0.016478061676
0.0160391330719
0.0172579288483
0.0249710083008
0.0273878574371
0.0177748203278
0.0161218643188
0.0164320468903
0.0155429840088
0.0160710811615
0.0142991542816
0.0160269737244
0.016223192215
0.0168340206146
0.0164201259613
0.0263321399689
0.0294511318207
0.0187981128693
0.0155029296875
0

KeyboardInterrupt: 

In [80]:
# Manual check: draw up to 32 stored transitions and run a single learn() update on them.
b_state_old, b_action, b_reward, b_state, b_done = memory.sample(32)
learn(b_state_old, b_action, b_reward, b_state, b_done)

('We had a loss equal to ', 0.8977414)


In [10]:
# Scratch check: the all-zero vector with one entry per action that
# predict_action turns into a one-hot encoding.
np.zeros(NUM_ACTIONS)

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [38]:
# Scratch check: multiplying a NumPy array by a scalar scales every element.
# a2 is defined but not used in this cell.
a1 = np.array([1, 1])
a2 = np.array([2, 2])
print a1*3

[3 3]


In [59]:
# Scratch check: shape of the next-state batch (element 3 of the tuple
# returned by sample) when a single transition is drawn.
memory.sample(1)[3].shape

(1, 4, 84, 84)

In [67]:
# Scratch check: number of transitions currently held by the replay buffer.
memory.count

100