In [1]:
import tensorflow as tf
import keras
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras import initializers
from keras.optimizers import Adam
import json
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD , Adam
import tensorflow as tf
import skimage
from skimage import color, exposure, transform

Using TensorFlow backend.


In [2]:
# Create the 'PongDeterministic-v4' environment through Gym's registry.
env = gym.make('PongDeterministic-v4')

[2017-12-19 15:17:26,980] Making new env: PongDeterministic-v4


In [3]:
# Hyper-parameters and configuration shared by the cells below.
ACTIONS = env.action_space.n # number of valid actions
GAMMA = 0.99 # discount factor applied to the bootstrapped next-state value
OBSERVATION = 30000. # timesteps of purely random play before epsilon annealing/training begin
EXPLORE = 700000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.01 # final value of epsilon
INITIAL_EPSILON = 1 # starting value of epsilon
REPLAY_MEMORY = 30000 # number of previous transitions to remember
BATCH = 64 # size of minibatch
LEARNING_RATE = 1e-3 # learning rate handed to the Adam optimizer in build_model

# Frames are converted to grayscale and resized to img_rows x img_cols.
img_rows , img_cols = 80, 80
# Each network input stacks several grayscale frames along the channel axis.
img_channels = 3 # we stack 3 frames

# Upper bound on the number of steps taken in a single episode.
max_epLength = 3000

# A training update is performed every `update_freq` timesteps.
update_freq = 4

# Number of episodes run by train_model.
NUM_EPISODES = 5000

# NOTE(review): not referenced in the visible cells; the name says "breakout"
# although the environment above is Pong -- confirm before using it.
MODEL_NAME = "breakout_v4_model.h5"

# Soft-update rate: target <- tau * online + (1 - tau) * target.
tau = 0.01

In [4]:
def build_model():
    """Build and compile the convolutional Q-network.

    The network maps a stack of frames with shape
    (img_rows, img_cols, img_channels) to one Q-value per action:
    three convolution + ReLU stages, a 512-unit ReLU dense layer and a
    linear output layer of size ACTIONS.

    Returns:
        A Keras ``Sequential`` model compiled with mean-squared-error loss
        and an Adam optimizer using LEARNING_RATE.
    """
    initializer = initializers.RandomNormal(mean=0.0, stddev=0.1, seed=None)

    model = Sequential()
    # Keras 2 call signature (kernel size as a tuple, `strides`, `padding`);
    # the Keras 1 spelling (`subsample`, `border_mode`) only works through the
    # legacy shim and emits a UserWarning for every layer.
    model.add(Convolution2D(32, (8, 8), strides=(4, 4), padding='same',
                            input_shape=(img_rows, img_cols, img_channels),
                            kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2), padding='same',
                            kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1), padding='same',
                            kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512, kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    # Linear output: one unbounded Q-value estimate per action.
    model.add(Dense(ACTIONS, kernel_initializer=initializer, bias_initializer='zeros'))

    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse', optimizer=adam)
    print("We finish building the model")
    return model

In [5]:
class Wrapped_Game:
    """Thin wrapper around a Gym-style game that auto-resets finished episodes.

    The wrapped object must provide ``reset()``, ``step(action)`` returning a
    4-tuple ``(observation, reward, done, info)``, ``render()`` and ``close()``.
    """

    def __init__(self, game):
        """Store ``game`` and reset it so it is ready to be stepped."""
        self.game = game
        self.game.reset()

    def step(self, action):
        """Advance the game by one action.

        When the step ends the episode the underlying game is reset
        immediately; the returned tuple is still the one produced by the
        terminal step.
        """
        ns, r, d, info = self.game.step(action)
        if d:
            self.game.reset()
        return ns, r, d, info

    def reset(self):
        """Reset the game and return its initial observation."""
        # The observation must be passed through: callers such as
        # `x_t = env.reset()` would otherwise receive None.
        return self.game.reset()

    def render(self):
        """Render the game and pass through whatever the game returns."""
        return self.game.render()

    def close(self):
        """Close the underlying game."""
        return self.game.close()

In [6]:
def copy_model(model):
    """Clone a Keras model by saving it to the file 'tmp_model' and loading it back."""
    snapshot_path = 'tmp_model'
    model.save(snapshot_path)
    clone = keras.models.load_model(snapshot_path)
    return clone

In [7]:
def clear_session(model, target_model):
    """Clear the Keras backend session while keeping both networks.

    Both models are written to fixed file names in the working directory,
    the backend session is cleared, and fresh copies are loaded from disk.

    Returns:
        A ``(model, target_model)`` tuple of the reloaded networks.
    """
    from keras import backend as K

    online_path = 'tmp_model_name_ddqn'
    target_path = 'tmp_target_model_name_ddqn'

    # Persist both networks, then drop the local references before clearing.
    model.save(online_path)
    target_model.save(target_path)
    del model, target_model

    K.clear_session()

    reloaded_online = keras.models.load_model(online_path)
    reloaded_target = keras.models.load_model(target_path)
    return reloaded_online, reloaded_target

In [8]:
def assign_linear_comb(m, tm, tau):
    """Build the soft-update assignment for one pair of variables.

    Args:
        m: source variable; must provide ``value()``.
        tm: destination variable; must provide ``value()`` and ``assign()``.
        tau: interpolation weight given to ``m``.

    Returns:
        Whatever ``tm.assign(...)`` returns for the new value
        ``tau * m + (1 - tau) * tm``. Nothing is executed here beyond
        building that assignment.
    """
    return tm.assign(m.value() * tau + (1 - tau) * tm.value())

def update_target_graph(target_model, model, tau):
    """Collect one soft-update op per weight, pairing layers and weights by position.

    Args:
        target_model: model whose weights are the destination of the updates.
        model: model whose weights are the source of the updates.
        tau: interpolation weight forwarded to ``assign_linear_comb``.

    Returns:
        A list of the ops produced by ``assign_linear_comb``, ordered by
        layer and then by weight within the layer.
    """
    ops = []
    for layer_pos, source_layer in enumerate(model.layers):
        dest_layer = target_model.layers[layer_pos]
        for weight_pos, source_weight in enumerate(source_layer.weights):
            dest_weight = dest_layer.weights[weight_pos]
            ops.append(assign_linear_comb(source_weight, dest_weight, tau))
    return ops
    
def update_target(var_assign_ops):
    """Execute the soft-update ops on the Keras backend session.

    Args:
        var_assign_ops: iterable of assignment ops, as returned by
            ``update_target_graph``.
    """
    from keras import backend as K

    ops = list(var_assign_ops)
    if not ops:
        return
    # Each op touches only its own pair of variables, so they can all be
    # fetched in a single session call instead of one round-trip per weight.
    K.get_session().run(ops)
#target_model = copy_model(model)
#var_assign_ops = update_target_graph(target_model, model, tau)
#update_target(var_assign_ops)

In [10]:
import time

# Module-level timestep counter, starting at zero.
TIMESTEP = 0

def _preprocess_frame(frame):
    """Convert a raw RGB frame to a grayscale (img_rows, img_cols) image scaled to 0..255."""
    gray = skimage.color.rgb2gray(frame)
    gray = skimage.transform.resize(gray, (img_rows, img_cols))
    return skimage.exposure.rescale_intensity(gray, out_range=(0, 255))


def _ddqn_train_step(model, target_model, minibatch):
    """Fit `model` on one minibatch of (s, a, r, s', done) transitions using Double-DQN targets."""
    states = np.concatenate([transition[0] for transition in minibatch], axis=0)
    actions = np.array([transition[1] for transition in minibatch], dtype=int)
    rewards = np.array([transition[2] for transition in minibatch], dtype=float)
    next_states = np.concatenate([transition[3] for transition in minibatch], axis=0)
    dones = np.array([transition[4] for transition in minibatch], dtype=bool)
    rows = np.arange(len(minibatch))

    # Start from the network's own predictions so that only the entry of the
    # action actually taken contributes to the loss.
    targets = np.array(model.predict(states))

    # Double DQN: Q-target = r + gamma * Q_target(s', argmax_a Q_online(s', a)).
    # The online network selects the action, the target network evaluates it.
    best_next_actions = np.argmax(model.predict(next_states), axis=1)
    next_values = target_model.predict(next_states)[rows, best_next_actions]

    # Terminal transitions do not bootstrap from the next state.
    targets[rows, actions] = np.where(dones, rewards, rewards + GAMMA * next_values)

    return model.train_on_batch(states, targets)


def train_model(model, env):
    """Train `model` on `env` with epsilon-greedy Double DQN and a soft-updated target network.

    Args:
        model: compiled Keras Q-network (see ``build_model``); trained in place.
        env: Gym-style environment whose ``reset()`` returns an RGB frame and
            whose ``step(action)`` returns ``(frame, reward, done, info)``.

    Runs NUM_EPISODES episodes of at most max_epLength steps each and prints
    one summary line per episode. The module-level TIMESTEP is kept equal to
    the total number of environment steps taken.
    """
    global TIMESTEP  # without this, the assignment below would only create a local

    target_model = build_model()
    # The target network starts as an exact copy of the online network
    # rather than from its own random initialisation.
    target_model.set_weights(model.get_weights())
    var_assign_ops = update_target_graph(target_model, model, tau)

    # Replay memory; the deque discards the oldest transition once full.
    M = deque(maxlen=REPLAY_MEMORY)

    OBSERVE = OBSERVATION
    epsilon = INITIAL_EPSILON

    t = 0

    for idxEpisode in range(NUM_EPISODES):
        # Reset the environment and build the first state by repeating the
        # first frame in every channel: shape (1, img_rows, img_cols, img_channels).
        x_t = _preprocess_frame(env.reset())
        s_t = np.stack([x_t] * img_channels, axis=2)[np.newaxis, ...]
        rAll = 0
        j = 0
        # Cap the episode at max_epLength steps.
        while j < max_epLength:
            j += 1
            # Epsilon-greedy action selection; purely random while observing.
            if np.random.rand(1) < epsilon or t < OBSERVE:
                a_t = random.randrange(ACTIONS)
            else:
                a_t = int(np.argmax(model.predict(s_t)))

            x_t1, r_t, done, _ = env.step(a_t)
            x_t1 = _preprocess_frame(x_t1).reshape((1, img_rows, img_cols, 1))
            # The newest frame goes first; the oldest frame in the stack is dropped.
            s_t1 = np.concatenate((x_t1, s_t[:, :, :, :img_channels - 1]), axis=3)

            t += 1
            TIMESTEP = t
            M.append((s_t, a_t, r_t, s_t1, done))

            if t > OBSERVE:
                # Only the annealing is gated on FINAL_EPSILON; learning keeps
                # going after epsilon has reached its floor.
                if epsilon > FINAL_EPSILON:
                    epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

                if t % update_freq == 0 and len(M) >= BATCH:
                    _ddqn_train_step(model, target_model, random.sample(M, BATCH))
                    # Move the target network toward the online network.
                    update_target(var_assign_ops)

            rAll += r_t
            s_t = s_t1

            if done:
                break

        print('episode', idxEpisode, 'length', j, 'reward', rAll, 'epsilon', epsilon)

In [11]:
# Build the Q-network that will be trained.
model = build_model()

We finish building the model


  
  
  # Remove the CWD from sys.path while we load stuff.


In [12]:
#load model
#model.load_weights('1100000_iters_breakout_deterministic_model_atari_ddqn_more_explore.h5')
#model.load_weights('backup_pong_lr-4/250000_iters_pdv4_ddqn_lr-4_tmr_100_after_500000.h5')

In [13]:
# Run the training loop on the raw Gym environment; it prints one line per episode.
train_model(model, env)

  
  
  # Remove the CWD from sys.path while we load stuff.


We finish building the model


  warn("The default mode, 'constant', will be changed to 'reflect' in "


episode 0 length 978 reward -20.0 epsilon 1
episode 1 length 903 reward -20.0 epsilon 1
episode 2 length 764 reward -21.0 epsilon 1
episode 3 length 824 reward -21.0 epsilon 1
episode 4 length 1037 reward -19.0 epsilon 1
episode 5 length 764 reward -21.0 epsilon 1
episode 6 length 871 reward -21.0 epsilon 1
episode 7 length 881 reward -21.0 epsilon 1
episode 8 length 1069 reward -20.0 epsilon 1
episode 9 length 764 reward -21.0 epsilon 1
episode 10 length 824 reward -21.0 epsilon 1
episode 11 length 1067 reward -20.0 epsilon 1
episode 12 length 842 reward -20.0 epsilon 1
episode 13 length 964 reward -20.0 epsilon 1
episode 14 length 949 reward -20.0 epsilon 1
episode 15 length 839 reward -21.0 epsilon 1
episode 16 length 852 reward -21.0 epsilon 1
episode 17 length 884 reward -21.0 epsilon 1
episode 18 length 792 reward -21.0 epsilon 1
episode 19 length 764 reward -21.0 epsilon 1
episode 20 length 886 reward -21.0 epsilon 1
episode 21 length 1069 reward -20.0 epsilon 1
episode 22 lengt

episode 141 length 889 reward -20.0 epsilon 0.8575842571395604
episode 142 length 1087 reward -20.0 epsilon 0.8560469285680963
episode 143 length 949 reward -20.0 epsilon 0.8547047714252081
episode 144 length 1022 reward -20.0 epsilon 0.8532593714251746
episode 145 length 824 reward -21.0 epsilon 0.8520939999965762
episode 146 length 852 reward -21.0 epsilon 0.8508890285679769
episode 147 length 991 reward -20.0 epsilon 0.8494874714250873
episode 148 length 889 reward -20.0 epsilon 0.8482301714250582
episode 149 length 1044 reward -20.0 epsilon 0.8467536571393097
episode 150 length 1207 reward -19.0 epsilon 0.8450466142821274
episode 151 length 879 reward -20.0 epsilon 0.8438034571392414
episode 152 length 931 reward -20.0 epsilon 0.842486757139211
episode 153 length 1039 reward -20.0 epsilon 0.8410173142820341
episode 154 length 854 reward -21.0 epsilon 0.8398095142820061
episode 155 length 856 reward -21.0 epsilon 0.8385988857105495
episode 156 length 935 reward -19.0 epsilon 0.83727

episode 271 length 764 reward -21.0 epsilon 0.6924890285643102
episode 272 length 842 reward -20.0 epsilon 0.691298199992854
episode 273 length 764 reward -21.0 epsilon 0.6902176857071147
episode 274 length 902 reward -20.0 epsilon 0.6889419999927995
episode 275 length 764 reward -21.0 epsilon 0.6878614857070602
episode 276 length 871 reward -21.0 epsilon 0.6866296428498888
episode 277 length 884 reward -21.0 epsilon 0.6853794142784313
episode 278 length 764 reward -21.0 epsilon 0.684298899992692
episode 279 length 794 reward -21.0 epsilon 0.6831759571355231
episode 280 length 764 reward -21.0 epsilon 0.6820954428497839
episode 281 length 856 reward -21.0 epsilon 0.6808848142783273
episode 282 length 830 reward -21.0 epsilon 0.6797109571354429
episode 283 length 933 reward -21.0 epsilon 0.6783914285639838
episode 284 length 870 reward -20.0 epsilon 0.6771609999925268
episode 285 length 764 reward -21.0 epsilon 0.6760804857067875
episode 286 length 930 reward -20.0 epsilon 0.67476519999

episode 402 length 919 reward -20.0 epsilon 0.5298334428462592
episode 403 length 783 reward -21.0 epsilon 0.5287260571319479
episode 404 length 843 reward -21.0 epsilon 0.5275338142747774
episode 405 length 912 reward -21.0 epsilon 0.526243985703319
episode 406 length 926 reward -20.0 epsilon 0.5249343571318601
episode 407 length 824 reward -21.0 epsilon 0.5237689857032617
episode 408 length 792 reward -21.0 epsilon 0.5226488714175215
episode 409 length 926 reward -20.0 epsilon 0.5213392428460626
episode 410 length 782 reward -21.0 epsilon 0.5202332714174656
episode 411 length 1024 reward -20.0 epsilon 0.5187850428460035
episode 412 length 948 reward -21.0 epsilon 0.5174442999888296
episode 413 length 820 reward -21.0 epsilon 0.5162845857030884
episode 414 length 895 reward -20.0 epsilon 0.5150187999887734
episode 415 length 842 reward -20.0 epsilon 0.5138279714173173
episode 416 length 822 reward -21.0 epsilon 0.5126654285601475
episode 417 length 898 reward -20.0 epsilon 0.511395399

KeyboardInterrupt: 

# NOTE(review): test_model is not defined in the visible part of this notebook --
# confirm it exists (presumably an evaluation routine) before running this cell.
test_model(model, Wrapped_Game(env))