In [5]:
from __future__ import division 

import gym
import numpy as np
import random
import tensorflow as tf 
import tensorflow.contrib.slim as slim
import scipy.misc
import os

In [7]:
class DQN():
    """Dueling Q-network expressed with the TensorFlow 1.x graph API.

    The network flattens a stack of ``numFrames`` state vectors, passes it
    through three ReLU layers (no biases) and splits the last hidden layer
    into an advantage stream and a value stream that are recombined into
    Q-values.

    Args:
        netName: variable-scope name; must be unique per network in a graph.
        inputDim: size of a single state vector.
        outputDim: number of actions (width of the Q-value output).
        hiddenDim: width of every hidden layer; must be even because the last
            hidden layer is split in two halves.
        numFrames: number of consecutive state vectors stacked per input.
        learningRate: learning rate handed to the Adam optimizer.

    Attributes created by ``build_network``: ``state``, ``Qout``,
    ``predictOp``, ``targetQ``, ``actions``, ``loss``, ``updateModel``.
    """

    def __init__(self, netName, inputDim, outputDim, hiddenDim, numFrames, learningRate):
        self.netName = netName
        self.inputDim = inputDim
        self.outputDim = outputDim
        self.hiddenDim = hiddenDim
        self.numFrames = numFrames
        self.learningRate = learningRate
        # The session is only known once build_network() is called.
        self.sess = None

    def build_network(self, session):
        """Create the graph nodes of this network inside its own variable scope.

        Args:
            session: the ``tf.Session`` later used by ``predict``/``update``.

        Raises:
            ValueError: if ``hiddenDim`` is odd (the dueling split needs two
                equally sized halves).
        """
        if self.hiddenDim % 2 != 0:
            raise ValueError("hiddenDim must be even, got %r" % (self.hiddenDim,))

        self.sess = session
        flatDim = self.numFrames * self.inputDim
        streamDim = self.hiddenDim // 2

        with tf.variable_scope(self.netName):
            self.state = tf.placeholder(shape=[None, self.numFrames, self.inputDim], dtype=tf.float32)
            self.inputState = tf.reshape(self.state, [-1, flatDim])

            # Weights of each layer. Every variable needs its own name inside
            # the scope, otherwise tf.get_variable refuses to create it.
            self.W = {
                'W1': self.init_weight("W1", [flatDim, self.hiddenDim]),
                'W2': self.init_weight("W2", [self.hiddenDim, self.hiddenDim]),
                'W3': self.init_weight("W3", [self.hiddenDim, self.hiddenDim]),
                # Advantage head: one column per action.
                'AW': self.init_weight("AW", [streamDim, self.outputDim]),
                # Value head: a single scalar per sample.
                'VM': self.init_weight("VM", [streamDim, 1])
            }

            self.hidden1 = tf.nn.relu(tf.matmul(self.inputState, self.W['W1']))
            self.hidden2 = tf.nn.relu(tf.matmul(self.hidden1, self.W['W2']))
            self.hidden3 = tf.nn.relu(tf.matmul(self.hidden2, self.W['W3']))

            # Optional dropout (disabled). Note that the second argument of
            # tf.nn.dropout in TF 1.x is the *keep* probability.
            # self.dropProb = 0.0
            # self.hidden1 = tf.nn.dropout(self.hidden1, self.dropProb)
            # self.hidden2 = tf.nn.dropout(self.hidden2, self.dropProb)
            # self.hidden3 = tf.nn.dropout(self.hidden3, self.dropProb)

            # Compute the Advantage, Value, and total Q value
            self.A, self.V = tf.split(self.hidden3, 2, 1)
            self.Advantage = tf.matmul(self.A, self.W['AW'])
            self.Value = tf.matmul(self.V, self.W['VM'])
            self.Qout = self.Value + tf.subtract(
                self.Advantage, tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))

            # Index of the action with the highest Q value. Stored as
            # `predictOp` so it does not shadow the `predict` method.
            self.predictOp = tf.argmax(self.Qout, 1)

            # Loss: mean squared difference between the target Q value and the
            # Q value of the action that was actually taken.
            self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actionsOneHot = tf.one_hot(self.actions, self.outputDim, dtype=tf.float32)

            self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actionsOneHot), axis=1)
            self.tdError = tf.square(self.targetQ - self.Q)
            self.loss = tf.reduce_mean(self.tdError)

            self.trainer = tf.train.AdamOptimizer(self.learningRate)
            self.updateModel = self.trainer.minimize(self.loss)

    def init_weight(self, name, shape):
        """Return a new Xavier-initialised variable called `name` with `shape`."""
        return tf.get_variable(name=name, shape=shape,
                               initializer=tf.contrib.layers.xavier_initializer())

    def predict(self, state):
        """Return the greedy action index for a single stacked state.

        Args:
            state: array-like holding ``numFrames * inputDim`` values; it is
                reshaped to a batch of one before being fed.
        """
        batch = np.reshape(state, [1, self.numFrames, self.inputDim])
        return self.sess.run(self.predictOp, feed_dict={self.state: batch})[0]

    def update(self, stateBatch, targetQ, actionBatch):
        """Run one optimizer step and return the result of the train op.

        Args:
            stateBatch: array-like of stacked states; reshaped to
                ``[-1, numFrames, inputDim]`` so stacked or vstacked batches
                are both accepted.
            targetQ: target Q value per sample.
            actionBatch: index of the action taken per sample.
        """
        states = np.reshape(stateBatch, [-1, self.numFrames, self.inputDim])
        feed = {self.state: states, self.targetQ: targetQ, self.actions: actionBatch}
        return self.sess.run(self.updateModel, feed_dict=feed)


In [8]:
class Agent():
    """Double-DQN agent that trains a main network against a slowly updated target network.

    Args:
        stateSize: size of a single state vector.
        actionSize: number of discrete actions.
        numFrames: number of consecutive state vectors stacked per input.
        batchSize: number of transitions sampled per weight update.
        hSize: width of the hidden layers of both networks.
        learningRate: learning rate of both networks' optimizers.
        batchAccumulator: stored as-is; not used by the code in this class.
        updateFreq: a weight update is run every `updateFreq` environment steps.
        y: discount factor applied to the target Q values.
        numEpisodes: number of training episodes.
        preTrainSteps: number of purely random steps taken before learning starts.
        maxEpLength: maximum number of steps per episode.
        tau: interpolation rate of the soft target-network update.
        lode_model: (historical spelling, kept for compatibility) restore the
            latest checkpoint before training.
        ckptPath: directory used for checkpoints.
        load_model: correctly spelled alias of `lode_model`.
        env: environment object exposing `reset()` and `step(action)`. When
            omitted, `train` falls back to a module-level variable named `env`.
    """

    def __init__(self, stateSize = 10, actionSize = 6, numFrames = 4,
                 batchSize = 32, hSize = 256, learningRate = 0.001,
                 batchAccumulator = 1, updateFreq = 5, y = 0.99,
                 numEpisodes = 10000, preTrainSteps = 10000,
                 maxEpLength = 500, tau = 0.001, lode_model = False, ckptPath = "./checkpoints",
                 load_model = False, env = None):

        self.stateSize = stateSize # Size of the state vector
        self.actionSize = actionSize # Number of actions
        self.numFrames = numFrames # Number of consecutive state frames
        self.batchSize = batchSize # Size of the experience sample
        self.batchAccumulator = batchAccumulator # Operation for reward-over-time calculation
        self.hSize = hSize # Size of the hidden layers
        self.learningRate = learningRate # Optimizer learning rate
        self.updateFreq = updateFreq # Frequency of weight updates
        self.y = y # Discount factor on the target Q-values
        self.numEpisodes = numEpisodes # Number of environment episodes in which we train
        self.preTrainSteps = preTrainSteps # Random steps before training begins
        self.maxEpLength = maxEpLength # Maximum steps per episode
        self.tau = tau # Soft-update rate of the target network
        self.loadModel = bool(lode_model or load_model)
        self.ckptPath = ckptPath
        self.env = env

        tf.reset_default_graph()

        self.mainQN = DQN("mainQN", self.stateSize, self.actionSize,
                          self.hSize, self.numFrames, self.learningRate)

        self.targetQN = DQN("targetQN", self.stateSize, self.actionSize,
                            self.hSize, self.numFrames, self.learningRate)

        self.exp = Experience()
        self.stepRecord = []
        self.rewardRecord = []
        self.totalSteps = 0

        # These depend on the graph variables, which only exist after the
        # networks are built in train(); they are created there.
        self.init = None
        self.saver = None
        self.targetOps = None

    def update_target_graph(self, tfVars, tau):
        """Build the ops that move the target network towards the main network.

        `tfVars` is expected to hold the main network's variables in its first
        half and the matching target network's variables in its second half
        (this is the creation order used by `train`).

        Returns:
            list of assign ops computing ``target = tau*main + (1-tau)*target``.
        """
        half = len(tfVars) // 2
        op_holder = []
        for mainVar, targetVar in zip(tfVars[:half], tfVars[half:2 * half]):
            blended = (mainVar.value() * tau) + ((1 - tau) * targetVar.value())
            op_holder.append(targetVar.assign(blended))
        return op_holder

    def update_target(self, opHolder, sess):
        """Run every assign op of `opHolder` in `sess`."""
        if opHolder:
            sess.run(opHolder)

    def _stack_states(self, states):
        """Turn a column of per-transition states into a [batch, numFrames, stateSize] array."""
        return np.reshape(np.stack(list(states)), [-1, self.numFrames, self.stateSize])

    @staticmethod
    def _make_transition(s, a, r, s1, d):
        """Pack one transition into a (1, 5) object array without NumPy trying to broadcast it."""
        transition = np.empty((1, 5), dtype=object)
        for col, item in enumerate((s, a, r, s1, d)):
            transition[0, col] = item
        return transition

    def _resolve_env(self):
        """Return the environment to train on, or raise if none is available."""
        env = getattr(self, "env", None)
        if env is None:
            # Backward compatible with the original notebook, which read a global `env`.
            env = globals().get("env")
        if env is None:
            raise RuntimeError("No environment available: pass env=... to Agent "
                               "or define a module-level `env`.")
        return env

    # originally, name is 'get_q'
    def update_weight(self, sess):
        """Sample a batch and perform one Double-DQN update of the main network.

        The main network chooses the best next action, the target network
        supplies that action's value. Returns whatever `mainQN.update` returns,
        or None when the replay buffer holds fewer than `batchSize` items.
        """
        if len(self.exp.buffer) < self.batchSize:
            return None

        trainBatch = self.exp.sample(self.batchSize)
        self.trainBatch = trainBatch

        # Columns: 0=state, 1=action, 2=reward, 3=next state, 4=done flag.
        states = self._stack_states(trainBatch[:, 0])
        nextStates = self._stack_states(trainBatch[:, 3])
        actions = trainBatch[:, 1].astype(np.int32)
        rewards = trainBatch[:, 2].astype(np.float32)
        # 1 for non-terminal transitions, 0 for terminal ones.
        notDone = 1.0 - trainBatch[:, 4].astype(np.float32)

        mainQ = sess.run(self.mainQN.Qout, feed_dict={self.mainQN.state: nextStates})
        targetNextQ = sess.run(self.targetQN.Qout, feed_dict={self.targetQN.state: nextStates})
        bestActions = np.argmax(mainQ, axis=1)
        doubleQ = targetNextQ[np.arange(len(bestActions)), bestActions]

        targetQ = (rewards + self.y * doubleQ * notDone).astype(np.float32)
        return self.mainQN.update(states, targetQ, actions)

    def exploit_exploration(self, e):
        """Return the exploration rate `e` lowered by one fixed annealing step."""
        stepDrop = 1.0 / 10000
        return e - stepDrop

    def train(self):
        """Run the full training loop and save checkpoints along the way.

        Raises:
            RuntimeError: if no environment is available.
        """
        env = self._resolve_env()

        with tf.Session() as sess:
            self.mainQN.build_network(sess)
            self.targetQN.build_network(sess)

            # Everything below needs the variables created by build_network.
            trainables = tf.trainable_variables()
            self.targetOps = self.update_target_graph(trainables, self.tau)
            hardSyncOps = self.update_target_graph(trainables, 1.0)
            self.init = tf.global_variables_initializer()
            self.saver = ModelSaver(self.ckptPath)

            sess.run(self.init)

            if self.loadModel:
                print('Loading Model...')
                self.saver.restore_model(sess)

            # Set the target network to be equal to the primary network
            # (tau = 1 copies the weights instead of blending them).
            self.update_target(hardSyncOps, sess)

            e = 1.
            endE = 0.1

            for episode in range(self.numEpisodes):
                if episode % 100 == 0:
                    print("\n=====" + "Episode " + str(episode) + "start =====")

                episodeBuffer = Experience()
                # Reset environment and get first new observation
                s = env.reset()
                d = False
                episodeRewordSum = 0
                episodeSteps = 0

                # End the episode once maxEpLength steps have been taken.
                while episodeSteps < self.maxEpLength:
                    episodeSteps += 1
                    # Epsilon-greedy action; purely random during pre-training.
                    if np.random.rand() < e or self.totalSteps < self.preTrainSteps:
                        a = np.random.randint(0, self.actionSize)
                    else:
                        a = self.mainQN.predict(s)

                    # Only the first three values are used, so environments
                    # returning extra items (e.g. an info dict) also work.
                    s1, r, d = env.step(a)[:3]
                    self.totalSteps += 1
                    # Save the experience to our episode buffer.
                    episodeBuffer.add(self._make_transition(s, a, r, s1, d))

                    if self.totalSteps > self.preTrainSteps:
                        if e > endE:
                            e = self.exploit_exploration(e)

                        if self.totalSteps % (self.updateFreq) == 0:
                            # Update the main network, then nudge the target network towards it.
                            self.update_weight(sess)
                            self.update_target(self.targetOps, sess)

                    episodeRewordSum += r
                    s = s1

                    if d == True:
                        break

                self.exp.add(episodeBuffer.buffer)
                self.stepRecord.append(episodeSteps)
                self.rewardRecord.append(episodeRewordSum)

                # Periodically save the model.
                if episode % 1000 == 0:
                    self.saver.save_model(sess)
                    print("Saved Model")
                if len(self.rewardRecord) % 10 == 0:
                    print(self.totalSteps, np.mean(self.rewardRecord[-10:]), e)

            self.saver.save_model(sess)
            print("Percent of succesful episodes: " + str(sum(self.rewardRecord)/max(self.numEpisodes, 1)) + "%")

   # def test(self):


In [9]:
class Experience():
    """Fixed-capacity FIFO replay buffer.

    Args:
        buffer_size: maximum number of items kept; the oldest items are
            discarded first.
    """

    def __init__(self, buffer_size = 50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append every item of `experience` and drop the oldest items on overflow.

        `experience` is any iterable of items (for example a list of
        transitions or a 2-D array whose rows are transitions). Trimming is
        done after appending so the buffer never exceeds `buffer_size`, even
        when a single call carries more items than the capacity.
        """
        self.buffer.extend(experience)
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, size):
        """Return `size` distinct random items as an array of shape [size, 5].

        Raises:
            ValueError: if `size` is larger than the number of stored items
                (raised by `random.sample`).
        """
        picked = random.sample(self.buffer, size)
        return np.reshape(np.array(picked), [size, 5])

In [10]:
class ModelSaver():
    """Thin wrapper around `tf.train.Saver` bound to one checkpoint directory.

    The saver is created over the variables that exist in the default graph
    at construction time, so instantiate it after the networks are built.

    Args:
        path: checkpoint directory; created if it does not exist.
    """

    def __init__(self, path):
        self.saver = tf.train.Saver()
        self.ckptPath = path
        # Number of checkpoints written so far; used to name unnamed saves.
        self.saveCount = 0
        if not os.path.exists(path):
            os.makedirs(path)

    def restore_model(self, sess):
        """Restore the most recent checkpoint of the directory into `sess`.

        Raises:
            IOError: if the directory holds no checkpoint.
        """
        ckpt = tf.train.get_checkpoint_state(self.ckptPath)
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise IOError("No checkpoint found in '%s'" % (self.ckptPath,))
        self.saver.restore(sess, ckpt.model_checkpoint_path)

    def save_model(self, sess, step=None):
        """Write a checkpoint named ``model-<step>.cptk`` and return the saver's result.

        Args:
            sess: session whose variables are saved.
            step: label embedded in the file name; defaults to the number of
                checkpoints written so far by this object.
        """
        if step is None:
            step = self.saveCount
        self.saveCount += 1
        target = os.path.join(self.ckptPath, 'model-' + str(step) + '.cptk')
        return self.saver.save(sess, target)

    # Historical misspelling kept so existing callers keep working.
    svae_model = save_model

In [None]:
def main():
    """Create an agent with its default hyper-parameters, train it, then evaluate it if possible."""
    # TODO: pass hyper-parameters as keyword arguments here (e.g. stateSize=...,
    # actionSize=...). The former placeholder string was being passed as stateSize.
    agent = Agent()
    agent.train()

    # Agent.test is only a commented-out stub in this notebook, so call it
    # only once it actually exists.
    evaluate = getattr(agent, "test", None)
    if callable(evaluate):
        evaluate()

if __name__ == '__main__':
    main()