In [None]:
from __future__ import division

import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import cv2
%matplotlib inline

# Create the Space Invaders environment (shared by every cell below) and reset
# it before the first env.step() call.
env = gym.make('SpaceInvaders-v0')
env.reset()

In [None]:
def preprocess(observation):
    """Turn a raw colour game frame into a binarised 84 x 84 x 1 image.

    Steps: resize to 84 wide x 110 high, convert to grayscale, crop away the
    top 26 rows, then threshold so every pixel is either 0 or 255.

    Args:
        observation: Colour frame as returned by ``env.step`` / ``env.reset``
            (height x width x 3 array in RGB channel order).

    Returns:
        Array of shape ``(84, 84, 1)`` containing only the values 0 and 255.
    """
    # cv2.resize takes dsize as (width, height).
    resized = cv2.resize(observation, (84, 110))
    # Gym renders frames in RGB order, so use RGB2GRAY; BGR2GRAY would swap
    # the luminance weights of the red and blue channels.
    gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
    # Removing the first 26 rows as they only contain the score.
    cropped = gray[26:110, :]
    # Any pixel brighter than 1 becomes 255, everything else 0.
    _, binary = cv2.threshold(cropped, 1, 255, cv2.THRESH_BINARY)
    return np.reshape(binary, (84, 84, 1))


# Sanity check: take one environment step and display the frame before and
# after preprocess() to confirm the shapes and the crop look right.
action0 = 0  # do nothing
observation0, reward0, terminal, info = env.step(action0)
print("Before processing: " + str(np.array(observation0).shape))
plt.imshow(np.array(observation0))
plt.show()
# observation0 is rebound to the preprocessed (84, 84, 1) frame from here on.
observation0 = preprocess(observation0)
print("After processing: " + str(np.array(observation0).shape))
# np.squeeze drops the trailing single-channel axis so imshow gets a 2-D image.
plt.imshow(np.array(np.squeeze(observation0)))
plt.show()

In [None]:
class Double_DQN():
    """Primary/target pair of convolutional Q-networks for Double DQN.

    Each network maps a batch of preprocessed ``84 x 84 x 1`` frames (the
    shape produced by ``preprocess``) to ``output_size`` Q-values, one per
    discrete action.

    Attributes:
        output_size: Number of Q-values (discrete actions) each network emits.
        primary_network: Handle of the online network (the one to train).
        target_network: Handle of the target network.

    Every network handle exposes ``input_layer``, ``out``, ``predict``,
    ``targetQ``, ``actions``, ``actions_onehot``, ``Q``, ``loss``,
    ``train_op`` and ``variables``. The primary network's tensors are also
    mirrored directly on the agent (``agent.predict`` etc.).

    Must be instantiated inside the ``tf.Graph`` the networks should live in.
    """

    class _Network(object):
        """Plain attribute container holding one Q-network's tensors and ops."""

    # Attributes build_model() sets on every handle; mirrored on the agent too.
    _NETWORK_FIELDS = ("input_layer", "out", "predict", "targetQ", "actions",
                       "actions_onehot", "Q", "loss", "train_op")

    def __init__(self,
                 output_size):
        self.output_size = output_size

        self.build_networks()

    def build_networks(self):
        """Build the primary network first, then the target network."""
        self.primary_network = self.build_model()
        self.target_network = self.build_model()

        # Expose the primary network's tensors directly on the agent.
        for field in self._NETWORK_FIELDS:
            setattr(self, field, getattr(self.primary_network, field))

    def build_model(self):
        """Create one independent Q-network and return its handle.

        Returns:
            A ``_Network`` whose attributes reference the placeholders,
            Q-value output, greedy action, loss and training op of the newly
            created network.
        """
        net = self._Network()
        # Remember how many trainable variables exist so this network's own
        # variables can be sliced out afterwards.
        first_new_variable = len(tf.trainable_variables())

        # dtype is the first positional argument of tf.placeholder; the shape
        # must be passed by keyword. One channel, matching preprocess().
        net.input_layer = tf.placeholder(tf.float32, shape=[None, 84, 84, 1])

        # Spatial size with "valid" padding: 84 -> 20 -> 9 -> 7.
        hidden = net.input_layer
        for filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
            hidden = tf.layers.conv2d(
                inputs=hidden,
                filters=filters,
                kernel_size=[kernel, kernel],
                strides=[stride, stride],
                padding="valid",
                activation=tf.nn.relu)

        # The 7x7 kernel covers the whole 7x7 map, giving [batch, 1, 1, output_size].
        # No activation: Q-values are unbounded, and a ReLU here can go dead.
        conv4 = tf.layers.conv2d(
            inputs=hidden,
            filters=self.output_size,
            kernel_size=[7, 7],
            strides=[1, 1],
            padding="valid",
            activation=None)

        # NOTE: no dueling (advantage/value) head is implemented; the final
        # convolution yields the Q-values directly.
        net.out = tf.reshape(conv4, [-1, self.output_size])
        net.predict = tf.argmax(net.out, 1)
        net.variables = tf.trainable_variables()[first_new_variable:]

        #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
        net.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        net.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        net.actions_onehot = tf.one_hot(net.actions, self.output_size, dtype=tf.float32)

        # Select the Q-value of the action that was actually taken.
        net.Q = tf.reduce_sum(tf.multiply(net.out, net.actions_onehot), axis=1)
        net.loss = tf.reduce_mean(tf.square(net.targetQ - net.Q))
        net.train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(net.loss)
        return net

In [None]:
# --- Hyper-parameters ---------------------------------------------------------
batch_size = 32 #How many experiences to use for each training step.
update_freq = 4 #How often (in environment steps) to perform a training step.
gamma = .99 #Discount factor on the target Q-values

num_episodes = 10000 #How many episodes of game environment to train network with.
pre_train_steps = 10000 #How many steps of random actions before training begins.
max_episode_len = 50 #The max allowed length of our episode.
path = "../dqn_models" #The path to save our model to.
num_actions = env.action_space.n #Number of discrete actions the environment accepts.
output_dim = num_actions #The networks emit one Q-value per action.
tau = 0.001 #Rate to update target network toward primary network

startE = 1 #Starting chance of random action
endE = 0.1 #Final chance of random action
annealing_steps = 10000. #How many steps of training to reduce startE to endE.
#Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE)/annealing_steps


# --- Helpers ------------------------------------------------------------------
class experience_buffer():
    """Bounded FIFO store of ``(s, a, r, s1, d)`` transitions."""

    def __init__(self, buffer_size=50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append a list of transitions, discarding the oldest on overflow."""
        self.buffer.extend(experience)
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, size):
        """Return ``size`` random transitions as five column arrays.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where ``dones``
            is 1.0 for terminal transitions and 0.0 otherwise.
        """
        batch = random.sample(self.buffer, size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.stack(states),
                np.array(actions, dtype=np.int32),
                np.array(rewards, dtype=np.float32),
                np.stack(next_states),
                np.array(dones, dtype=np.float32))


def updateTargetGraph(tfVars, tau):
    """Build ops that move each target variable a fraction ``tau`` toward its primary twin.

    ``tfVars`` must list the primary network's variables first and the target
    network's variables second, in matching order.
    """
    half = len(tfVars) // 2
    op_holder = []
    for primary_var, target_var in zip(tfVars[:half], tfVars[half:]):
        op_holder.append(target_var.assign(
            tau * primary_var.value() + (1 - tau) * target_var.value()))
    return op_holder


def updateTarget(op_holder, sess):
    """Run every target-update op once."""
    for op in op_holder:
        sess.run(op)


# --- Training -----------------------------------------------------------------
#create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

g = tf.Graph()
with g.as_default():
    agent = Double_DQN(output_dim)
    mainQN = agent.primary_network
    targetQN = agent.target_network

    # The update ops must be created inside the graph, after both networks
    # exist. The primary network is built first, so its variables form the
    # first half of the trainable-variable list.
    trainables = tf.trainable_variables()
    targetOps = updateTargetGraph(trainables, tau)
    myBuffer = experience_buffer()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        for i in range(num_episodes):
            episodeBuffer = experience_buffer()
            #Reset environment and get first new observation
            s = env.reset()
            s = preprocess(s)
            d = False
            rAll = 0
            j = 0
            #The Q-Network
            while j < max_episode_len: #Cut the episode off after max_episode_len steps.
                j+=1
                #Choose an action by greedily (with e chance of random action) from the Q-network
                if np.random.rand() < e or total_steps < pre_train_steps:
                    a = np.random.randint(0, num_actions)
                else:
                    a = int(sess.run(mainQN.predict, feed_dict={mainQN.input_layer: [s]})[0])
                # env.step returns (observation, reward, done, info).
                s1, r, d, _ = env.step(a)
                s1 = preprocess(s1)
                total_steps += 1
                episodeBuffer.add([(s, a, r, s1, d)]) #Save the experience to our episode buffer.

                if total_steps > pre_train_steps:
                    if e > endE:
                        e -= stepDrop

                    # Only train once the replay buffer can supply a full batch.
                    if total_steps % (update_freq) == 0 and len(myBuffer.buffer) >= batch_size:
                        #Get a random batch of experiences.
                        states, actions, rewards, next_states, dones = myBuffer.sample(batch_size)
                        #Below we perform the Double-DQN update to the target Q-values:
                        #the primary network picks the next action, the target network scores it.
                        Q1 = sess.run(mainQN.predict, feed_dict={mainQN.input_layer: next_states})
                        Q2 = sess.run(targetQN.out, feed_dict={targetQN.input_layer: next_states})
                        end_multiplier = 1.0 - dones #Zero out the bootstrap term on terminal steps.
                        doubleQ = Q2[np.arange(batch_size), Q1]
                        targetQ = rewards + (gamma * doubleQ * end_multiplier)
                        #Update the network with our target values.
                        _ = sess.run(mainQN.train_op,
                                     feed_dict={mainQN.input_layer: states,
                                                mainQN.targetQ: targetQ,
                                                mainQN.actions: actions})

                        updateTarget(targetOps, sess) #Update the target network toward the primary network.
                rAll += r
                s = s1

                if d:
                    break

            myBuffer.add(episodeBuffer.buffer)
            jList.append(j)
            rList.append(rAll)
            #Periodically save the model.
            if i % 1000 == 0:
                saver.save(sess,path+'/model-'+str(i)+'.ckpt')
                print("Saved Model")
            if len(rList) % 10 == 0:
                print(total_steps,np.mean(rList[-10:]), e)
        saver.save(sess,path+'/model-'+str(i)+'.ckpt')
    print("Mean reward per episode: " + str(sum(rList)/num_episodes))