# Deep Recurrent Q-Network for VizDoom


In [1]:
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim
%matplotlib inline

from helper2 import *

### Load the game environment

In [2]:
from vizdoom import *
a_size = 3 # Agent can move Left, Right, or Fire
image_size = 84

use_other_buffers = False #Wheter we use the depth buffer and label buffer instead of the screen buffer
use_RGB = False #Whether we use GRB or black and white
if use_other_buffers == True:
    image_chls = 2
else:
    if use_RGB == True:
        image_chls = 3
    else:
        image_chls = 1

#The Below code is related to setting up the Doom environment
game = DoomGame()
game.set_doom_scenario_path("defend_the_center.wad")  #This corresponds to the simple task we will pose our agent
game.load_config("defend_the_center.cfg")
game.set_doom_map("map01")
game.set_screen_resolution(ScreenResolution.RES_160X120)

if use_RGB == True:
    game.set_screen_format(ScreenFormat.RGB8)
else:
    game.set_screen_format(ScreenFormat.GRAY8)
    
game.set_render_hud(False)
game.set_render_crosshair(False)
game.set_render_weapon(True)
game.set_render_decals(False)
game.set_render_particles(False)

#Enable other usefull buffers for test purpose
game.set_depth_buffer_enabled(True)
game.set_automap_buffer_enabled(True)
game.set_labels_buffer_enabled(True)

game.add_available_button(Button.TURN_LEFT)
game.add_available_button(Button.TURN_RIGHT)
game.add_available_button(Button.ATTACK)
actions_list = np.identity(a_size,dtype=bool).tolist()
print(actions_list)

game.add_available_game_variable(GameVariable.AMMO2)
game.add_available_game_variable(GameVariable.POSITION_X)
game.add_available_game_variable(GameVariable.POSITION_Y)
game.set_episode_timeout(300)
game.set_episode_start_time(0)
game.set_window_visible(False)
game.set_sound_enabled(False)
#game.set_living_reward(-1)
game.set_mode(Mode.PLAYER)
game.init()

#End Doom set-up

env = game

[[True, False, False], [False, True, False], [False, False, True]]


### Implementing the network itself

In [3]:
class Qnetwork():
    def __init__(self,h_size,rnn_cell,myScope):
        #The network recieves a frame from the game, flattened into an array.
        #It then resizes it and processes it through four convolutional layers.
        
            
        self.scalarInput =  tf.placeholder(shape=[None,image_size * image_size * image_chls],dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,image_size,image_size,image_chls])
        self.conv1 = slim.convolution2d( \
            inputs=self.imageIn,num_outputs=32,\
            kernel_size=[8,8],stride=[4,4],padding='VALID', \
            biases_initializer=None,scope=myScope+'_conv1')
        self.conv2 = slim.convolution2d( \
            inputs=self.conv1,num_outputs=64,\
            kernel_size=[4,4],stride=[2,2],padding='VALID', \
            biases_initializer=None,scope=myScope+'_conv2')
        self.conv3 = slim.convolution2d( \
            inputs=self.conv2,num_outputs=64,\
            kernel_size=[3,3],stride=[1,1],padding='VALID', \
            biases_initializer=None,scope=myScope+'_conv3')
        self.conv4 = slim.convolution2d( \
            inputs=self.conv3,num_outputs=h_size,\
            kernel_size=[7,7],stride=[1,1],padding='VALID', \
            biases_initializer=None,scope=myScope+'_conv4')
        
        self.trainLength = tf.placeholder(dtype=tf.int32)
        #We take the output from the final convolutional layer and send it to a recurrent layer.
        #The input must be reshaped into [batch x trace x units] for rnn processing, 
        #and then returned to [batch x units] when sent through the upper levles.
        self.batch_size = tf.placeholder(dtype=tf.int32,shape=[])
        self.convFlat = tf.reshape(slim.flatten(self.conv4),[self.batch_size,self.trainLength,h_size])
        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
        self.rnn,self.rnn_state = tf.nn.dynamic_rnn(\
                inputs=self.convFlat,cell=rnn_cell,dtype=tf.float32,initial_state=self.state_in,scope=myScope+'_rnn')
        self.rnn = tf.reshape(self.rnn,shape=[-1,h_size])
        #The output from the recurrent player is then split into separate Value and Advantage streams
        self.streamA,self.streamV = tf.split(self.rnn,2,1)
        self.AW = tf.Variable(tf.random_normal([h_size//2,a_size]))
        self.VW = tf.Variable(tf.random_normal([h_size//2,1]))
        self.Advantage = tf.matmul(self.streamA,self.AW)
        self.Value = tf.matmul(self.streamV,self.VW)
        
        self.salience = tf.gradients(self.Advantage,self.imageIn)
        #Then combine them together to get our final Q-values.
        self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True))
        self.predict = tf.argmax(self.Qout,1)
        
        #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
        
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
        
        self.td_error = tf.square(self.targetQ - self.Q)
        
        #In order to only propogate accurate gradients through the network, we will mask the first
        #half of the losses for each trace as per Lample & Chatlot 2016
        self.maskA = tf.zeros([self.batch_size,self.trainLength//2])
        self.maskB = tf.ones([self.batch_size,self.trainLength//2])
        self.mask = tf.concat([self.maskA,self.maskB],1)
        self.mask = tf.reshape(self.mask,[-1])
        self.loss = tf.reduce_mean(self.td_error * self.mask)
        
        #Learning rate is hard-coded to 0.0001
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)

### Experience Replay

These classes allow us to store experies and sample then randomly to train the network.
Episode buffer stores experiences for each individal episode.
Experience buffer stores entire episodes of experience, and sample() allows us to get training batches needed from the network.

In [263]:
class experience_buffer():
    def __init__(self, trace_length = 8, buffer_size = 1000):
        self.buffer = []
        self.buffer_size = buffer_size
        self.trace_length = trace_length

        #Initialize counters and buffers for prioritixed replay
        self.episode_index = 0
        self.alpha0 = 0.5 #Start-value of alpha, the prioritized replay probability exponent. Annealing is linear to 0.
        self.alpha = self.alpha0
        self.exp_prio_tuples = []
        
    def add(self,episode):
        #Compute the sampling priority of this episode in episode replay and update the sum of priorities
        episode = np.reshape(np.array(episode),[len(episode),6])
        self.td_error = episode[:, 5]
        self.priority = np.absolute(self.td_error) + 1e-9 #proportionnal priority
        #Append episode to the priority replay tuple list
        #Every experience in the episode has a tuple of the form:
        #(episode_index, experience_index, priority)
        for experience_index in range(self.trace_length-1, len(episode)-1):
            self.exp_prio_tuples.append((self.episode_index, experience_index, float(self.priority[experience_index])))
        self.episode_index += 1
        #Need something to avoid exp_prio_tuples to grow infinitly
        
        if len(self.buffer) + 1 >= self.buffer_size:
            self.buffer[0:(1+len(self.buffer))-self.buffer_size] = []
        self.buffer.append(episode)
            
    def sample(self,batch_size):
        #Ramdomly select a number of episodes egual to batch_size 
        sampled_episodes = random.sample(self.buffer,batch_size)
        #Within the selected episodes, randomly select an experience trace of length trace_length
        sampledTraces = []
        for episode in sampled_episodes:
            point = np.random.randint(0,len(episode)+1-self.trace_length)
            sampledTraces.append(episode[point:point+self.trace_length])
        sampledTraces = np.array(sampledTraces)
        
        return np.reshape(sampledTraces,[batch_size*self.trace_length,6])
    
    def PRsample(self,batch_size):
        #alpha annealing
        self.alpha = self.alpha0 - (self.episode_index * self.alpha0/num_episodes)
        #Compute the sampling probability distribution

        
        priorities_poweralpha = np.power([tup[2] for tup in self.exp_prio_tuples],myBuffer.alpha)
        sum_priorities_poweralpha = np.sum(priorities_poweralpha)
        sampling_probabilities = np.divide(priorities_poweralpha, sum_priorities_poweralpha)
        #Sample episodes using the computed distribution
        sampled_indexes = np.random.choice(len(self.exp_prio_tuples), batch_size, p = sampling_probabilities)
        sampled_tuples = [self.exp_prio_tuples[idx] for idx in sampled_indexes]
        ep_idx = [tup[0] for tup in sampled_tuples]
        print(ep_idx)
        exp_idx = [tup[1] for tup in sampled_tuples]
        print(exp_idx)
        sampledTraces = []
        for i in range(0,batch_size):
            sampled_ep = myBuffer.buffer[ep_idx[i]]
            print('len(sampled_ep) =' + str(len(sampled_ep)))
            sampled_ep = np.reshape(np.array(sampled_ep),[len(sampled_ep),6])
            sampled_trace = sampled_ep[exp_idx[i]+1-self.trace_length:exp_idx[i]+1]
            sampledTraces.append(sampled_trace)
        sampledTraces = np.array(sampledTraces)
        print(sampledTraces.shape)
        return np.reshape(sampledTraces,[batch_size*self.trace_length,6])
    
    def save(self, path2mdl):
        #Save only last 40 experiences in buffer otherwise ridiculously large file
        np.save(path2mdl + '/experienceBuffer.npy', self.buffer[-40:])
    
    def load(self, path2mdl):
        self.buffer = list(np.load(path2mdl + '/experienceBuffer.npy'))

### Training the network

In [264]:
#Setting the training parameters
batch_size = 4 #How many experience traces to use for each training step.
trace_length = 8 #How long each experience trace will be when training
update_freq = 5 #How often to perform a training step.
y = .99 #Discount factor on the target Q-values
startE = 1 #Starting chance of random action
endE = 0.1 #Final chance of random action

prioritized_replay = True
load_model = False #Whether to load a saved model.
if load_model == True:
    last_saved_ep = 3000 #This parameter has to be updated to the last checkpoint
else:
    last_saved_ep = 0
path2mdl = "../DeepRL-Agents-Results/drqn" #The path to save our model to.
path2center = "../DeepRL-Agents-Results/Center" #The path to save the Center information to
h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.
max_epLength = 300 #The max allowed length of our episode.
anneling_steps = max_epLength*100 #How many steps of training to reduce startE to endE.
num_episodes = 10000 #How many episodes of game environment to train network with.
pre_train_steps = 300*4 #max_epLength*100 #How many steps of random actions before training begins. need to be a multiple of max_epLength
time_per_step =  0.025 #Length of each step used in gif creation
summaryLength = 100 #Number of epidoes to periodically save for analysis
tau = 0.001 #Rate at with the target network is update in regards to the main network

In [None]:
#


#We define the cells for the primary and target q-networks

tf.reset_default_graph()

cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)
mainQN = Qnetwork(h_size,cell,'main')
targetQN = Qnetwork(h_size,cellT,'target')
trainables = tf.trainable_variables()
init = tf.global_variables_initializer()
targetOps = updateTargetGraph(trainables,tau)
saver = tf.train.Saver(max_to_keep=5)

#create lists to contain total rewards and steps per episode
jList = []
rList = []


#Set the rate of random action decrease. 
e = startE
stepDrop = (startE - endE)/anneling_steps

#Make a path for our model to be saved in.
if not os.path.exists(path2mdl):
    os.makedirs(path2mdl)

with tf.Session() as sess:
    if load_model == True:
        print ('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path2mdl)
        saver.restore(sess,ckpt.model_checkpoint_path)
        #Rough (over)estimate of the total number of steps since the beginning of training
        total_steps = last_saved_ep*300/update_freq
        myBuffer = experience_buffer(trace_length)
        myBuffer.load(path2mdl)
    else:
        #INITIALIZE VARIABLES AND MODEL


        myBuffer = experience_buffer(trace_length)

        total_steps = 0
        
        sess.run(init)
        #Write the first line of the master log-file for the Control Center
        with open(path2center + '/log.csv', 'w') as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator = '\n')
            wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL'])   
        #Set the target network to be equal to the primary network.
        updateTarget(targetOps,sess)
    
    for i in range(last_saved_ep, num_episodes):
        #print(i)
        episodeBuffer = []
        
        #Reset environment and get first new observation
        env.new_episode()
        if use_other_buffers == True:
            st = game.get_state()
            dP = st.depth_buffer
            lP = st.labels_buffer
            sP = st.screen_buffer
            s = processBuffers(image_size, dP, lP, sP)
        else:
            sP = env.get_state().screen_buffer
            s = processImage(sP, image_size)
        d = False
        rAll = 0
        j = 0
        #Reset the recurrent layer's hidden state every episode
        state = (np.zeros([1,h_size]),np.zeros([1,h_size])) 
        #The Q-Network
        while j < max_epLength:
            
            if image_chls == 2:
                s_in = s[0:-image_size*image_size]
            else:
                s_in = s
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                #Only update the state of the RNN layer

                state1 = sess.run(mainQN.rnn_state,
                                  feed_dict={mainQN.scalarInput:[s_in/255.0],
                                             mainQN.trainLength:1, 
                                             mainQN.state_in:state,
                                             mainQN.batch_size:1})
                #Choose an action randomly
                a = np.random.randint(0,a_size)
                
            else:
                #Update the state of the RNN layer AND choose the best action
                a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],
                                     feed_dict={mainQN.scalarInput:[s_in/255.0],
                                                mainQN.trainLength:1,
                                                mainQN.state_in:state,
                                                mainQN.batch_size:1})
                a = a[0]
                
            r = env.make_action(actions_list[a])
            d = env.is_episode_finished()
            if d == False:
                if use_other_buffers == True:
                    st1 = game.get_state()
                    d1P = st1.depth_buffer
                    l1P = st1.labels_buffer
                    s1P = st1.screen_buffer
                    s1 = processBuffers(image_size, d1P, l1P, s1P)
                else:
                    s1P = env.get_state().screen_buffer
                    s1 = processImage(s1P, image_size)
            else:
                break

            total_steps += 1
            
            #Compute the td error to use for prioritized replay
            if image_chls == 2:
                s1_in = s1[0:-image_size*image_size]
            else:
                s1_in = s1
            
            Q1 = sess.run(mainQN.predict,
                          feed_dict={mainQN.scalarInput:[s1_in/255.0],
                                     mainQN.trainLength:1,
                                     mainQN.state_in:state1,
                                     mainQN.batch_size:1})
                    
            Q2 = sess.run(targetQN.Qout,
                          feed_dict={targetQN.scalarInput:[s1_in/255.0],
                                     targetQN.trainLength:1,
                                     targetQN.state_in:state1,
                                     targetQN.batch_size:1})
            
            #print('Q1.shape = ' + str(Q1.shape))
            #print('Q2.shape = ' + str(Q2.shape))        
            end_multiplier = -(d - 1)
            doubleQ = Q2[0, Q1]
            #print('doubleQ.shape = ' + str(doubleQ.shape))
            targetQ = r + (y*doubleQ * end_multiplier)
            #print('targetQ.shape = ' + str(targetQ.shape))
            currentaction = np.array(a, ndmin=1)
            #print('currentaction.shape = ' + str(currentaction.shape))

            td = sess.run(mainQN.td_error,
                     feed_dict={mainQN.scalarInput:[s_in/255.0],
                                mainQN.targetQ:targetQ,
                                mainQN.actions:currentaction,
                                mainQN.trainLength:1,
                                mainQN.state_in:state,
                                mainQN.batch_size:1})
            

            
            episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d,td]),[1,6]))
            
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop
                
                #Update the networks at a cetain frequency (every n experiences)
                if total_steps % (update_freq) == 0:
                    updateTarget(targetOps,sess)
                    #Reset the recurrent layer's hidden state
                    state_train = (np.zeros([batch_size,h_size]),np.zeros([batch_size,h_size])) 
                    #Get a random batch of experiences.
                    if prioritized_replay == True:
                        trainBatch = myBuffer.PRsample(batch_size)
                    else:
                        trainBatch = myBuffer.sample(batch_size)

                    train_s = list(zip(trainBatch[:, 0]))
                    train_s1 = list(zip(trainBatch[:, 3]))
                    train_s = np.vstack(train_s)
                    train_s1 = np.vstack(train_s1)

                    if image_chls == 2:
                        train_s = train_s[:,0:-image_size*image_size]
                        train_s1 = train_s1[:,0:-image_size*image_size]

                    #Below we perform the Double-DQN update to the target Q-values
                    Q1 = sess.run(mainQN.predict,
                                  feed_dict={mainQN.scalarInput:np.vstack(train_s1/255.0),
                                             mainQN.trainLength:trace_length,
                                             mainQN.state_in:state_train,
                                             mainQN.batch_size:batch_size})
                    
                    Q2 = sess.run(targetQN.Qout,
                                  feed_dict={targetQN.scalarInput:np.vstack(train_s1/255.0),
                                             targetQN.trainLength:trace_length,
                                             targetQN.state_in:state_train,
                                             targetQN.batch_size:batch_size})
                    
                    end_multiplier = -(trainBatch[:,4] - 1)
                    doubleQ = Q2[range(batch_size*trace_length),Q1]
                    targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)
                    
                    #Update the network with our target values.
                    sess.run(mainQN.updateModel,
                             feed_dict={mainQN.scalarInput:np.vstack(train_s/255.0),
                                        mainQN.targetQ:targetQ,
                                        mainQN.actions:trainBatch[:,1],
                                        mainQN.trainLength:trace_length,
                                        mainQN.state_in:state_train,
                                        mainQN.batch_size:batch_size})
            rAll += r
            s = s1
            state = state1
            
            if use_other_buffers == True:
                lP = l1P
                dP = d1P
                sP = s1P
            else:
                sP = s1P
            

            if d == True:

                break

        #Add the episode to the experience buffer
        bufferArray = np.array(episodeBuffer)
        episodeBuffer = list(zip(bufferArray))
        myBuffer.add(episodeBuffer)
        jList.append(j)
        rList.append(rAll)

        #Periodically save the model. 
        if i % 1000 == 0 and i != last_saved_ep:
            saver.save(sess, path2mdl + '/model-'+str(i)+'.cptk', global_step = i)
            myBuffer.save(path2mdl)
            print ("Saved Model")
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            #print (total_steps,np.mean(rList[-summaryLength:]), e)
            saveToCenter(i,rList,jList,
                         np.reshape(np.array(episodeBuffer),[len(episodeBuffer),6]),
                         summaryLength,
                         h_size,sess,mainQN,time_per_step,
                         image_size, image_chls, image_chls,
                         path2center)
    saver.save(sess,path2mdl + '/model-'+str(i)+'.cptk')

Target Set Success
Target Set Success
[1, 2, 2, 2]
[66, 95, 32, 28]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[1, 3, 1, 3]
[70, 19, 273, 76]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[1, 2, 1, 3]
[46, 117, 287, 91]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[3, 0, 2, 1]
[77, 103, 14, 38]
len(sampled_ep) =299
len(sampled_ep) =288
len(sampled_ep) =270
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[2, 3, 3, 1]
[57, 160, 133, 160]
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[2, 3, 1, 1]
[129, 142, 247, 141]
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[3, 2, 2, 3]
[14, 174, 166, 19]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =27

[4, 2, 1, 4]
[218, 69, 247, 92]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[2, 1, 1, 3]
[225, 141, 232, 168]
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[1, 0, 0, 2]
[188, 266, 200, 121]
len(sampled_ep) =299
len(sampled_ep) =288
len(sampled_ep) =288
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[1, 2, 3, 1]
[71, 151, 47, 269]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[4, 1, 2, 0]
[16, 37, 226, 150]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[0, 1, 1, 1]
[40, 85, 247, 52]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[4, 0, 1, 1]
[83, 201, 251, 60]
len(sampled_ep) =299
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
T

len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[4, 2, 4, 2]
[112, 50, 239, 86]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[2, 3, 4, 3]
[176, 32, 92, 201]
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[0, 3, 2, 2]
[130, 22, 43, 157]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[0, 1, 5, 3]
[203, 161, 129, 281]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[4, 0, 0, 2]
[64, 171, 81, 97]
len(sampled_ep) =299
len(sampled_ep) =288
len(sampled_ep) =288
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[1, 2, 0, 1]
[281, 82, 228, 280]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =288
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[3, 4, 2, 3]
[7

len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[2, 2, 3, 3]
[120, 244, 176, 24]
len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[6, 3, 1, 3]
[123, 117, 25, 124]
len(sampled_ep) =278
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[3, 2, 2, 0]
[163, 21, 166, 261]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[0, 3, 1, 2]
[22, 146, 92, 171]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[6, 3, 3, 2]
[175, 231, 276, 171]
len(sampled_ep) =278
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[1, 0, 1, 1]
[283, 30, 266, 134]
len(sampled_ep) =299
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[2, 2, 3, 3]
[93, 20, 47, 161]
len(sampled_ep) =270
l

(4, 8, 6)
Target Set Success
[1, 2, 0, 4]
[277, 264, 38, 114]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =288
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[0, 0, 1, 1]
[101, 61, 154, 214]
len(sampled_ep) =288
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[7, 6, 1, 0]
[122, 274, 149, 58]
len(sampled_ep) =299
len(sampled_ep) =278
len(sampled_ep) =299
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[6, 2, 1, 3]
[227, 208, 138, 261]
len(sampled_ep) =278
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[0, 0, 0, 3]
[182, 196, 220, 60]
len(sampled_ep) =288
len(sampled_ep) =288
len(sampled_ep) =288
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[7, 2, 0, 4]
[29, 10, 277, 147]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =288
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[2, 0, 1, 3]
[80, 64, 291, 152]
len(sampled_ep) =270
len(sampled_ep) =288
len(sampled_ep) =299


[241, 152, 203, 140]
len(sampled_ep) =288
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[3, 1, 3, 2]
[49, 50, 157, 210]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[3, 3, 4, 0]
[151, 45, 15, 168]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[3, 3, 2, 0]
[142, 123, 207, 212]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[4, 2, 1, 0]
[105, 120, 188, 209]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[0, 3, 3, 1]
[152, 156, 209, 131]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[8, 2, 2, 1]
[47, 256, 73, 11]
len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =270
len(sampled_ep) =299
(4, 8, 6)
Target Set

len(sampled_ep) =288
len(sampled_ep) =278
len(sampled_ep) =299
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[0, 3, 2, 1]
[101, 174, 67, 85]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[1, 1, 8, 3]
[209, 66, 195, 112]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[1, 2, 4, 1]
[117, 23, 61, 140]
len(sampled_ep) =299
len(sampled_ep) =270
len(sampled_ep) =299
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[9, 7, 9, 0]
[161, 91, 217, 133]
len(sampled_ep) =286
len(sampled_ep) =299
len(sampled_ep) =286
len(sampled_ep) =288
(4, 8, 6)
Target Set Success
[1, 1, 1, 2]
[253, 177, 24, 77]
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =299
len(sampled_ep) =270
(4, 8, 6)
Target Set Success
[0, 1, 0, 7]
[89, 236, 26, 219]
len(sampled_ep) =288
len(sampled_ep) =299
len(sampled_ep) =288
len(sampled_ep) =299
(4, 8, 6)
Target Set Success
[3, 3, 8, 3]
[1

In [64]:
print(type(myBuffer))
print(type(myBuffer.buffer))
print(len(myBuffer.buffer))
#myBuffer.save(path2mdl)
#myBuffer = experience_buffer()
#myBuffer.load(path2mdl)
print(type(myBuffer.buffer))
print(len(myBuffer.buffer))
print(myBuffer.buffer[98].shape)

<class '__main__.experience_buffer'>
<class 'list'>
100
<class 'list'>
100


AttributeError: 'list' object has no attribute 'shape'

In [11]:
trainBatch = myBuffer.PRsample(batch_size,trace_length, i, num_episodes)

ValueError: operands could not be broadcast together with shapes (270,1,1,6) (7056,) 

In [235]:
print(td)
episode = episodeBuffer
episode = np.reshape(np.array(episode),[len(episode),6])
print(episode.shape)
print(episode[:, 5].shape)
td_error = episode[:, 5]
priority = np.absolute(td_error) + 1e-9 #proportionnal priority
print(priority.shape)
#Append episode to the priority replay tuple list
#Every experience in the episode has a tuple of the form:
#(episode_index, experience_index, priority)
episode_index = 0

exp_prio_tuples = []
for experience_index in range(trace_length-1, len(episode[:, 5])):
    tup = (episode_index, experience_index, float(priority[experience_index]))
    exp_prio_tuples.append(tup)


print(exp_prio_tuples[2:10])
print(exp_prio_tuples[2])
print(exp_prio_tuples[2][2])

priorities_poweralpha = np.power([tup[2] for tup in exp_prio_tuples],myBuffer.alpha)
print(myBuffer.alpha)
print(priorities_poweralpha.shape)
sum_priorities_poweralpha = np.sum(priorities_poweralpha)
print(sum_priorities_poweralpha)
sampling_probabilities = np.divide(priorities_poweralpha, sum_priorities_poweralpha)
print(sampling_probabilities.shape)
sampled_indexes = np.random.choice(len(exp_prio_tuples), batch_size, p = sampling_probabilities)
print(sampled_indexes)

sampled_tuples = [exp_prio_tuples[idx] for idx in sampled_indexes]
ep_idx = [tup[0] for tup in sampled_tuples]
print(ep_idx)
exp_idx = [tup[1] for tup in sampled_tuples]
print(exp_idx)
sampledTraces = []
for i in range(0,batch_size):
    sampled_ep = myBuffer.buffer[ep_idx[i]]
    sampled_ep = np.reshape(np.array(sampled_ep),[len(sampled_ep),6])
    sampled_exp = sampled_ep[exp_idx[i]-(trace_length):exp_idx[i]] 
    sampledTraces.append(sampled_exp)
sampledTraces = np.array(sampledTraces)
print(sampledTraces.shape)
result = np.reshape(sampledTraces,[batch_size*trace_length,6])

[ 0.00233381]
(110, 6)
(110,)
(110,)
[(0, 9, 0.006071518640965223), (0, 10, 0.0048619285225868225), (0, 11, 0.0002145733596989885), (0, 12, 0.003930300939828157), (0, 13, 0.014522322453558445), (0, 14, 0.01503776665776968), (0, 15, 0.0023576875682920218), (0, 16, 0.015782633796334267)]
(0, 9, 0.006071518640965223)
0.006071518640965223
0.4998
(103,)
6.90191820679
(103,)
[37 34 84 55]
[0, 0, 0, 0]
[44, 41, 91, 62]
(4, 8, 6)


In [225]:
print(sampledTraces[3].shape)

(8, 6)
