## Import Packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from vizdoom import *


from skimage import transform 
from skimage.color import rgb2gray 

from collections import deque 

import time 
import random 
import warnings

warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


## Init Environment

In [2]:
def init_env():
    """
    Create and start the Doom game used by the rest of the notebook.

    Loads the "basic.cfg" configuration, points the game at the "basic.wad"
    scenario and initialises it.

    Returns:
        tuple: (game, possible_actions) where `game` is the initialised
        DoomGame and `possible_actions` is the list of one-hot button
        vectors in the order [left, right, shoot].
    """
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()

    # One entry per action; each action presses exactly one button.
    possible_actions = [
        [1, 0, 0],  # left
        [0, 1, 0],  # right
        [0, 0, 1],  # shoot
    ]

    return game, possible_actions

In [3]:
def test_env(test_eps = 10):
    """
    Perform random action and test env to ensure it works 
    """
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()
    
    left = [1,0,0]
    shoot = [0,0,1]
    right = [0,1,0]
    
    possible_actions = [left, right, shoot]
    
    for i in range(test_eps):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(possible_actions)
            print (action)
            
            reward = game.make_action(action)
            print ("\treward:", reward)
            
            time.sleep(0.02)
        
        #prints every episode 
        print ("Result:", game.get_total_reward())
        time.sleep(2)
    
    game.close()
    

In [4]:
# Start the Doom environment once; `game` and `possible_actions` are
# notebook-level globals that the later cells reuse.
game, possible_actions = init_env()

## Preprocessing 

In [5]:
def preprocess_frame(frame):
    """
    Crop, normalise and resize a single screen frame.

    Steps:
        1. drop the first 30 and last 10 rows and 30 columns on each side,
        2. divide pixel values by 255,
        3. resize the result to 84x84 with `skimage.transform.resize`.

    No grayscale conversion is done here; the original note says vizdoom
    already delivers gray frames (assumed -- confirm against the game config).

    Args:
        frame: screen buffer array whose first two axes are rows and columns.

    Returns:
        The preprocessed frame, resized to 84 rows by 84 columns.
    """
    region_of_interest = frame[30:-10, 30:-30]
    return transform.resize(region_of_interest / 255.0, [84, 84])

def stack_frames(stacked_frames, frame, is_new_episode, stack_size=4):
    """
    Preprocess `frame` and merge it into the rolling stack of recent frames.

    On a new episode the stack is rebuilt from scratch with `stack_size`
    copies of the preprocessed frame (the incoming `stacked_frames` is
    ignored). Otherwise the frame is appended to `stacked_frames`, which
    drops its oldest entry when it is a bounded deque.

    Args:
        stacked_frames: deque of previously preprocessed frames; only used
            when `is_new_episode` is False.
        frame: raw frame, passed through `preprocess_frame`.
        is_new_episode: True to start a fresh stack.
        stack_size: number of frames kept in a freshly created stack.

    Returns:
        tuple: (stacked_state, stacked_frames) where `stacked_state` is the
        frames stacked along axis 2 (height x width x frames) and
        `stacked_frames` is the updated deque.
    """
    frame = preprocess_frame(frame)

    if is_new_episode:
        # Fill the stack directly with the first frame. This avoids the
        # removed `np.int` alias and the zero placeholders whose shape did
        # not match the preprocessed frames, and honours `stack_size`.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames
  



In [6]:
stack_size = 4

# Initial frame stack shared by the later cells. The builtin `int` replaces
# the `np.int` alias, which no longer exists in current NumPy releases, and
# the deque length follows `stack_size` instead of a hard-coded 4.
stacked_frames = deque([np.zeros((110,84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)
frame = np.zeros((110,84), dtype=int)

# Smoke-test the helpers on a blank frame and show the resulting shapes.
sample_state, stacked_frames = stack_frames(stacked_frames, frame, True)
print(np.shape(sample_state), np.shape(stacked_frames))

(84, 84, 4) (4, 84, 84)


## Hyper Params

In [7]:
# ---- Training params ----
total_episodes = 5000 # number of training episodes
total_test_episodes = 10 # number of episodes to test on
max_steps = 5000 # maximum number of steps per episode
bs = 64 # minibatch size drawn from the replay memory

# ---- Network params ----
lr = 0.0002 # learning rate
state_size = [84,84,4]  # four stacked 84x84 frames
action_size = game.get_available_buttons_size() # number of buttons reported by the game

# ---- Fixed TD target params ----
# number of training steps between target-network weight syncs
max_tau = 10000

# ---- Discount factor ----
gamma = 0.95 

# ---- Exploration (epsilon-greedy) params ----
epsilon = 1.0 # starting exploration probability
max_eps = 1.0  # maximum exploration probability
min_eps = 0.01 # minimum exploration probability
decay = 0.0001 # exponential decay rate of the exploration probability

# ---- Replay memory params ----
memory_size = 1000000 # capacity of the replay memory
pretrained_len = 1000000 # number of random-play steps used to pre-fill the memory

# ---- Training mode ----
# when True the training cell runs
training=True 

# ---- Rendering ----
# whether the environment should be rendered
# NOTE(review): not referenced by any visible cell -- confirm it is still needed
episode_render = True 

# ---- Prioritized Experience Replay ----
PER_e = 0.01  # small constant added to errors so no experience gets probability 0
PER_a = 0.6  # trade-off between sampling only high-priority experiences and sampling uniformly
PER_b = 0.4  # importance-sampling exponent, annealed from this initial value up to 1
PER_b_increment_per_sampling = 0.001 # amount added to PER_b each time a minibatch is sampled

## DQN: Simple Model 


In [8]:
class DQN():
    """
    Plain convolutional Q-network (TensorFlow 1.x graph mode).

    Three conv -> batch-norm -> ELU blocks feed a 512-unit dense layer and a
    linear output with one Q-value per action. All ops are created inside
    `tf.variable_scope(name)`.
    """
    def __init__(self, state_size, action_size, learning_rate, name='DQN', training=training):
        """
        Build the graph.

        Args:
            state_size: shape of one state (without the batch axis).
            action_size: number of actions / width of the output layer.
            learning_rate: learning rate handed to RMSProp.
            name: variable scope under which the network is created.
            training: forwarded to the batch-norm layers. The default is the
                value the module-level `training` had when this class was
                defined, not its value at call time.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.lr = learning_rate
        self.training = training
        with tf.variable_scope(name):
            # placeholders: batch of states, one-hot actions, TD targets
            self.inputs = tf.placeholder(tf.float32,[None, *self.state_size])
            self.actions = tf.placeholder(tf.float32, [None, self.action_size])
            self.target_Q = tf.placeholder(tf.float32)
            
            # block 1: 32 filters, 8x8 kernel, stride 4
            self.conv1 = tf.layers.conv2d(inputs=self.inputs, filters=32, 
                                          kernel_size=[8,8], 
                                          strides=(4,4), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn1 = tf.layers.batch_normalization(self.conv1, epsilon=1e-5, training=self.training)
            self.elu1 = tf.nn.elu(self.bn1)
            
            # block 2: 64 filters, 4x4 kernel, stride 2
            self.conv2 = tf.layers.conv2d(inputs=self.elu1, filters=64, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn2 = tf.layers.batch_normalization(self.conv2, epsilon=1e-5, training=self.training)
            self.elu2 = tf.nn.elu(self.bn2)
            
            # block 3: 128 filters, 4x4 kernel, stride 2
            self.conv3 = tf.layers.conv2d(inputs=self.elu2, filters=128, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn3 = tf.layers.batch_normalization(self.conv3, epsilon=1e-5, training=self.training)
            self.elu3 = tf.nn.elu(self.bn3)
            
            # fully connected block: 512 ELU units, then one linear Q-value per action
            self.flat = tf.layers.flatten(self.elu3)
            self.fc = tf.layers.dense(self.flat, units=512, activation=tf.nn.elu, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.out = tf.layers.dense(self.fc, units=self.action_size, activation=None, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            # Q-value of the action taken: the one-hot `actions` mask selects it
            self.pred_Q = tf.reduce_sum(tf.multiply(self.out, self.actions), axis=1)
            
            # loss: mean squared difference between target and predicted Q
            self.loss = tf.reduce_mean(tf.square(self.target_Q-self.pred_Q))
            
            # optimizer: RMSProp minimising the loss above
            self.optim = tf.train.RMSPropOptimizer(learning_rate=self.lr).minimize(self.loss)
           

## Dueling DQN 


In [11]:
class DDQN ():
    """
    Dueling Q-network (TensorFlow 1.x graph mode).

    A shared stack of three conv + ELU blocks feeds two dense streams: a
    scalar state value V(s) and per-action advantages A(s, a). They are
    combined as Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')).
    """
    def __init__(self, state_size, action_size, learning_rate, name):
        """
        Build the graph inside `tf.variable_scope(name)`.

        Args:
            state_size: shape of one state (without the batch axis).
            action_size: number of actions / width of the advantage stream.
            learning_rate: learning rate handed to RMSProp.
            name: variable scope; the target-weight copy looks variables up
                by this scope name.
        """
        self.state_size = state_size 
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.name = name 
        
        # The variable scope makes it clear which network a variable belongs to;
        # the parameter update function relies on it.
        with tf.variable_scope(name):
            # placeholders: batch of states, one-hot actions, TD targets
            self.inputs = tf.placeholder(tf.float32,[None, *self.state_size])
            self.actions = tf.placeholder(tf.float32, [None, self.action_size])
            self.target_Q = tf.placeholder(tf.float32)
            
            # block 1: 32 filters, 8x8 kernel, stride 4 (batch norm disabled)
            self.conv1 = tf.layers.conv2d(inputs=self.inputs, filters=32, 
                                          kernel_size=[8,8], 
                                          strides=(4,4), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            #self.bn1 = tf.layers.batch_normalization(self.conv1, epsilon=1e-5, training=self.training)
            self.elu1 = tf.nn.elu(self.conv1)
            
            # block 2: 64 filters, 4x4 kernel, stride 2 (batch norm disabled)
            self.conv2 = tf.layers.conv2d(inputs=self.elu1, filters=64, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            #self.bn2 = tf.layers.batch_normalization(self.conv2, epsilon=1e-5, training=self.training)
            self.elu2 = tf.nn.elu(self.conv2)
            
            # block 3: 128 filters, 4x4 kernel, stride 2 (batch norm disabled)
            self.conv3 = tf.layers.conv2d(inputs=self.elu2, filters=128, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            #self.bn3 = tf.layers.batch_normalization(self.conv3, epsilon=1e-5, training=self.training)
            self.elu3 = tf.nn.elu(self.conv3)
            
            # flatten the conv features for the two dense streams
            self.flat = tf.layers.flatten(self.elu3)     
            
            # FC stream 1: state value V(s), a single unit per sample
            self.fc_v = tf.layers.dense(self.flat, units=512, activation=tf.nn.elu, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.value = tf.layers.dense(self.fc_v, units=1, activation=None, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            # FC stream 2: advantages A(s,a), one unit per action
            self.fc_a = tf.layers.dense(self.flat, units=512, activation=tf.nn.elu, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.advantage = tf.layers.dense(self.fc_a, units=self.action_size, activation=None, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            
            # Q(s,a) by aggregation: Q(s,a) = V(s) + (A(s,a) - 1/|A| * sum A(s,a'))
            # keepdims=True keeps the mean broadcastable against the advantages
            self.out = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
            
            # Q-value of the action taken: the one-hot `actions` mask selects it
            self.pred_Q = tf.reduce_sum(tf.multiply(self.out, self.actions), axis=1)
            
            # loss: mean squared difference between target and predicted Q
            self.loss = tf.reduce_mean(tf.square(self.target_Q-self.pred_Q))
            
            # optimizer: RMSProp minimising the loss above
            self.optim = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
           

In [12]:
# Start from an empty default graph so re-running this cell does not
# stack duplicate networks on top of the previous ones.
tf.reset_default_graph()

num_actions = len(possible_actions)

# Build the two networks.
# Online network: the one whose weights are trained.
DQNetwork = DDQN(state_size, num_actions, lr, name="DQNetwork")

# Target network: supplies the TD targets with more stable weights.
TargetNetwork = DDQN(state_size, num_actions, lr, name="TargetNetwork")

## Experience Replay

In [33]:
#Sumtree is binary tree with leaves that contain the prioritiy, the index of a leave 
#is the corresponding index of an experience in a data array 

class SumTree():
    """
    Binary sum tree used for prioritized experience replay.

    The tree is stored in a flat array of size 2*capacity - 1. The last
    `capacity` entries are the leaves and hold one priority each; every
    internal node holds the sum of its two children, so the root (index 0)
    is the total priority. Leaf `capacity - 1 + i` corresponds to
    `data[i]`.
    """

    def __init__(self, capacity):
        """
        Create an empty tree.

        Args:
            capacity: maximum number of experiences (number of leaves).
        """
        # number of experiences (i.e. number of leaf nodes)
        self.capacity = capacity

        # capacity leaves plus capacity - 1 internal nodes
        self.tree = np.zeros(2 * capacity - 1)

        # Experiences are arbitrary Python objects (e.g. tuples holding
        # arrays), so they are kept in an object array rather than a
        # numeric one.
        self.data = np.zeros(capacity, dtype=object)

        # next slot of `data` to write to
        self.data_index = 0

    def add(self, priority, exp):
        """
        Store `exp` with the given priority, overwriting the oldest entry
        once the tree is full.
        """
        tree_index = self.data_index + self.capacity - 1

        self.data[self.data_index] = exp
        self.update(tree_index, priority)

        # advance the write pointer, wrapping around at capacity
        self.data_index = (self.data_index + 1) % self.capacity

    def update(self, tree_index, priority):
        """
        Set the priority of the leaf at `tree_index` and propagate the
        change to all of its ancestors.
        """
        delta = priority - self.tree[tree_index]
        self.tree[tree_index] = priority

        # walk up to the root, adding the change to every parent
        while tree_index != 0:
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += delta

    def get_leaf(self, v):
        """
        Find the leaf whose cumulative-priority interval contains `v`.

        Args:
            v: value in [0, total_priority].

        Returns:
            tuple: (leaf_index, priority, experience) where `leaf_index` is
            the index into the tree array.
        """
        parent_index = 0

        while True:
            left_child_index = parent_index * 2 + 1
            right_child_index = left_child_index + 1

            # A node without children is a leaf. The comparison must be
            # `>=`: with `>` the walk would index one past the end of the
            # tree array.
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break

            # Otherwise descend: go left if v falls inside the left
            # subtree's mass, else subtract that mass and go right.
            if v <= self.tree[left_child_index]:
                parent_index = left_child_index
            else:
                v -= self.tree[left_child_index]
                parent_index = right_child_index

        data_index = leaf_index - self.capacity + 1

        return leaf_index, self.tree[leaf_index], self.data[data_index]

    @property
    def total_priority(self):
        """Sum of all leaf priorities (the value stored at the root)."""
        return self.tree[0]

In [36]:
#memory class stores (s,a,r,s_) in sumtree 

class Memory():
    """
    Prioritized experience replay memory backed by a SumTree.

    The PER hyper-parameters (`PER_e`, `PER_a`, `PER_b`,
    `PER_b_increment_per_sampling`) are read from the notebook globals when
    the memory is created. The importance-sampling exponent is then
    annealed per instance, on every call to `sample`.
    """

    def __init__(self, capacity, abs_err_upper = 0):
        """
        Args:
            capacity: maximum number of stored experiences.
            abs_err_upper: upper bound used to clip absolute TD errors; it
                is also the priority of the very first experience. A
                non-positive value falls back to 1.0.
        """
        self.tree = SumTree(capacity)
        # With a cap of 0 every priority would be 0: nothing could be
        # sampled and total_priority would be a division by zero.
        self.abs_err = abs_err_upper if abs_err_upper > 0 else 1.0

        # Per-instance copies, so annealing beta never rebinds a global
        # (assigning `PER_b` inside a method made it an unbound local).
        self.per_e = PER_e
        self.per_a = PER_a
        self.per_b = PER_b
        self.per_b_increment = PER_b_increment_per_sampling

    def add(self, exp):
        """Store `exp` with the current maximum priority so it is sampled at least once."""
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        # the very first experience has no reference priority yet
        if max_p == 0:
            max_p = self.abs_err

        self.tree.add(max_p, exp)

    def sample(self, n):
        """
        Draw a minibatch of `n` experiences, proportionally to priority.

        The total priority is split into `n` equal segments and one value
        is drawn uniformly from each.

        Returns:
            tuple: (b_idx, memory_b, b_ISWeights) where `b_idx` holds the
            tree indices (int32, shape (n,)), `memory_b` is a list of
            one-element lists each wrapping an experience, and
            `b_ISWeights` holds the importance-sampling weights (float32,
            shape (n, 1)) normalised by the maximum possible weight.

        Raises:
            ValueError: if the memory holds no experience with positive priority.
        """
        total = self.tree.total_priority
        if total <= 0:
            raise ValueError("cannot sample from an empty memory")

        memory_b = []
        b_idx = np.empty((n,), dtype=np.int32)
        b_ISWeights = np.empty((n, 1), dtype=np.float32)

        # width of each sampling segment
        priority_segment = total / n

        # anneal the importance-sampling exponent towards 1
        self.per_b = min(1., self.per_b + self.per_b_increment)

        # Maximum weight comes from the smallest *stored* priority; empty
        # leaves (priority 0) are skipped, otherwise the max weight is inf
        # and every normalised weight collapses to 0.
        leaves = self.tree.tree[-self.tree.capacity:]
        p_min = np.min(leaves[leaves > 0]) / total
        max_weight = (p_min * n) ** (-self.per_b)

        for i in range(n):
            # sample a value inside segment i
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)

            # experience whose priority interval contains the value
            index, priority, data = self.tree.get_leaf(value)

            # P(j)
            sampling_probabilities = priority / total

            # IS = (1/N * 1/P(i))**b / max wi == (N*P(i))**-b / max wi
            b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.per_b) / max_weight

            b_idx[i] = index
            memory_b.append([data])

        return b_idx, memory_b, b_ISWeights

    def batch_update(self, tree_idx, abs_errors):
        """
        Update the priorities of the sampled leaves from their TD errors.

        Priority = min(|error| + PER_e, abs_err) ** PER_a. The caller's
        array is not modified.
        """
        abs_errors = np.abs(np.asarray(abs_errors, dtype=np.float64)) + self.per_e  # avoid 0
        clipped_errors = np.minimum(abs_errors, self.abs_err)
        ps = np.power(clipped_errors, self.per_a)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

In [None]:
# Create the replay memory and pre-fill it with transitions collected by
# playing uniformly random actions.
memory = Memory(memory_size)

# start the first episode
game.new_episode()

# collect `pretrained_len` transitions
for i in range(pretrained_len):
    # on the very first step, build the initial stacked state
    if i == 0: 
        state = game.get_state().screen_buffer # raw first frame
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    # take a random action
    action = random.choice(possible_actions)
    
    # reward returned by the game for that action
    reward = game.make_action(action)
    
    # did the action end the episode?
    done = game.is_episode_finished()
    
    if not done:
        # observe the next frame and push it onto the frame stack
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
      
        # store the transition
        exp = state, action, reward, next_state, done
        memory.add(exp)
        
        # the next state becomes the current state
        state = next_state
        
    else: # episode finished
        # terminal transitions use an all-zero next state of the same shape
        next_state = np.zeros(state.shape)
        
        # store the terminal transition
        exp = state, action, reward, next_state, done
        memory.add(exp)

        # start a new episode and rebuild the frame stack from its first frame
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)


## Fixed TD Targets 

In [3]:
def update_target_weights(source_scope="DQNetwork", target_scope="TargetNetwork"):
    """
    Build the ops that copy the online network's weights into the target network.

    Args:
        source_scope: variable scope of the network being trained.
        target_scope: variable scope of the network that receives the copy.

    Returns:
        list: one assign op per trainable variable; running them with
        `sess.run` overwrites the target network with the source weights.

    Raises:
        ValueError: if the two scopes hold a different number of trainable
            variables, so they cannot be paired up.
    """
    source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, source_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, target_scope)

    if len(source_vars) != len(target_vars):
        raise ValueError("source and target networks have different numbers of variables")

    # The *target* variable is the one being assigned to. The previous
    # version assigned in the opposite direction, overwriting the trained
    # network with the stale target weights.
    op_holder = []
    for source_var, target_var in zip(source_vars, target_vars):
        op_holder.append(target_var.assign(source_var))
    return op_holder

## DQN: Training

In [12]:
#Function to predict action based on epsilon greedy 

def explore_exploit(epsilon, min_eps, decay_rate, decay_step, state, num_actions):
    """
    Pick an action with an exponentially decaying epsilon-greedy policy.

    The exploration probability is
    min_eps + (epsilon - min_eps) * exp(-decay_rate * decay_step).
    With that probability a random action is taken; otherwise the action
    with the highest Q-value predicted by `DQNetwork` for `state` is used.
    Relies on the notebook globals `possible_actions`, `sess` and
    `DQNetwork`; `num_actions` is accepted but not used.

    Returns:
        tuple: (action, explore_prob)
    """
    # uniform draw in [0, 1) that decides between exploring and exploiting
    draw = np.random.rand()
    explore_prob = min_eps + (epsilon-min_eps)*np.exp(-decay_rate*decay_step)

    # explore: random action
    if explore_prob > draw:
        return random.choice(possible_actions), explore_prob

    # exploit: add a batch axis and ask the network for the Q-values
    batched_state = state.reshape((1, *state.shape))
    q_values = sess.run(DQNetwork.out, feed_dict={DQNetwork.inputs: batched_state})
    best_index = int(np.argmax(q_values))
    return possible_actions[best_index], explore_prob


In [14]:
# Saver used by the training cell to write checkpoints and by the test cell
# to restore them. It must be created after the networks have been built.
saver = tf.train.Saver()

In [41]:
# Training loop: play with the epsilon-greedy policy, store every transition
# in the prioritized replay memory and run one Double-DQN update per step.
if training: 
    with tf.Session() as sess:
        # init vars and decay step
        sess.run(tf.global_variables_initializer())
        decay_step = 0 
        
        # steps since the target network was last synced
        tau = 0 
        
        # Build the weight-copy ops once and reuse them; rebuilding them on
        # every sync would keep adding nodes to the graph.
        update_target = update_target_weights()
        sess.run(update_target)
        
        # (episode, total reward) for every finished episode
        episodic_rewards = [] 
        
        # Defined up-front so the episode summary can be printed even if an
        # episode ends before the first optimisation step has run.
        loss = float('nan')
        
        # init doom  
        game.init()
        
        for episode in range(total_episodes):
            # steps taken in this episode
            step = 0 
            
            # first observation of the episode
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            
            # rewards collected in this episode
            rewards_list = []
            
            while step < max_steps:
                ################SAMPLING#######################    
                step += 1
                decay_step += 1
                tau += 1 
                
                # choose action (the decay rate hyper-parameter is named `decay`)
                action, explore_prob = explore_exploit(epsilon, min_eps, decay, decay_step, state, num_actions)

                # take action
                reward = game.make_action(action)
                
                # check if game is done 
                done = game.is_episode_finished()
                
                rewards_list.append(reward) 
                
                if not done:
                    # observe the next frame and push it onto the frame stack
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    memory.add((state, action, reward, next_state, done))
                    
                    state = next_state

                else: 
                    # Terminal transition: the next state is an all-zero
                    # stacked state. It is already in network-input shape,
                    # so it must not go through stack_frames again.
                    next_state = np.zeros_like(state)
                    
                    tot_reward = np.sum(rewards_list)
                    episodic_rewards.append((episode, tot_reward))
                    
                    # print episode summary 
                    print(  'Episode: {}'.format(episode),
                            'Total reward: {}'.format(tot_reward),
                            'Explore P: {:.4f}'.format(explore_prob),
                            'Training Loss {:.4f}'.format(loss))
                    
                    # end episode 
                    step = max_steps
                    
                    memory.add((state, action, reward, next_state, done))
        
                ################Learning#######################    
                # memory.sample returns (tree indices, experiences, IS weights);
                # every experience is wrapped in a one-element list.
                # NOTE: the IS weights are not used because the network's
                # loss has no placeholder for them.
                tree_idx, mini_batch, _is_weights = memory.sample(bs)
                states_batch = np.array([sample[0][0] for sample in mini_batch])
                actions_batch = np.array([sample[0][1] for sample in mini_batch])
                rewards_batch = np.array([sample[0][2] for sample in mini_batch])
                next_states_batch = np.array([sample[0][3] for sample in mini_batch])
                dones_batch = np.array([sample[0][4] for sample in mini_batch], dtype=bool)
                
                # Double DQN: the online network picks the next action ...
                dqn_next_qs_batch = sess.run(DQNetwork.out, feed_dict={DQNetwork.inputs : next_states_batch})
                
                # ... and the target network evaluates it
                targetnet_qs_batch = sess.run(TargetNetwork.out, feed_dict={TargetNetwork.inputs : next_states_batch})
                
                best_next_actions = np.argmax(dqn_next_qs_batch, axis=1)
                next_qs = targetnet_qs_batch[np.arange(len(best_next_actions)), best_next_actions]
                
                # terminal transitions get the bare reward as target
                targ_qs_batch = np.where(dones_batch, rewards_batch, rewards_batch + gamma * next_qs)
                
                # one optimisation step; also fetch the predicted Qs to refresh priorities
                loss, _, pred_qs = sess.run([DQNetwork.loss, DQNetwork.optim, DQNetwork.pred_Q],
                                            feed_dict={DQNetwork.inputs: states_batch,
                                                       DQNetwork.target_Q: targ_qs_batch,
                                                       DQNetwork.actions: actions_batch})
                
                # refresh the priorities of the sampled experiences with their TD errors
                memory.batch_update(tree_idx, np.abs(targ_qs_batch - pred_qs))
                
                # sync the target network every max_tau steps
                if tau > max_tau:
                    sess.run(update_target)
                    tau = 0 
                    print ("Model Updated")
                    
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")              

KeyboardInterrupt: 

In [4]:
test_episodes = 50

# Evaluate the saved model: always act greedily and report the average reward.
with tf.Session() as sess:
    
    # setup env (init_env already initialises the game, so no second init() here)
    game, possible_actions = init_env()
    
    try:
        # load the model 
        saver.restore(sess, "./models/model.ckpt")
        total_score = 0 

        for episode in range(test_episodes):
                 
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
        
            while not game.is_episode_finished():
                # choose the action with the best predicted Q-value
                q_preds = sess.run(DQNetwork.out, feed_dict={DQNetwork.inputs:state.reshape((1,*state.shape))})            
                choice = np.argmax(q_preds)
                action = possible_actions[int(choice)]
                
                # take action
                game.make_action(action)
                
                if game.is_episode_finished():
                    break 
                
                next_state = game.get_state().screen_buffer
                state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            
            # Read the score after the episode, so it is defined even when
            # the inner loop never ran.
            score = game.get_total_reward()
            print (""">>>>>>>>>>>>>>>>>>>> TESTING SUMMARY >>>>>>>>>>>>>>>>>>>>
                    Total Reward: {}""".format(score)) 
            total_score += score
    finally:
        # release the game even if restoring or playing failed
        game.close()
    
    print ("****************************")
    print ("Average Reward over {} episodes:\t".format(test_episodes), total_score/float(test_episodes))
    

NameError: name 'tf' is not defined