## Import Packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from vizdoom import *


from skimage import transform 
from skimage.color import rgb2gray 

from collections import deque 

import time 
import random 
import warnings

warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


## Init Environment

In [2]:
def init_env():
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()
    
    left = [1,0,0]
    shoot = [0,0,1]
    right = [0,1,0]
    
    possible_actions = [left, right, shoot]
    
    return game, possible_actions 

In [3]:
def test_env(test_eps = 10):
    """
    Perform random action and test env to ensure it works 
    """
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()
    
    left = [1,0,0]
    shoot = [0,0,1]
    right = [0,1,0]
    
    possible_actions = [left, right, shoot]
    
    for i in range(test_eps):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(possible_actions)
            print (action)
            
            reward = game.make_action(action)
            print ("\treward:", reward)
            
            time.sleep(0.02)
        
        #prints every episode 
        print ("Result:", game.get_total_reward())
        time.sleep(2)
    
    game.close()
    

In [10]:
test_env()

[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -6.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -6.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	rewar

[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -6.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -6.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 1, 0]
	reward: -1.0
[0, 0, 1]
	reward: -1.0
[1, 0, 0]
	reward: -1.0
[1, 0, 0]
	rewar

ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

In [4]:
#call init_env 
game, possible_actions = init_env()

## Preprocessing 

In [36]:
def preprocess_frame(frame):
    """
    Convert frames to grayscale, crop irrelevent parts, normalize pixels, 
    resize frame.
    
    Input is frame of size 210x160x3
    Returns frame of size 84x84x1 
    """
    #graying done by vizdoom
    #grayed = rgb2gray(frame)
    
    cropped = frame[30:-10, 30:-30]
    normed = cropped/255.0 
    resized = transform.resize(normed, [84,84])
    
    return resized #returns preprocessed frame 

stack_size = 4
def stack_frames(stacked_frames, frame, is_new_episode, stack_size=stack_size):
    """
    takes a frame/state and preprocesses it, 
    if same episode:
        adds to the stack state 
    else new episode: 
        creates stacked state 
    
    where the stacked state is 4 stacked states(frames)
    returns stacked state where axis=1 is for different frames 
    """
    frame = preprocess_frame(frame)
    
    if not is_new_episode:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    
    else: 
        #init deque 
        stacked_frames = deque([np.zeros((84,84), dtype=np.int) 
                                for i in range(stack_size)], maxlen=4)
        #new episode so same frame x4 
        for i in range(4): stacked_frames.append(frame)
        
        stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames
  

In [37]:
#init frame stack
stacked_frames  =  deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4)
frame = np.zeros((84,84), dtype=np.int)

#test helper functions 
sample_state, stacked_frames = stack_frames(stacked_frames, frame, True)
print(np.shape(sample_state), np.shape(stacked_frames))

(84, 84, 4) (4, 84, 84)


## Hyper Params

In [38]:
#training params  
total_episodes = 500 #total num of episodes 
total_test_episodes = 10 #total num of episodes to test on 
max_steps = 100 #max num of steps per episode 
bs = 64

#network params 
lr = 0.0002 #learning rate 
state_size = [84,84,4]  #4 84x84 frames 
action_size = game.get_available_buttons_size() # 3 actions

#discount factor 
gamma = 0.95 

#exploration params 
epsilon = 1.0 #starting value for eps greedy (explore)
max_eps = 1.0  #max value for eps greey 
min_eps = 0.01 #min value for eps greedy 
decay_rate = 0.0001 #decay rate for eps 

#Recall Params
memory_size = 1000000
pretrained_len = bs #number of init memories 

#Training Mode 
training=True 

#Env should be rendered or not 
episode_render = True 

## DQN: Model 


In [39]:
class DQN():
    def __init__(self, state_size, action_size, learning_rate, name='DQN', training=training):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = learning_rate
        self.training = training
        with tf.variable_scope(name):
            #placeholders
            self.inputs = tf.placeholder(tf.float32,[None, *self.state_size])
            self.actions = tf.placeholder(tf.float32, [None, self.action_size])
            self.target_Q = tf.placeholder(tf.float32)
            
            #block 1 
            self.conv1 = tf.layers.conv2d(inputs=self.inputs, filters=32, 
                                          kernel_size=[8,8], 
                                          strides=(4,4), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn1 = tf.layers.batch_normalization(self.conv1, epsilon=1e-5, training=self.training)
            self.elu1 = tf.nn.elu(self.bn1)
            
            #block 2
            self.conv2 = tf.layers.conv2d(inputs=self.elu1, filters=64, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn2 = tf.layers.batch_normalization(self.conv2, epsilon=1e-5, training=self.training)
            self.elu2 = tf.nn.elu(self.bn2)
            
            #block 3
            self.conv3 = tf.layers.conv2d(inputs=self.elu2, filters=128, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn3 = tf.layers.batch_normalization(self.conv3, epsilon=1e-5, training=self.training)
            self.elu3 = tf.nn.elu(self.bn3)
            
            #FC Block 
            self.flat = tf.layers.flatten(self.elu3)
            self.fc = tf.layers.dense(self.flat, units=512, activation=tf.nn.elu, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.out = tf.layers.dense(self.fc, units=self.action_size, activation=None, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            #Q-Value prediction 
            self.pred_Q = tf.reduce_sum(tf.multiply(self.out, self.actions), axis=1)
            
            #Loss Function 
            self.loss = tf.reduce_mean(tf.square(self.target_Q-self.pred_Q))
            
            #Optimizer 
            self.optim = tf.train.RMSPropOptimizer(learning_rate=self.lr).minimize(self.loss)
           

In [40]:
tf.reset_default_graph()

#init
num_actions = len(possible_actions)
DQNetwork = DQN(state_size, num_actions, lr)

## Experience Replay

In [41]:
#memory class creates and manages deque 

class Memory():
    def __init__(self, mem_limit):
        self.cache = deque(maxlen=mem_limit)
    
    def add(self, exp):
        self.cache.append(exp)
    
    def sample(self, sample_size):
        sampling_ind = np.random.choice(np.arange(len(self.cache)), size=sample_size, replace=False)
        return [self.cache[i] for i in sampling_ind]

In [42]:
#activate and populate memory 

memory = Memory(mem_limit=memory_size)

import pdb 

game.new_episode()

for i in range(pretrained_len):
    #pdb.set_trace()
    #initilize state for first step  
    if i == 0: 
        state = game.get_state().screen_buffer #init state 
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    #take a random action
    action = random.choice(possible_actions)
    
    #reward from chosen action 
    reward = game.make_action(action)
    
    #check if done 
    done = game.is_episode_finished()
    
    if not done:
        #get next state 
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        #add memory 
        memory.add((state, action, reward, next_state, done))
        #update state
        state = next_state
        
    else: #were dead
        #update next state 
        next_state = np.zeros((84,84), dtype=np.int)

        #add exp 
        memory.add((state, action, reward, next_state, done))

        #new episode and restart 
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)


## DQN: Training

In [43]:
#Function to predict action based on epsilon greedy 

def explore_exploit(epsilon, min_eps, decay_rate, decay_step, state, num_actions):
    #produce random number between 0 and 1 
    check = np.random.rand()
    explore_prob = min_eps + (epsilon-min_eps)*np.exp(-decay_rate*decay_step)
    
    #explore
    if explore_prob>check:
        #take random action 
        action = random.choice(possible_actions)  
        
    #exploit 
    else:
        #q vals predicted by network 
        Qs = sess.run(DQNetwork.out, feed_dict = {DQNetwork.inputs : state.reshape((1,*state.shape))})
        #choose action corresponding to best Q value 
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
    return action, explore_prob


In [44]:
#for saving 
saver = tf.train.Saver()

In [48]:
#training loop
if training: 
    with tf.Session() as sess:
        #init vars and decay step
        sess.run(tf.global_variables_initializer())
        decay_step = 0 
        
        #rewards for each episode 
        episodic_rewards = [] 
        
        #init doom  
        game.init()
        
        for episode in range(total_episodes):
            #init steps for episode 
            step = 0 
            
            #init observation
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frame = stack_frames(stacked_frames, state, True)
            
            #list for rewards collected in episode 
            rewards_list = []
            
            while step < max_steps:
                ################SAMPLING#######################    
                #increments 
                step += 1
                decay_step += 1
                #if step % 50 == 0: print (step)
                #choose action 
                action, explore_prob = explore_exploit(epsilon, min_eps, decay_rate, decay_step, state, num_actions)

                #take action                 
                reward = game.make_action(action)
                
                #check if game is done 
                done = game.is_episode_finished()
                
                rewards_list.append(reward) 
                
                #if were not dead 
                if not done:
                    #get next state
                    next_state = game.get_state().screen_buffer
                    
                    #store transition after converting to proper format 
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    #add memory 
                    memory.add((state, action, reward, next_state, done))
                    
                    #update state
                    state = next_state

                #if were dead     
                else: 
                    #update next state 
                    next_state = np.zeros((84,84), dtype=np.int)

                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    #add episodic rewards 
                    tot_reward = np.sum(rewards_list)
                    episodic_rewards.append((episode, tot_reward))
                    
                    #print episode summary 
                    print(  'Episode: {}'.format(episode),
                            'Total reward: {}'.format(tot_reward),
                            'Explore P: {:.4f}'.format(explore_prob),
                            'Training Loss {:.4f}'.format(loss))
                    
                    #end episode 
                    step = max_steps
                    
                    #add exp 
                    memory.add((state, action, reward, next_state, done))
        
                ################Learning#######################    
                #extract minibatch values 
                mini_batch = memory.sample(bs)
                states_batch = np.array([sample[0] for sample in mini_batch])
                actions_batch = np.array([sample[1] for sample in mini_batch])
                rewards_batch = np.array([sample[2] for sample in mini_batch])
                next_states_batch = np.array([sample[3] for sample in mini_batch])
                dones_batch = np.array([sample[4] for sample in mini_batch])
                
                #target qs to be set below 
                targ_qs_batch = []
                
                #get predicted Q's for each next state 
                next_qs_batch = sess.run(DQNetwork.out, feed_dict={DQNetwork.inputs : next_states_batch})
                
                #determine if state is terminal and set value for target_q
                for i in range(bs):
                    final_state = dones_batch[i]
                    
                    if final_state:
                        targ_qs_batch.append(rewards_batch[i])
                    
                    else:   
                        target_q = rewards_batch[i] + gamma * np.max(next_qs_batch[i])
                        targ_qs_batch.append(target_q)
                    
                
                #convert to np array 
                target_q = np.array([i for i in targ_qs_batch])
                
                #determine loss 
                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optim], feed_dict={DQNetwork.inputs: states_batch,
                                                                               DQNetwork.target_Q: targ_qs_batch,
                                                                               DQNetwork.actions: actions_batch})
                
                
                #perform gradient update 
                
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print ("Episode: \t", episode)
                
                print("Model Saved")              

FileDoesNotExistException: File "basic.wad" does not exist.

In [42]:
test_episodes = 1 

with tf.Session() as sess:
    
    #setup env 
    game, possible_actions = init_env()
    
    #load the model 
    saver.restore(sess, "./models/model.ckpt")
    game.init()

    #run for each episode  
    for episode in range(test_episodes):
             
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
        while not game.is_episode_finished():
            #choose action corresponding to best q val 
            q_preds = sess.run(DQNetwork.out, feed_dict={DQNetwork.inputs:state})            
            choice = np.argmax(q_preds)
            action = possible_actions[int(choice)]
            
            #take action
            game.make_action(action)
            done = game.is_episode_finished()
            score = game.get_total_reward()
            
            if done:  #if game is done 
                break 
            
            else:
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state 
            
        print (""">>>>>>>>>>>>>>>>>>>> TESTING SUMMARY >>>>>>>>>>>>>>>>>>>>
                    Total Reward: {}""".format(game.get_total_reward())) 
            

    #env.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt

                >>>>>>>>>>>>>>>>>>>> TESTING SUMMARY >>>>>>>>>>>>>>>>>>>>
                Reward: 0.0
