## Import Packages

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from vizdoom import *


from skimage import transform 
from skimage.color import rgb2gray 

from collections import deque 

import time 
import random 
import warnings

warnings.filterwarnings('ignore')

## Init Environment

In [12]:
def init_env():
    """
    Create and start the ViZDoom game used by this notebook.

    Loads "basic.cfg", points the engine at "basic.wad" and calls init().

    Returns:
        (game, possible_actions): the initialised DoomGame and a list of
        three one-hot button vectors.
    """
    doom = DoomGame()
    doom.load_config("basic.cfg")
    doom.set_doom_scenario_path("basic.wad")
    doom.init()

    # One-hot button vectors. The labels follow the names used in the original
    # code; the real button order is whatever basic.cfg declares (not shown here).
    action_table = [
        [1, 0, 0],  # left
        [0, 1, 0],  # right
        [0, 0, 1],  # shoot
    ]

    return doom, action_table

In [14]:
#call init_env once so that `game` and `possible_actions` exist at module level for the cells below
game, possible_actions = init_env()

## Preprocessing 

In [15]:
def preprocess_frame(frame):
    """
    Crop, normalise and resize a single screen buffer.

    Steps:
        1. crop: drop 30 rows at the top, 10 rows at the bottom and
           30 columns on each side
        2. normalise: divide pixel values by 255.0
        3. resize: the first two axes become 84x84

    No grayscale conversion is done here; the original code notes that
    ViZDoom already delivers gray frames.

    Returns the preprocessed frame.
    """
    trimmed = frame[30:-10, 30:-30]
    scaled = trimmed / 255.0

    return transform.resize(scaled, [84, 84])

def stack_frames(stacked_frames, frame, is_new_episode, stack_size=4):
    """
    Preprocess `frame` and merge it into the rolling stack of recent frames.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
            Ignored (and replaced) when `is_new_episode` is True.
        frame: raw screen buffer; it is passed through preprocess_frame.
        is_new_episode: True to start a fresh stack, False to push onto the
            existing one.
        stack_size: number of frames kept in the stack (default 4).

    Returns:
        (stacked_state, stacked_frames): the frames stacked along axis=2
        (the last axis for 2-D frames), and the updated deque.
    """
    frame = preprocess_frame(frame)

    if is_new_episode:
        # No history yet: fill the whole stack with the first frame. Building the
        # deque directly avoids the old zero placeholders (wrong shape, and they
        # relied on the `np.int` alias that NumPy 1.24 removed).
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # The deque is bounded, so appending evicts the oldest frame.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames
  



In [25]:
def discount_and_normalize_rewards(episode_rewards, discount=None):
    """
    Turn per-step rewards of one episode into normalised discounted returns.

    The return at step i is r[i] + discount * return[i + 1], computed backwards
    from the last step. The returns are then shifted to zero mean and, when
    they are not all identical, scaled to unit standard deviation.

    Args:
        episode_rewards: sequence of per-step rewards (ints or floats).
        discount: discount factor. When None (the default) the module-level
            `gamma` is used, which matches the original one-argument call.

    Returns:
        1-D float64 array with one entry per step. Empty input yields an
        empty array; zero-variance returns yield zeros instead of NaN.
    """
    if discount is None:
        discount = gamma  # module-level hyper-parameter

    # Force float storage: zeros_like on integer rewards would truncate the
    # fractional discounted values.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)

    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount + rewards[i]
        discounted_episode_rewards[i] = cumulative

    if discounted_episode_rewards.size == 0:
        return discounted_episode_rewards

    centered = discounted_episode_rewards - np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)

    # A single-step episode (or constant returns) has std == 0; dividing would
    # produce NaN and poison the loss, so only rescale when it is safe.
    if std > 0:
        return centered / std
    return centered

In [16]:
stack_size = 4

#init frame stack with blank placeholder frames
#(`int` replaces the `np.int` alias, which NumPy 1.24 removed; the dtype is the same)
stacked_frames = deque([np.zeros((110,84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)
frame = np.zeros((110,84), dtype=int)

#smoke-test the helper functions: a new-episode call should give an (84, 84, 4) state
sample_state, stacked_frames = stack_frames(stacked_frames, frame, True)
print(np.shape(sample_state), np.shape(stacked_frames))

(84, 84, 4) (4, 84, 84)


## Hyper Params

In [19]:
#training params
total_episodes = 5000 #number of training-loop iterations (one batch is gathered per iteration)
total_test_episodes = 10 #total num of episodes to test on (NOTE(review): the test cell sets its own test_episodes instead)
max_steps = 5000 #max num of steps per episode (NOTE(review): not referenced by the cells visible here)
bs = 64 #batch size handed to create_batch

#network params
lr = 0.0002 #learning rate
state_size = [84,84,4]  #four stacked 84x84 frames
action_size = game.get_available_buttons_size() #number of buttons reported by the game; presumably 3 for basic.cfg -- confirm

#Fixed Targets Params
#NOTE(review): looks like a leftover from a DQN notebook; not referenced by the cells visible here
max_tau = 10000

#discount factor used by discount_and_normalize_rewards
gamma = 0.95 

#exploration params
#NOTE(review): the policy samples actions from its softmax output, so these epsilon-greedy
#values are not referenced by the cells visible here
epsilon = 1.0 #starting value for eps greedy (explore)
max_eps = 1.0  #max value for eps greedy
min_eps = 0.01 #min value for eps greedy
decay_rate = 0.0001 #decay rate for eps

#Recall Params
#NOTE(review): replay-memory settings; not referenced by the cells visible here
memory_size = 1000000
pretrained_len = bs #number of init memories

#Training Mode: default for Policy_Gradient's `training` argument and guard for the training loop
training=True 

#Env should be rendered or not (NOTE(review): not referenced by the cells visible here)
episode_render = True 



## REINFORCE: Simple Model 


In [20]:
class Policy_Gradient():
    """
    REINFORCE policy network (TensorFlow 1.x graph mode).

    Architecture: three conv -> batch-norm -> ELU blocks, a 512-unit ELU dense
    layer, and a linear layer producing one logit per action.

    Placeholders:
        inputs: float32, [None, *state_size]
        actions: float32, [None, action_size]; one-hot vectors of the actions taken
        discounted_eps_rewards: float32, [None]; per-step discounted returns
            (also reachable as `discounted_episode_rewards_`)
        target_Q: unused by this model; kept so existing attribute access still works

    Outputs:
        softmax: action probabilities
        loss: mean of (cross-entropy of the taken action * discounted return)
        optim: RMSProp training op (also runs the batch-norm update ops)
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQN', training=training):
        """
        Build the graph inside variable scope `name`.

        Args:
            state_size: shape of one stacked state, e.g. [84, 84, 4].
            action_size: number of discrete actions.
            learning_rate: RMSProp learning rate.
            name: variable scope for all ops/variables of this network.
            training: forwarded to the batch-norm layers.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.lr = learning_rate
        self.training = training

        with tf.variable_scope(name):
            #placeholders
            self.inputs = tf.placeholder(tf.float32, [None, *self.state_size])
            self.actions = tf.placeholder(tf.float32, [None, self.action_size])
            self.target_Q = tf.placeholder(tf.float32)
            self.discounted_eps_rewards = tf.placeholder(tf.float32, [None,])
            # Alias: the training cell feeds the placeholder under this name.
            self.discounted_episode_rewards_ = self.discounted_eps_rewards

            #block 1
            self.conv1, self.bn1, self.elu1 = self._conv_block(
                self.inputs, filters=32, kernel_size=[8,8], strides=(4,4))

            #block 2
            self.conv2, self.bn2, self.elu2 = self._conv_block(
                self.elu1, filters=64, kernel_size=[4,4], strides=(2,2))

            #block 3
            self.conv3, self.bn3, self.elu3 = self._conv_block(
                self.elu2, filters=128, kernel_size=[4,4], strides=(2,2))

            #FC Block
            self.flat = tf.layers.flatten(self.elu3)
            self.fc = tf.layers.dense(self.flat, units=512, activation=tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer())
            # One logit per action (was hard-coded to 3).
            self.logits = tf.layers.dense(self.fc, units=self.action_size, activation=None,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer())

            #dealing with logits
            self.softmax = tf.nn.softmax(self.logits)

            # Cross-entropy against the one-hot action taken == -log pi(a|s).
            # The labels must be the `actions` placeholder, not the integer action count.
            self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits,
                                                                           labels=self.actions)

            #Loss Function
            self.loss = tf.reduce_mean(self.neg_log_prob*self.discounted_eps_rewards)

            #Optimizer
            # Batch-norm moving averages are only updated when the UPDATE_OPS
            # collection is run, so tie those ops to the training step.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=name)
            with tf.control_dependencies(update_ops):
                self.optim = tf.train.RMSPropOptimizer(learning_rate=self.lr).minimize(self.loss)

    def _conv_block(self, inputs, filters, kernel_size, strides):
        """Conv2D (valid padding, Xavier init) -> batch norm -> ELU; returns (conv, bn, elu)."""
        conv = tf.layers.conv2d(inputs=inputs, filters=filters,
                                kernel_size=kernel_size,
                                strides=strides,
                                padding='valid',
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d())
        bn = tf.layers.batch_normalization(conv, epsilon=1e-5, training=self.training)
        elu = tf.nn.elu(bn)
        return conv, bn, elu
           

In [10]:
#clear the default graph before building the network
tf.reset_default_graph()

num_actions = len(possible_actions)

#build the policy network; this notebook uses a single network
#(scope name "PolicyGradient" overrides the class's default name)
REINFORCE = Policy_Gradient(state_size, num_actions, lr, name="PolicyGradient")


## Create Batch for Training

In [24]:
def create_batch(batch_size, stacked_frames):
    """
    Play complete episodes with the current policy until at least
    `batch_size` steps have been collected.

    Relies on the module-level `game`, `sess`, `REINFORCE`, `possible_actions`
    and `state_size`.

    Args:
        batch_size: minimum number of steps to gather. Episodes are never cut
            short, so the batch may contain more steps than this.
        stacked_frames: frame deque handed to stack_frames; it is rebuilt at
            the start of every episode.

    Returns:
        (states, actions, rewards, discounted_rewards, episode_number):
        stacked states, the action vectors taken, the raw per-step rewards,
        the per-episode normalised discounted returns concatenated in step
        order, and the number of episodes played.
    """
    state_list, actions_list, rewards_of_eps, rewards_of_batch, discounted_rewards = [], [], [], [], []
    episode_number = 1
    steps_collected = 0

    #launch new episode and build its first stacked state
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    while True:
        #calculate output of nn
        action_prob_dist = sess.run(REINFORCE.softmax,
                                    feed_dict={REINFORCE.inputs: state.reshape(1, *state_size)})

        #choose stochastically
        choice = np.random.choice(range(action_prob_dist.shape[1]), p=action_prob_dist.ravel())
        action = possible_actions[choice]

        #take action and check if finished
        reward = game.make_action(action)
        done = game.is_episode_finished()

        #store results
        state_list.append(state)
        actions_list.append(action)
        rewards_of_eps.append(reward)

        if not done:
            #continue to next state: push the NEW raw frame onto the stack
            next_frame = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, next_frame, False)
            continue

        #episode finished: bank its rewards and discounted returns
        rewards_of_batch.append(rewards_of_eps)
        discounted_rewards.append(discount_and_normalize_rewards(rewards_of_eps))
        steps_collected += len(rewards_of_eps)

        #stop once enough steps are gathered (without this the loop never ends)
        if steps_collected >= batch_size:
            break

        #reset stores
        rewards_of_eps = []
        episode_number += 1

        #new episode, new stack
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    return (np.stack(state_list),
            np.array(actions_list),
            np.concatenate(rewards_of_batch),
            np.concatenate(discounted_rewards),
            episode_number)

## Training: Policy Gradient

In [31]:
# Keep track of all rewards total for each batch
allRewards = []
total_rewards = 0
maximumRewardRecorded = 0
mean_reward_total = []
epoch = 1
average_reward = []

#saver: must be created after the network is built so it captures its variables
saver = tf.train.Saver()

#training loop
if training:
    with tf.Session() as sess:
        # Variables have to be initialised before the network can be run.
        sess.run(tf.global_variables_initializer())

        for eps in range(total_episodes):
            # 1-based counter used for reporting and for the running average
            epoch = eps + 1

            # Gather training data
            states_mb, actions_mb, rewards_of_batch, discounted_rewards_mb, nb_episodes_mb = create_batch(bs, stacked_frames)

            ### This part is used for analytics
            # Calculate the total reward of the batch
            total_reward_of_that_batch = np.sum(rewards_of_batch)
            allRewards.append(total_reward_of_that_batch)

            # Calculate the mean reward of the batch
            # Total rewards of batch / nb episodes in that batch
            mean_reward_of_that_batch = np.divide(total_reward_of_that_batch, nb_episodes_mb)
            mean_reward_total.append(mean_reward_of_that_batch)

            # Calculate the average reward of all training
            # sum of per-batch means / number of batches so far
            average_reward_of_all_training = np.divide(np.sum(mean_reward_total), epoch)

            # Calculate maximum reward recorded
            maximumRewardRecorded = np.amax(allRewards)

            print("==========================================")
            print("Epoch: ", epoch, "/", total_episodes)
            print("-----------")
            print("Number of training episodes: {}".format(nb_episodes_mb))
            print("Total reward: {}".format(total_reward_of_that_batch))
            print("Mean Reward of that batch {}".format(mean_reward_of_that_batch))
            print("Average Reward of all training: {}".format(average_reward_of_all_training))
            print("Max reward for a batch so far: {}".format(maximumRewardRecorded))

            # Feedforward, gradient and backpropagation
            loss_, _ = sess.run([REINFORCE.loss, REINFORCE.optim],
                                feed_dict={REINFORCE.inputs: states_mb.reshape((len(states_mb), *state_size)),
                                           REINFORCE.actions: actions_mb,
                                           REINFORCE.discounted_eps_rewards: discounted_rewards_mb})

            print("Training Loss: {}".format(loss_))

            # Checkpoint every 5th iteration (starting with the first)
            if eps % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

ViZDoomErrorException: Unexpected ViZDoom instance crash.

In [4]:
test_episodes = 50

#saver for restoring the trained weights (harmless if one was already created)
saver = tf.train.Saver()

with tf.Session() as sess:

    #setup env (init_env already calls game.init(), so it is not called again here)
    game, possible_actions = init_env()

    try:
        #load the model
        saver.restore(sess, "./models/model.ckpt")
        total_score = 0

        #run for each episode
        for episode in range(test_episodes):

            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while not game.is_episode_finished():
                #act greedily: pick the action with the highest policy probability
                action_probs = sess.run(REINFORCE.softmax,
                                        feed_dict={REINFORCE.inputs: state.reshape((1, *state.shape))})
                choice = np.argmax(action_probs)
                action = possible_actions[int(choice)]

                #take action
                game.make_action(action)

                if game.is_episode_finished():
                    break

                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state

            #read the score after the loop so it is always defined
            score = game.get_total_reward()
            print (""">>>>>>>>>>>>>>>>>>>> TESTING SUMMARY >>>>>>>>>>>>>>>>>>>>
                    Total Reward: {}""".format(score))
            total_score += score

    finally:
        #always shut the ViZDoom instance down, even if testing fails
        game.close()

    print ("****************************")
    print ("Average Reward over {} episodes:\t".format(test_episodes), total_score/float(test_episodes))
    

NameError: name 'tf' is not defined