In [30]:
import tensorflow as tf             #Deep Learning library
import numpy as np                  #Handle matrices
from vizdoom import *               #Doom Enviroment

from skimage import transform       #Helps to preprocess the frames
import time                         #Handles time calculationsfundood

import matplotlib.pyplot as plt     #Display graphs

from collections import deque       #Ordered collection with ends

import random
import warnings                     #This will ignore all the warning messages that are normally printed during the training,
                                    #caused by skiimage
warnings.filterwarnings('ignore')

In [31]:
#Creates the Enviroment
def create_enviroment():
    game = DoomGame()

    #Load the correct configuration 
    game.load_config("/mnt/fbc0d8ef-f929-44c3-b2ee-3c536d9e0645/QLearning/scenarios/basic.cfg")
    #load the right scenario(in this case the basic scenario)
    game.set_doom_scenario_path("/mnt/fbc0d8ef-f929-44c3-b2ee-3c536d9e0645/QLearning/scenarios/basic.wad")
    
    game.init()
    
    # Here our possible actions
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]
    
    return game, possible_actions

"""
Use Random Actions to Test the Enviroment
"""
def test_environment():
    game = DoomGame()
    game.load_config("/mnt/fbc0d8ef-f929-44c3-b2ee-3c536d9e0645/QLearning/scenarios/basic.cfg")
    game.set_doom_scenario_path("/mnt/fbc0d8ef-f929-44c3-b2ee-3c536d9e0645/QLearning/scenarios/basic.wad")
    game.init()
    
    shoot = [0, 0, 1]
    left = [1, 0, 0]
    right = [0, 1, 0]
    actions = [shoot, left, right]

    episodes = 10
    for i in range(episodes):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(actions)
            print(action)
            reward = game.make_action(action)
            print ("\treward:", reward)
            time.sleep(0.02)
        print ("Result:", game.get_total_reward())
        time.sleep(2)
    game.close()

In [32]:
#Create the Enviroment and attain a list of possible actions to take
game, possible_actions = create_enviroment()

In [33]:
#Preprocessing Function
#The frame is preprocessed to reduce the complexity of our states and to reduce the computation time needed to complete
#the training
def preprocess_frame(frame):
    #Greyscale frame already taken care of vizdoom config
    
    # Crop the screen (remove the roof because it contains no information)
    #[Up: Down, Left: Right]---->30 down from the top, -10 up from the bottom , 30 from the left ,-30 from the right
    cropped_frame = frame[30:-10, 30:-30]
    
    #Normalize pixel Values
    normalized_frame = cropped_frame/255.0
    
    #Resize
    preprocessed_frame = transform.resize(normalized_frame, [84,84])
    
    return preprocessed_frame 

In [34]:
#At each time-step we stack every fourth frame
#The frame is first preprocessed and the appended to the double ended queue deque
#Every time a frame is appended to the deque it removes the oldest frame from the deque
#Finally we build the stacked state
stack_size = 4        #Stack 4 frames

#Initialize deque with zero-images one array for each image
stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], maxlen = 4)

def stack_frames(stacked_frames, state, is_new_episode):
    #Preprocess frame
    frame = preprocess_frame(state)
    
    if is_new_episode:
        #Clear stacked_frames
        stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], maxlen = 4)
        
        #We are in a new episode, append the same frame 4 times
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        #Stack the frames across axis 2(depth 4) --> Join a sequence of arrays along new axis
        stacked_state = np.stack(stacked_frames, axis = 2)
    else:
        #Append frame to deque, automatically removes the oldest frame
        stacked_frames.append(frame)
        
        """
        Build the stacked state --> Generates a tensor because stacked_frames is currently 
        just a double ended queue.
        """
        stacked_state = np.stack(stacked_frames, axis = 2)
    return stacked_state, stacked_frames

In [35]:
#Hyperparameters

#### Model Hyperparameters
state_size = [84, 84, 4]            #Out input is a stack of 4 frames hence 84x84x4 (Width, height, channels)
action_size = game.get_available_buttons_size()     #3 possbible actions
learning_rate = 0.00025              #Alpha (learning rate)

#### Training Hyperparameters
total_episodes = 1000                #Total episodes for training
max_steps = 200                      #Max possible steps in an episode
batch_size = 64                      #Batch size

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0                  #Exploration probability at start
explore_stop = 0.01                  #Minimum exploration probability
decay_rate = 0.0001                  #Exponential decay rate for exploration probabilty

# Q learning hyperparameters
gamma = 0.90                         #Discounting rate

### Memory hyperparameters
pretrain_length = batch_size         #Number of experiences stored in the Memory when initialized
memory_size = 500000

### Modify this to FALSE if you just want to see the Trained Agnet
training = True

### Turn this to TRUE if you want to render the enviorment
episode_render = True

In [36]:
#Create The Deep Q-Learning Neural Network Model
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name ='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        #Relates to variable sharing
        with tf.variable_scope(name):
            # We create the placeholders
            # state_size = [84,84,4]   ------ *state_size = 84,84,4 basically removes brackets
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name = "inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, 3], name = "actions_")
            
            # Remember that target_Q is the R(s,a) + ymax Qhat(s',a')
            #Maximum possible Qvalue for the next_state = Q_target
            #Estimation since we can't have real Q value
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")
            
            """
            First Convolutional Layer
            CNN
            ELU ---> Activation fuction that provides better results for a CNN
            """
            # Input is 84X84x4
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_, 
                                          filters = 32, kernel_size = [8,8], strides = [4,4], padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv1")
            #Batch Normalization 
            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, training = True, 
                                                                 epsilon = 1e-5, name = 'batch_norm1')
            #ELU activation function
            self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name = "conv1_out")
            ## ----> [20, 20, 32]
            
            """
            Secound Convolutional Layer
            CNN
            ELU
            """
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out, 
                                          filters = 64, kernel_size = [4,4], strides = [2,2], padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv2")
            #Batch Normalization 
            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, training = True, 
                                                                 epsilon = 1e-5, name = 'batch_norm2')
            #ELU activation function
            self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")
            ## --> [9, 9, 64]
            
            """
            Third Convolutional Layer
            CNN
            ELU
            """
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out, 
                                          filters = 64, kernel_size = [2,2], strides = [1,1], padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv3")
            #Batch Normalization 
            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, training = True, 
                                                                 epsilon = 1e-5, name = 'batch_norm3')
            
            #ELU activation function
            self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")
            ## --> [3, 3, 128]
            
            """
            Flatten
            """
            
            self.flatten = tf.contrib.layers.flatten(self.conv3_out)
            ## --> [1152]
            
            """
            Fullyconnected Layers
            512 units -----> action size(8 possible actions)
            """
            self.fc = tf.layers.dense(inputs = self.flatten, units = 512, 
                                     activation = tf.nn.elu, 
                                     kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                     name = "fc1")
            
            #Outputs Q value for each Action
            self.output = tf.layers.dense(inputs = self.fc , units = 3, 
                                     activation = None, 
                                     kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                     name = "output")
            
            
            # Q is our predicted Q value
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis = 1)
            
            
            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Qtarget - Q)^2
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [37]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork
DQNetwork = DQNetwork(state_size, action_size, learning_rate)

In [38]:
#Experience Replay
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
        
    def add(self, experience):
        self.buffer.append(experience)     #Appends and experience to the expierence buffer
    
    def sample(self, batch_size):
        buffer_size = len(self.buffer)     #Attains length of buffer
        #Generates a random sample from a 1D array
        index = np.random.choice(np.arange(buffer_size), size = batch_size, replace = False)
        return [self.buffer[i] for i in index] #Sample of expirences

In [39]:
#Fill empty memory by taking random actions and storing state, next_state, reward, and action
# Instantiate the Memory 
memory = Memory(max_size = memory_size)

# Render the environment
game.new_episode()

#pretrain_length = batch_size = 64 --> Number of expirences in memory when initialized
for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
       # First we need a state
        state = game.get_state().screen_buffer
        #Is a new episode so stack the same state 4 times
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    # Random action
    action = random.choice(possible_actions)
    
    # Get the rewards
    reward = game.make_action(action)
    
    # Look if the episode is finished
    done = game.is_episode_finished()
    
    #If the episode is finished(we're dead 3x)
    if done:
        # We finsihed the episode so we make the last state a zero array
        next_state = np.zeros(state.shape)
        
        # Add experience to memory -- > state and new state(zero array)
        memory.add((state, action, reward, next_state, done))
        
        #Start a new episode
        game.new_episode()
        
        #Attain State
        state = game.get_state().screen_buffer
        
        #Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        #Attain State
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
            
        # Our new state is now the next_state
        state = next_state 

In [40]:
#Setup Tensorboard Writer
writer = tf.summary.FileWriter("./tensorboard/dqn/1")

## Loesses
tf.summary.scalar("Loss", DQNetwork.loss)

write_op = tf.summary.merge_all()

In [43]:
"""
Train the Agent
1.Sampling process --> sample the enviorment where we preform an action and store the observed expirences into
  the replay memory
2.Training select a small batch of random expirences and use it to preform a gradient descent update
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    ##EPSLION GREEDY STRATEGY
    #Choose an action from state S using epsilon greedy.
    ##First choose random number from 0 to 1
    exp_exp_tradeoff = np.random.rand()
    
    """
    Here we will use the epsilon greedy strategy--> making use of expoential decay to reduce the epsilon value
    over time. Epislon value will start of as 1 and slowly reduce. As the epsilon value reduces the algorithum 
    will explore less and exploit its attained expirence to gain rewards
    """
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    #Explore
    if (explore_probability > exp_exp_tradeoff):
        #Make a random action (exploration)
        action = random.choice(possible_actions)
    #Exploit knowledge gained from past expireances
    else:
        # Get action from Q-network (exploitation) ---> leverage past expierence 
        # Estimate the Qs values state
        # Note 1 in reshape coresponds to batch column in network
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        #Take the biggest Q value which is the best action --Estimate of largest future reward
        choice = np.argmax(Qs)
        action = possible_actions[choice]
        
    return action, explore_probability

#The saver will save our model
saver = tf.train.Saver()

In [18]:
#The saver will save our model
saver = tf.train.Saver()

if training == True:
    with tf.Session() as sess:
        #Initialize the variables
        sess.run(tf.global_variables_initializer())
        
        #Initialize the decay rate that will be used to reduce epislon over time
        decay_step = 0
        
        for episode in range(total_episodes):
            #Set step to 0
            step = 0
            
            # Init the game
            game.init()
            
            #Initialize list containing rewards per episode
            episode_rewards = []
            
            #Make a new episode and observe the first state
            game.new_episode()
            state = game.get_state().screen_buffer
            
            """
            The stacked frame function will also call and make use of the preprocess function.
            The function will crop and reszie the image and then stack them in a double ended queue
            as a way to overcome the temporal limitation problem.
            """
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            
            while step < max_steps:
                step += 1
                
                #increase decay_step
                decay_step += 1
                
                #Predict an action to take and then implement that action
                action, explore_probability = predict_action(explore_start, 
                                                explore_stop, decay_rate, decay_step, state, possible_actions)
                
                #Perform the action and get the next_state, reward, and done information
                reward = game.make_action(action)
                
                #Check if the episode is finished
                done = game.is_episode_finished()
                
                # Add the reward to total reward , store the reward for each action in a list
                episode_rewards.append(reward)
                
                #If the game is finished
                if done:
                    #The episode end so no next state
                    next_state = np.zeros((84,84), dtype = np.int)
                    
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    #Set step = max_steps to end the episode
                    step = max_steps
                    
                    #Get the total reward of the episode by summing up the rewards for each action
                    total_reward = np.sum(episode_rewards)
                    
                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))
                    
                   # rewards_list.append((episode, total_reward))
                    
                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))
                else: 
                    # Get the next state
                    next_state = game.get_state().screen_buffer
                    
                    #Stack the frames of the next state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    #Add Experience to memory
                    memory.add((state, action, reward, next_state, done))
                    
                    #st+1 is now our current state
                    state = next_state
                    
                    ##########PART OF THE ALGORITHUM THAT LEARNS######################
                    #Obtain random mini-batch from memory
                    #Samples memory
                    batch = memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])
                    
                    target_Qs_batch = []
                    
                    #Get Q Values for the next state
                    Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                    
                    #Set Q_traget = r if the episode ends at s+1, else set Q_target = r + gamma*maxQ(s', a')
                    for i in range(0, len(batch)):
                        terminal = dones_mb[i]
                        
                        #If we are in a terminal state, only equals reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)
                    
                    targets_mb = np.array([each for each in target_Qs_batch])
                    
                    loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                    feed_dict={DQNetwork.inputs_: states_mb,
                                               DQNetwork.target_Q: targets_mb,
                                               DQNetwork.actions_: actions_mb})
                    
                    # Write TF Summaries
                    summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})
                    writer.add_summary(summary, episode)
                    writer.flush()
                    
            # Save model every 5 episodes
            if episode % 5 == 0:
                    save_path = saver.save(sess, "./models/model.ckpt")
                    print("Model Saved")
                

Episode: 0 Total reward: -27.0 Explore P: 0.9899 Training Loss 165.7057
Model Saved
Model Saved
Episode: 7 Total reward: -5.0 Explore P: 0.8716 Training Loss 0.3430
Episode: 8 Total reward: 9.0 Explore P: 0.8650 Training Loss 40.2349
Model Saved
Episode: 13 Total reward: 95.0 Explore P: 0.7988 Training Loss 1.3169
Episode: 14 Total reward: 73.0 Explore P: 0.7970 Training Loss 0.5638
Model Saved
Episode: 16 Total reward: 91.0 Explore P: 0.7806 Training Loss 6.4960
Episode: 18 Total reward: 94.0 Explore P: 0.7648 Training Loss 7.5944
Episode: 20 Total reward: 95.0 Explore P: 0.7494 Training Loss 7.0559
Model Saved
Episode: 23 Total reward: 92.0 Explore P: 0.7198 Training Loss 0.7300
Episode: 25 Total reward: 31.0 Explore P: 0.7019 Training Loss 0.4396
Model Saved
Episode: 27 Total reward: -1.0 Explore P: 0.6824 Training Loss 5.2571
Episode: 28 Total reward: 23.0 Explore P: 0.6781 Training Loss 2.2497
Episode: 30 Total reward: 93.0 Explore P: 0.6644 Training Loss 1.6465
Model Saved
Episod

Episode: 154 Total reward: 37.0 Explore P: 0.2672 Training Loss 4.1679
Episode: 155 Total reward: 71.0 Explore P: 0.2664 Training Loss 2.5131
Model Saved
Episode: 157 Total reward: 72.0 Explore P: 0.2607 Training Loss 2.4839
Episode: 158 Total reward: 34.0 Explore P: 0.2593 Training Loss 1.7686
Episode: 159 Total reward: 12.0 Explore P: 0.2574 Training Loss 2.9528
Episode: 160 Total reward: 94.0 Explore P: 0.2573 Training Loss 5.6054
Model Saved
Episode: 161 Total reward: -20.0 Explore P: 0.2548 Training Loss 3.3066
Episode: 162 Total reward: 49.0 Explore P: 0.2538 Training Loss 12.5380
Episode: 163 Total reward: 30.0 Explore P: 0.2524 Training Loss 2.4161
Episode: 164 Total reward: 57.0 Explore P: 0.2516 Training Loss 5.3780
Episode: 165 Total reward: 63.0 Explore P: 0.2508 Training Loss 2.3877
Model Saved
Episode: 166 Total reward: 77.0 Explore P: 0.2502 Training Loss 2.2992
Episode: 167 Total reward: 58.0 Explore P: 0.2493 Training Loss 6.0660
Episode: 168 Total reward: 11.0 Explore

Episode: 282 Total reward: 88.0 Explore P: 0.1389 Training Loss 3.3595
Episode: 283 Total reward: 95.0 Explore P: 0.1388 Training Loss 3.2703
Episode: 285 Total reward: 61.0 Explore P: 0.1358 Training Loss 1.5539
Model Saved
Episode: 286 Total reward: 92.0 Explore P: 0.1357 Training Loss 1.4624
Episode: 287 Total reward: 88.0 Explore P: 0.1355 Training Loss 1.4891
Episode: 288 Total reward: 72.0 Explore P: 0.1352 Training Loss 4.2779
Episode: 290 Total reward: 49.0 Explore P: 0.1322 Training Loss 3.8268
Model Saved
Episode: 291 Total reward: 69.0 Explore P: 0.1319 Training Loss 4.7290
Episode: 292 Total reward: 95.0 Explore P: 0.1318 Training Loss 2.0570
Episode: 293 Total reward: 86.0 Explore P: 0.1316 Training Loss 4.1843
Episode: 294 Total reward: 95.0 Explore P: 0.1316 Training Loss 1.8005
Episode: 295 Total reward: 95.0 Explore P: 0.1315 Training Loss 4.0690
Model Saved
Episode: 297 Total reward: 60.0 Explore P: 0.1287 Training Loss 1.8859
Episode: 298 Total reward: 63.0 Explore P

Episode: 399 Total reward: 79.0 Explore P: 0.0981 Training Loss 3.2681
Episode: 400 Total reward: 68.0 Explore P: 0.0979 Training Loss 6.8805
Model Saved
Episode: 401 Total reward: 86.0 Explore P: 0.0977 Training Loss 3.7146
Episode: 402 Total reward: 67.0 Explore P: 0.0975 Training Loss 1.7672
Episode: 403 Total reward: 54.0 Explore P: 0.0971 Training Loss 4.5166
Episode: 404 Total reward: 60.0 Explore P: 0.0968 Training Loss 1.4448
Episode: 405 Total reward: 64.0 Explore P: 0.0965 Training Loss 7.8011
Model Saved
Episode: 406 Total reward: 89.0 Explore P: 0.0964 Training Loss 3.5303
Episode: 407 Total reward: 58.0 Explore P: 0.0961 Training Loss 4.7844
Episode: 408 Total reward: 76.0 Explore P: 0.0959 Training Loss 9.8946
Episode: 409 Total reward: 76.0 Explore P: 0.0957 Training Loss 4.0694
Episode: 410 Total reward: 80.0 Explore P: 0.0956 Training Loss 5.5422
Model Saved
Episode: 411 Total reward: 77.0 Explore P: 0.0954 Training Loss 7.4817
Episode: 412 Total reward: 85.0 Explore P

Episode: 513 Total reward: 80.0 Explore P: 0.0740 Training Loss 3.0969
Episode: 514 Total reward: 74.0 Explore P: 0.0739 Training Loss 3.2449
Episode: 515 Total reward: 58.0 Explore P: 0.0736 Training Loss 2.4500
Model Saved
Episode: 516 Total reward: 86.0 Explore P: 0.0735 Training Loss 4.0780
Episode: 517 Total reward: 93.0 Explore P: 0.0735 Training Loss 6.0169
Episode: 518 Total reward: 61.0 Explore P: 0.0732 Training Loss 2.1515
Episode: 519 Total reward: 61.0 Explore P: 0.0730 Training Loss 6.7753
Episode: 520 Total reward: 60.0 Explore P: 0.0728 Training Loss 2.5635
Model Saved
Episode: 521 Total reward: 69.0 Explore P: 0.0726 Training Loss 3.1743
Episode: 522 Total reward: 95.0 Explore P: 0.0726 Training Loss 2.9943
Episode: 523 Total reward: 94.0 Explore P: 0.0725 Training Loss 2.5272
Episode: 524 Total reward: 94.0 Explore P: 0.0725 Training Loss 2.9811
Episode: 525 Total reward: 48.0 Explore P: 0.0722 Training Loss 4.4569
Model Saved
Episode: 526 Total reward: 67.0 Explore P

Episode: 633 Total reward: 81.0 Explore P: 0.0515 Training Loss 4.3743
Episode: 634 Total reward: 60.0 Explore P: 0.0513 Training Loss 3.4251
Episode: 635 Total reward: 85.0 Explore P: 0.0513 Training Loss 2.4439
Model Saved
Episode: 636 Total reward: 82.0 Explore P: 0.0512 Training Loss 4.3621
Episode: 637 Total reward: 88.0 Explore P: 0.0511 Training Loss 3.9158
Episode: 638 Total reward: 93.0 Explore P: 0.0511 Training Loss 2.3438
Episode: 639 Total reward: 50.0 Explore P: 0.0509 Training Loss 4.6538
Episode: 640 Total reward: 58.0 Explore P: 0.0507 Training Loss 2.9801
Model Saved
Episode: 641 Total reward: 95.0 Explore P: 0.0507 Training Loss 1.5543
Episode: 642 Total reward: 83.0 Explore P: 0.0506 Training Loss 8.5632
Episode: 643 Total reward: 87.0 Explore P: 0.0506 Training Loss 1.4771
Episode: 644 Total reward: 58.0 Explore P: 0.0504 Training Loss 6.4188
Episode: 645 Total reward: 80.0 Explore P: 0.0504 Training Loss 5.1480
Model Saved
Episode: 646 Total reward: 65.0 Explore P

Episode: 749 Total reward: 49.0 Explore P: 0.0399 Training Loss 6.9567
Episode: 750 Total reward: 95.0 Explore P: 0.0399 Training Loss 5.5512
Model Saved
Episode: 751 Total reward: 40.0 Explore P: 0.0398 Training Loss 9.0889
Episode: 752 Total reward: 90.0 Explore P: 0.0397 Training Loss 1.7553
Episode: 753 Total reward: 85.0 Explore P: 0.0397 Training Loss 6.6505
Episode: 754 Total reward: 80.0 Explore P: 0.0396 Training Loss 8.3840
Episode: 755 Total reward: 85.0 Explore P: 0.0396 Training Loss 2.3633
Model Saved
Episode: 756 Total reward: 67.0 Explore P: 0.0395 Training Loss 2.2336
Episode: 757 Total reward: 76.0 Explore P: 0.0394 Training Loss 3.1714
Episode: 758 Total reward: 81.0 Explore P: 0.0394 Training Loss 4.7609
Episode: 759 Total reward: 86.0 Explore P: 0.0393 Training Loss 4.6905
Episode: 760 Total reward: 69.0 Explore P: 0.0392 Training Loss 4.2119
Model Saved
Episode: 761 Total reward: 90.0 Explore P: 0.0392 Training Loss 6.0302
Episode: 762 Total reward: 95.0 Explore P

Episode: 865 Total reward: 64.0 Explore P: 0.0313 Training Loss 6.7560
Model Saved
Episode: 866 Total reward: 81.0 Explore P: 0.0313 Training Loss 3.3870
Episode: 867 Total reward: 91.0 Explore P: 0.0313 Training Loss 4.5489
Episode: 868 Total reward: 74.0 Explore P: 0.0312 Training Loss 2.8897
Episode: 869 Total reward: 60.0 Explore P: 0.0311 Training Loss 4.1775
Episode: 870 Total reward: 77.0 Explore P: 0.0311 Training Loss 9.6188
Model Saved
Episode: 871 Total reward: 88.0 Explore P: 0.0311 Training Loss 2.9421
Episode: 872 Total reward: 60.0 Explore P: 0.0310 Training Loss 11.4547
Episode: 873 Total reward: 92.0 Explore P: 0.0310 Training Loss 26.0982
Episode: 875 Total reward: -15.0 Explore P: 0.0304 Training Loss 7.6093
Model Saved
Episode: 876 Total reward: 72.0 Explore P: 0.0303 Training Loss 3.0582
Episode: 877 Total reward: 84.0 Explore P: 0.0303 Training Loss 7.7211
Episode: 878 Total reward: 82.0 Explore P: 0.0302 Training Loss 1.6527
Episode: 879 Total reward: 83.0 Explor

Episode: 985 Total reward: 54.0 Explore P: 0.0239 Training Loss 5.1025
Model Saved
Episode: 986 Total reward: 92.0 Explore P: 0.0239 Training Loss 1.9653
Episode: 987 Total reward: 77.0 Explore P: 0.0238 Training Loss 1.7970
Episode: 988 Total reward: 86.0 Explore P: 0.0238 Training Loss 3.4187
Episode: 989 Total reward: 85.0 Explore P: 0.0238 Training Loss 4.9621
Episode: 990 Total reward: 95.0 Explore P: 0.0238 Training Loss 3.5199
Model Saved
Episode: 991 Total reward: 86.0 Explore P: 0.0238 Training Loss 5.0193
Episode: 992 Total reward: 61.0 Explore P: 0.0237 Training Loss 6.4897
Episode: 993 Total reward: 73.0 Explore P: 0.0237 Training Loss 10.8818
Episode: 994 Total reward: 38.0 Explore P: 0.0236 Training Loss 6.0405
Episode: 995 Total reward: 95.0 Explore P: 0.0236 Training Loss 2.2631
Model Saved
Episode: 996 Total reward: 95.0 Explore P: 0.0236 Training Loss 2.1976
Episode: 997 Total reward: 94.0 Explore P: 0.0236 Training Loss 4.1743
Episode: 998 Total reward: 95.0 Explore 

In [46]:
with tf.Session() as sess:
    
    game, possible_actions = create_enviroment()
    
    totalScore = 0
    
    # Load the model
    saver.restore(sess, "./models/model.ckpt")
    game.init()
    for i in range(100000):
        
        done = False
        
        game.new_episode()
        
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
            
        while not game.is_episode_finished():
            # Take the biggest Q value (= the best action)
            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
            
            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = possible_actions[int(choice)]
            
            game.make_action(action)
            done = game.is_episode_finished()
            score = game.get_total_reward()
            
            if done:
                break  
                
            else:
                #print("else")
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state
                
        score = game.get_total_reward()
        print("Score: ", score)
    game.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Score:  77.0
Score:  95.0
Score:  74.0
Score:  53.0
Score:  77.0
Score:  84.0
Score:  73.0
Score:  95.0
Score:  86.0
Score:  79.0
Score:  75.0
Score:  88.0
Score:  57.0
Score:  83.0
Score:  68.0
Score:  73.0
Score:  84.0
Score:  92.0
Score:  93.0
Score:  36.0
Score:  83.0
Score:  82.0
Score:  73.0
Score:  1.0
Score:  54.0
Score:  73.0
Score:  73.0
Score:  75.0
Score:  83.0
Score:  82.0
Score:  74.0
Score:  74.0
Score:  95.0
Score:  95.0
Score:  58.0
Score:  54.0
Score:  65.0
Score:  90.0
Score:  92.0
Score:  82.0
Score:  95.0
Score:  54.0
Score:  54.0
Score:  72.0
Score:  84.0
Score:  77.0
Score:  81.0
Score:  74.0
Score:  77.0
Score:  73.0
Score:  88.0
Score:  80.0
Score:  84.0
Score:  74.0
Score:  84.0
Score:  53.0
Score:  77.0
Score:  77.0
Score:  95.0
Score:  69.0
Score:  88.0
Score:  67.0
Score:  95.0
Score:  81.0
Score:  95.0
Score:  82.0
Score:  95.0
Score:  54.0
Score:  85.0
Score:  77.0
Score:  95.0
Score:  82.0
Sco

Score:  86.0
Score:  95.0
Score:  77.0
Score:  84.0
Score:  95.0
Score:  77.0
Score:  57.0
Score:  67.0
Score:  86.0
Score:  81.0
Score:  80.0
Score:  73.0
Score:  85.0
Score:  50.0
Score:  54.0
Score:  80.0
Score:  54.0
Score:  85.0
Score:  83.0
Score:  77.0
Score:  35.0
Score:  86.0
Score:  57.0
Score:  95.0
Score:  81.0
Score:  54.0
Score:  89.0
Score:  95.0
Score:  95.0
Score:  68.0
Score:  83.0
Score:  95.0
Score:  89.0
Score:  77.0
Score:  72.0
Score:  83.0
Score:  81.0
Score:  77.0
Score:  81.0
Score:  65.0
Score:  73.0
Score:  57.0
Score:  19.0
Score:  83.0
Score:  74.0
Score:  86.0
Score:  83.0
Score:  95.0
Score:  95.0
Score:  77.0
Score:  95.0
Score:  77.0
Score:  95.0
Score:  73.0
Score:  95.0
Score:  53.0
Score:  77.0
Score:  77.0
Score:  89.0
Score:  75.0
Score:  79.0
Score:  95.0
Score:  81.0
Score:  89.0
Score:  53.0
Score:  95.0
Score:  95.0
Score:  77.0
Score:  82.0
Score:  95.0
Score:  54.0
Score:  77.0
Score:  65.0
Score:  77.0
Score:  85.0
Score:  73.0
Score:  73.0

Score:  16.0
Score:  77.0
Score:  54.0
Score:  85.0
Score:  73.0
Score:  95.0
Score:  70.0
Score:  80.0
Score:  73.0
Score:  54.0
Score:  79.0
Score:  95.0
Score:  69.0
Score:  84.0
Score:  83.0
Score:  95.0
Score:  54.0
Score:  73.0
Score:  92.0
Score:  81.0
Score:  35.0
Score:  88.0
Score:  73.0
Score:  80.0
Score:  64.0
Score:  77.0
Score:  85.0
Score:  68.0
Score:  82.0
Score:  81.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  84.0
Score:  69.0
Score:  80.0
Score:  95.0
Score:  81.0
Score:  92.0
Score:  92.0
Score:  73.0
Score:  77.0
Score:  95.0
Score:  87.0
Score:  72.0
Score:  54.0
Score:  49.0
Score:  85.0
Score:  95.0
Score:  91.0
Score:  77.0
Score:  77.0
Score:  16.0
Score:  73.0
Score:  95.0
Score:  87.0
Score:  54.0
Score:  95.0
Score:  95.0
Score:  54.0
Score:  95.0
Score:  89.0
Score:  95.0
Score:  35.0
Score:  81.0
Score:  80.0
Score:  95.0
Score:  95.0
Score:  77.0
Score:  74.0
Score:  72.0
Score:  82.0
Score:  82.0
Score:  77.0
Score:  74.0
Score:  73.0
Score:  88.0

Score:  95.0
Score:  82.0
Score:  73.0
Score:  95.0
Score:  94.0
Score:  95.0
Score:  50.0
Score:  54.0
Score:  84.0
Score:  77.0
Score:  77.0
Score:  82.0
Score:  73.0
Score:  57.0
Score:  82.0
Score:  95.0
Score:  73.0
Score:  83.0
Score:  77.0
Score:  74.0
Score:  83.0
Score:  82.0
Score:  80.0
Score:  95.0
Score:  58.0
Score:  95.0
Score:  95.0
Score:  81.0
Score:  85.0
Score:  77.0
Score:  83.0
Score:  92.0
Score:  85.0
Score:  85.0
Score:  84.0
Score:  77.0
Score:  77.0
Score:  54.0
Score:  73.0
Score:  77.0
Score:  69.0
Score:  82.0
Score:  68.0
Score:  95.0
Score:  82.0
Score:  95.0
Score:  82.0
Score:  82.0
Score:  77.0
Score:  85.0
Score:  79.0
Score:  69.0
Score:  -46.0
Score:  93.0
Score:  73.0
Score:  82.0
Score:  95.0
Score:  83.0
Score:  73.0
Score:  95.0
Score:  54.0
Score:  95.0
Score:  95.0
Score:  77.0
Score:  73.0
Score:  95.0
Score:  69.0
Score:  85.0
Score:  35.0
Score:  73.0
Score:  84.0
Score:  92.0
Score:  84.0
Score:  54.0
Score:  95.0
Score:  95.0
Score:  83.

SignalException: Signal SIGINT received. ViZDoom instance has been closed.