In [None]:
from vizdoom import *
import tensorflow as tf 
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

import random
import time
import datetime

from skimage import transform
from skimage.color import rgb2gray

import warnings
warnings.filterwarnings('ignore')

# --- ViZDoom game setup -------------------------------------------------
game = DoomGame()
# Alternative scenario: YOUR PATH TO DEATHMATCH CONFIG/ViZDoom-master/scenarios/deathmatch.cfg
#game.load_config("vizdoom/scenarios/deathmatch.cfg")
game.load_config("YOUR PATH/ViZDoom-master/scenarios/basic.cfg")
game.set_screen_resolution(ScreenResolution.RES_1280X1024)

buttons = game.get_available_buttons()
print(buttons)

# Only the buttons up to and including ATTACK are exposed to the agent.
n_actions = buttons.index(Button.ATTACK) + 1

# `actions` holds one one-hot vector per exposed button;
# `actions_dict` maps "<BUTTON NAME>-<1-based position>" to the same vector.
actions = []
actions_dict = {}
for i in range(n_actions):
    act = [0] * n_actions
    act[i] = 1
    # str(button) looks like "Button.ATTACK"; drop the 7-character "Button." prefix.
    act_name = str(buttons[i])[7:]
    actions.append(act)
    actions_dict[act_name + '-' + str(i + 1)] = act

In [None]:
class Agent:
    """Dueling Double-DQN agent with a prioritized replay memory.

    The agent owns two Keras models with the same architecture (`model`, the
    online network, and `target_model`, the periodically synchronised target
    network), an epsilon-greedy exploration rate stored in `explore_start`,
    and a SumTree-backed replay memory (`Agent.Memory`).
    """

    def __init__(self):
        # Discount factor used for the bootstrapped target in `replay`.
        self.gamma = 0.95

        # Exploration schedule: `explore_start` is the CURRENT epsilon and is
        # moved towards `explore_stop` by `increase_explotation`.
        self.explore_start = 1.0
        self.explore_stop = 0.01
        self.decay_rate = 0.0001

        self.state = self.get_state()
        # NOTE(review): the default is a single 3-element vector, so
        # len(self.actions) == 3 only coincidentally equals the number of
        # actions; callers are expected to overwrite `actions` afterwards.
        self.actions = self.get_actions()
        self.impressions = 0
        self.stack_size = 3

        self.tensorflowboard = 'YOUR PATH TO LOG FILES'

        # Replay memory: each cell stores one experience tuple
        # (frame, action, reward, next_frame, done).
        self.memory = self.Memory(capacity=100000)
        self.frame = self.make_frame()

        # Online network (Q) and target network (target Q).
        self.model = self.build_model()
        self.target_model = self.build_model()

    def get_state(self, state=np.zeros((3, 1024, 1280), dtype='uint8')):
        """Return `state` unchanged (an all-zero buffer when omitted)."""
        return state

    # For another game the default action vector could look like:
    # actions=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    def get_actions(self, actions=[1, 0, 0]):
        """Return `actions` unchanged (``[1, 0, 0]`` when omitted)."""
        return actions

    def act(self, state, actions=None):
        """Choose an action epsilon-greedily.

        Args:
            state: model input passed to `self.model.predict`.
            actions: candidate actions; defaults to `self.actions`.

        Returns:
            A random element of `actions` with probability `explore_start`,
            otherwise the element whose index maximises the predicted values.
        """
        # Resolve the default at call time: binding the module-level `actions`
        # in the signature made the class definition depend on cell order.
        if actions is None:
            actions = self.actions
        if np.random.rand() <= self.explore_start:
            return random.choice(actions)
        values_of_actions = self.model.predict(state)
        return actions[np.argmax(values_of_actions)]

    def make_frame(self, state=np.zeros((3, 1024, 1280), dtype='uint8')):
        """Turn a raw buffer into a (1, 500, 500, 1) network input.

        The frame is ``state[0] - state[1]`` (subtracting the planes is meant
        to convey motion), with the top 150 rows cropped and the remainder
        resized to 500x500.

        Inputs are scaled by 1/255 and subtracted as float32, so negative
        differences stay negative instead of wrapping around as they do in
        uint8 arithmetic.
        """
        first = np.asarray(state[0], dtype=np.float32) / 255.0
        second = np.asarray(state[1], dtype=np.float32) / 255.0
        dynamic_frame = first - second
        cropped_frame = dynamic_frame[150:, :]
        resized_frame = transform.resize(cropped_frame, [500, 500])
        return np.reshape(resized_frame, (1, 500, 500, 1))

    def get_impressions(self):
        """Return the impression counter incremented by one, capped at 64.

        64 is the maximum bucket; the training loop uses the counter as the
        replay batch size.
        """
        if self.impressions < 64:
            return self.impressions + 1
        return self.impressions

    def experience(self, frame, action, reward, next_frame, done):
        """Store one transition tuple in the replay memory."""
        self.memory.store((frame, action, reward, next_frame, done))

    def build_model(self):
        """Build and compile the network used for both online and target nets.

        Three Conv2D + BatchNormalization stages feed two dense streams
        (state value and per-action advantage) that are combined as
        V + (A - mean(A)).

        Returns:
            A compiled `tf.keras.Model` taking (500, 500, 1) inputs and
            producing `len(self.actions)` outputs.
        """
        print('Print after training: tensorboard --logdir={}'.format(self.tensorflowboard))
        # Reset Keras global state so stale models do not pile up, since every
        # new Agent() builds NEW models.
        tf.keras.backend.clear_session()

        input_layer = tf.keras.Input(shape=[500, 500, 1])
        conv2D_1 = tf.keras.layers.Conv2D(
                                         data_format='channels_last',
                                         filters=32,
                                         kernel_size=[60, 60],
                                         strides=(4, 4),
                                         padding='valid',
                                         activation='tanh')(input_layer)
        batchNormalization_1 = tf.keras.layers.BatchNormalization(epsilon=0.0001)(conv2D_1)
        conv2D_2 = tf.keras.layers.Conv2D(filters=64,
                                         kernel_size=[40, 40],
                                         strides=(4, 4),
                                         padding='valid',
                                         activation='tanh')(batchNormalization_1)
        batchNormalization_2 = tf.keras.layers.BatchNormalization(epsilon=0.0001)(conv2D_2)
        conv2D_3 = tf.keras.layers.Conv2D(filters=128,
                                         kernel_size=[10, 10],
                                         strides=(4, 4),
                                         padding='valid',
                                         activation='tanh')(batchNormalization_2)
        batchNormalization_3 = tf.keras.layers.BatchNormalization(epsilon=0.0001)(conv2D_3)
        flatten = tf.keras.layers.Flatten()(batchNormalization_3)

        # Stream that estimates V(s).
        value_pr = tf.keras.layers.Dense(700, activation='tanh')(flatten)
        value = tf.keras.layers.Dense(1, activation=None)(value_pr)

        # Stream that estimates A(s, a).
        advantage_pr = tf.keras.layers.Dense(700, activation='tanh')(flatten)
        advantage = tf.keras.layers.Dense(units=len(self.actions), activation=None)(advantage_pr)

        # Aggregating layer: Q(s,a) = V(s) + (A(s,a) - 1/|A| * sum A(s,a'))
        added = tf.keras.layers.Add()([value, tf.math.subtract(advantage, tf.math.reduce_mean(advantage, axis=1, keepdims=True))])
        # NOTE(review): a further Dense layer is applied on top of the
        # aggregate, so the model output is a learned linear map of it rather
        # than the aggregate itself. Confirm this is intended.
        output_layer = tf.keras.layers.Dense(len(self.actions), activation=None)(added)
        model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
        model.compile(tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse',
                      metrics=['accuracy', 'mean_squared_error'])
        model.summary()
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())
        print('Weights of TargetNet updated')

    def replay(self, batch_size=1):
        """Sample `batch_size` experiences and fit the online network on them.

        Targets follow Double DQN: the online network selects the next action
        and the target network evaluates it. After the targets are computed,
        the sampled leaves get new priorities from the absolute TD errors and
        the importance-sampling weights are passed to `fit` as sample weights.
        """
        tree_idx, batches, ISWeights_mb = self.memory.sample(batch_size)
        states = []
        target_Qs_batch = []
        abs_errors = []

        for batch in batches:
            for state, action, reward, next_state, done in batch:
                # For a terminal transition the target is just the reward.
                target = reward
                print('Reward: ' + str(target))
                if not done:
                    # Online network picks the best next action ...
                    target_by_MainNet = self.model.predict(next_state)[0]
                    Next_action_by_MainNet = np.argmax(target_by_MainNet)
                    # ... and the target network supplies its value.
                    target_by_TargetNet = self.target_model.predict(next_state)[0]
                    target = reward + self.gamma * target_by_TargetNet[Next_action_by_MainNet]

                # Use this agent's own model/actions, not a global `agent`.
                target_f = self.model.predict(state)
                action_that_was_made = self.actions.index(action)
                predicted_value = target_f[0][action_that_was_made]

                target_f[0][action_that_was_made] = target
                actual_value = target_f[0][action_that_was_made]

                loss = actual_value - predicted_value
                print('Loss is: ' + str(loss))
                abs_errors.append(abs(loss))

                # x and y for model.fit
                states.append(state)
                target_Qs_batch.append(target_f[0])

        # Stack into arrays of the shapes model.fit expects.
        states = np.reshape(np.array(states), (len(states), 500, 500, 1))
        target_Qs_batch = np.array(target_Qs_batch)

        # Each sampled batch holds exactly one experience, so the TD errors
        # line up one-to-one with `tree_idx`.
        self.memory.batch_update(tree_idx, np.array(abs_errors, dtype=np.float64))
        print('Learning!')

        # Logs for TensorBoard.
        log_dir = self.tensorflowboard + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

        self.model.fit(states, target_Qs_batch, batch_size=batch_size, epochs=1, verbose=0,
                       sample_weight=ISWeights_mb.ravel(),
                       callbacks=[tensorboard_callback])
        print('End of learning')

    def save_network(self, path='/Users/s.matrosov/Downloads/Doom_Model'):
        """Save the online model to `path`."""
        self.model.save(path)
        print("Successfully saved network.")

    def load_network(self, path='/Users/s.matrosov/Downloads/Doom_Model/'):
        """Replace the online model with the one stored at `path`.

        The target model is left untouched.
        """
        self.model = tf.keras.models.load_model(path)
        print("Succesfully loaded network.")

    def increase_explotation(self, decay_step=1):
        """Return the exploration rate decayed towards `explore_stop`.

        Computes ``stop + (current - stop) * exp(-decay_rate * decay_step)``;
        the caller is responsible for assigning the result to `explore_start`.
        """
        return self.explore_stop + (self.explore_start - self.explore_stop) * np.exp(-self.decay_rate * decay_step)

    # Memory, based on SumTree
    class Memory(object):  # stored as ( s, a, r, s_ ) in SumTree
        """Prioritized replay memory backed by a SumTree.

        This SumTree code is a modified version; the original code is from:
        https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
        """
        PER_e = 0.01  # Added to every error so no experience has 0 probability of being taken
        PER_a = 0.6  # Trade-off between taking only high-priority experiences and sampling uniformly
        PER_b = 0.4  # Importance-sampling exponent, increased towards 1 on every sample() call

        PER_b_increment_per_sampling = 0.001

        absolute_error_upper = 1.  # clipped abs error

        def __init__(self, capacity):
            """Create the tree.

            The tree holds the priority scores at its leaves plus a data array.
            A plain array that is overwritten when full is used instead of a
            deque so that experiences keep their index between timesteps.
            """
            self.tree = self.SumTree(capacity)

        def store(self, experience):
            """Store a new experience with the current maximum leaf priority.

            The priority is refined later through `batch_update`.
            """
            max_priority = np.max(self.tree.tree[-self.tree.capacity:])

            # A priority of 0 would mean the experience can never be selected,
            # so fall back to the clipping bound.
            if max_priority == 0:
                max_priority = self.absolute_error_upper

            self.tree.add(max_priority, experience)

        def sample(self, n):
            """Sample `n` experiences proportionally to their priority.

            The range [0, total_priority] is split into `n` equal segments, one
            value is drawn uniformly from each segment, and the matching leaf
            is looked up in the tree.

            Returns:
                (batch_idx, memory_batch, batch_ISWeights): an int32 array of
                tree indices with shape (n,), a list of n one-element lists
                holding the experiences, and a float32 array of
                importance-sampling weights with shape (n, 1).

            Raises:
                ValueError: if nothing has been stored yet.
            """
            total = self.tree.total_priority
            if total <= 0:
                raise ValueError("cannot sample from an empty memory")

            memory_batch = []
            batch_idx = np.empty((n,), dtype=np.int32)
            batch_ISWeights = np.empty((n, 1), dtype=np.float32)

            priority_segment = total / n

            # Increase PER_b each time a new minibatch is sampled (max = 1).
            self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])

            # Largest possible weight, from the smallest STORED priority.
            # Unfilled leaves have priority 0 and must be ignored, otherwise
            # max_weight becomes inf and every weight collapses to 0.
            leaf_priorities = self.tree.tree[-self.tree.capacity:]
            p_min = np.min(leaf_priorities[leaf_priorities > 0]) / total
            max_weight = (p_min * n) ** (-self.PER_b)

            for i in range(n):
                # Draw one value uniformly from the i-th segment.
                low, high = priority_segment * i, priority_segment * (i + 1)
                value = np.random.uniform(low, high)

                # Retrieve the experience that corresponds to that value.
                index, priority, data = self.tree.get_leaf(value)

                # P(j)
                sampling_probabilities = priority / total

                # IS = (1/N * 1/P(i))**b / max wi == (N*P(i))**-b / max wi
                batch_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b) / max_weight

                batch_idx[i] = index
                memory_batch.append([data])

            return batch_idx, memory_batch, batch_ISWeights

        def batch_update(self, tree_idx, abs_errors):
            """Set new priorities for the leaves in `tree_idx`.

            Each priority is ``min(abs_error + PER_e, absolute_error_upper) ** PER_a``.
            `abs_errors` is not modified.
            """
            # Work on a copy: the in-place `+=` used to alter the caller's array.
            shifted_errors = np.asarray(abs_errors, dtype=np.float64) + self.PER_e
            clipped_errors = np.minimum(shifted_errors, self.absolute_error_upper)
            ps = np.power(clipped_errors, self.PER_a)

            for ti, p in zip(tree_idx, ps):
                self.tree.update(ti, p)

        class SumTree(object):
            """Binary tree stored in a flat array; each parent holds the sum of its children.

            Layout for capacity 4 (numbers are array indexes, not priorities):

                    0            <- root, holds the total priority
                  1   2
                 3 4 5 6         <- leaves, hold one priority per experience
            """

            def __init__(self, capacity):
                # Number of leaf nodes, i.e. number of experiences kept.
                self.capacity = capacity
                # Index in `data` that the next `add` will write to.
                self.data_pointer = 0
                # capacity leaves + (capacity - 1) parent nodes, all starting at 0.
                self.tree = np.zeros(2 * capacity - 1)
                # The experiences themselves (one slot per leaf).
                self.data = np.zeros(capacity, dtype=object)

            def add(self, priority, data):
                """Write `data` into the next slot and set its leaf priority.

                Leaves are filled from left to right; once `capacity` entries
                have been written the pointer wraps and old entries are
                overwritten.
                """
                tree_index = self.data_pointer + self.capacity - 1

                self.data[self.data_pointer] = data
                self.update(tree_index, priority)

                self.data_pointer += 1
                if self.data_pointer >= self.capacity:
                    self.data_pointer = 0

            def update(self, tree_index, priority):
                """Set a leaf's priority and propagate the change up to the root."""
                # Change = new priority score - former priority score
                change = priority - self.tree[tree_index]
                self.tree[tree_index] = priority

                # Walk up: the parent of node k is (k - 1) // 2,
                # e.g. leaf 6 -> node 2 -> root 0.
                while tree_index != 0:
                    tree_index = (tree_index - 1) // 2
                    self.tree[tree_index] += change

            def get_leaf(self, v):
                """Find the leaf whose cumulative priority range contains `v`.

                Returns:
                    (leaf_index, priority of that leaf, experience stored for it)
                """
                parent_index = 0

                while True:
                    left_child_index = 2 * parent_index + 1
                    right_child_index = left_child_index + 1

                    # No children inside the array: we are at a leaf.
                    if left_child_index >= len(self.tree):
                        leaf_index = parent_index
                        break

                    # Descend left when v falls inside the left subtree's sum,
                    # otherwise skip that sum and descend right.
                    if v <= self.tree[left_child_index]:
                        parent_index = left_child_index
                    else:
                        v -= self.tree[left_child_index]
                        parent_index = right_child_index

                data_index = leaf_index - self.capacity + 1

                return leaf_index, self.tree[leaf_index], self.data[data_index]

            @property
            def total_priority(self):
                """Sum of all leaf priorities (the root node)."""
                return self.tree[0]

In [None]:
# Location passed to agent.save_network / agent.load_network in the cells below
# (placeholder: replace with a real path before running).
path = 'YOUR PATH'

In [None]:
agent = Agent()
# Available actions: one one-hot vector per exposed button.
agent.actions = agent.get_actions(actions)

# Build new models now that the action set is known. The networks created
# inside Agent() were sized from the default action vector, and the result
# of build_model() must be kept for the rebuild to have any effect.
agent.model = agent.build_model()
agent.target_model = agent.build_model()
agent.save_network(path)

#load existing model
#agent.load_network(path)

In [None]:
#agent.save_network(path)

In [None]:
agent.explore_start = 0.2

episodes = 1500
max_steps = 100

game.init()
total_episode_rewards = []
timestamps = []
timestamp = 1

for i in range(episodes):
    print('Episode ' + str(i))
    print('Exploration rate: {}'.format(agent.explore_start))
    game.new_episode()

    # Restart the engine if the fresh episode already reports as finished.
    finished = game.is_episode_finished()
    if finished:
        print('Finished? - ' + str(finished))
        game.close()
        game.init()
        game.new_episode()

    episode_rewards = []
    step = 0
    while step < max_steps:
        # Current observation -> network input frame.
        raw_state = game.get_state().screen_buffer
        if game.is_new_episode():
            # Keep the first raw buffer of the episode around for inspection.
            test_state = raw_state
        agent.frame = agent.make_frame(raw_state)
        state = agent.frame

        # Counter that is later used as the replay batch size.
        agent.impressions = agent.get_impressions()

        action = agent.act(state)

        reward = game.make_action(action)
        episode_rewards.append(reward)

        done = game.is_episode_finished()

        # Once the episode has ended there is no state to read (this also
        # holds for the very first step), so use a black screen as the
        # terminal observation.
        if done:
            raw_next_state = np.zeros((3, 1024, 1280), dtype='uint8')
        else:
            raw_next_state = game.get_state().screen_buffer
        agent.frame = agent.make_frame(raw_next_state)
        next_state = agent.frame

        agent.experience(state, action, reward, next_state, done)
        agent.explore_start = agent.increase_explotation()

        step = step + 1
        time.sleep(0.02)
        if done:
            break

    # Training
    agent.replay(agent.impressions)

    # Update the target network's weights every 15 episodes
    # (i + 1 so that it does not fire on the very first episode).
    if (i+1) % 15 == 0:
        agent.update_target_model()

    # Save the model every 5 episodes.
    if (i+1) % 5 == 0:
        agent.save_network(path)

    # Data for the reward plot.
    total_episode_rewards.append(game.get_total_reward())
    timestamps.append(timestamp)
    timestamp = timestamp + 1

    print ("Result:", game.get_total_reward())
    time.sleep(4)
game.close()
agent.save_network(path)
agent.update_target_model()

In [None]:
# Manual checkpoint: write the current online model to `path`.
agent.save_network(path)
#agent.update_target_model()

In [None]:
# Show the exploration rate reached after training.
print(agent.explore_start)

In [None]:
# Manual checkpoint: write the current online model to `path`.
agent.save_network(path)
#agent.update_target_model()

In [None]:
# Total reward collected in each training episode.
plt.plot(timestamps, total_episode_rewards)
plt.xlabel('Episode')
plt.ylabel('Total episode reward')
plt.title('Training reward per episode')
plt.show()

In [None]:
#agent will use model to act
# Let the agent act (almost) purely from the model.
agent.explore_start = 0.01

# Number of games and maximum steps per episode.
episodes = 1500
max_steps = 100

game.init()

for i in range(episodes):
    print('Episode ' + str(i))
    game.new_episode()

    # Restart the engine if the fresh episode already reports as finished.
    finished = game.is_episode_finished()
    if finished:
        print('Finished? - ' + str(finished))
        game.close()
        game.init()
        game.new_episode()

    episode_rewards = []
    step = 0
    while step < max_steps:
        raw_state = game.get_state().screen_buffer
        if game.is_new_episode():
            # Keep the first raw buffer of the episode around for inspection.
            test_state = raw_state
        agent.frame = agent.make_frame(raw_state)
        state = agent.frame

        action = agent.act(state)

        reward = game.make_action(action)
        episode_rewards.append(reward)

        done = game.is_episode_finished()

        step = step + 1
        time.sleep(0.02)
        # Stop as soon as the episode ends, including on its very first
        # step: there is no state left to read after that.
        if done:
            break

    total_reward = np.sum(episode_rewards)
    print ("Result:", game.get_total_reward())
    time.sleep(2)
game.close()

In [None]:
#tensorboard --logdir=/Users/s.matrosov/Downloads/Doom_Model/3/policy_gradients/
%tensorboard --logdir '/Users/s.matrosov/Downloads/Doom_Model/3/policy_gradients/'