# Deep Q Network 

- deep reinforcement learning algorithm 

### Q function / state-action value function
- this function evaluates how good an action a is in state s. 
- we do this by creating a Q table 
- which stores all actions possible in each state - and each have a q value. 
- we then tend to pick the action for state s which has the highest q value. This was used in the Q learning in the temporal difference learning chapter

### Why use DQNs? 

- We use DQNs when the environment has so many states that it would be time consuming to calculate everything. 
- What is we have an environment with LOTS of states? hundreds of states, thousands of states! To calculate the q value for each action per state, Big-O would be exponential & time consuming... since we would have to go through and perform each of the actions in each state. 
- It would be better to use a neural network with weight theta to approximate the q value for each state. Thus we call this q-value approximating neural network a "Q network."

# Experience replay
- RL environments have transitions from state s to next state s' ... determined by perfoming actions and recieving rewards. 
- The transitions are called the agents experience. 

### What is Experience replay?
- We train Deep Q network with transitions sampled from the replay buffer. 
- Using a replay buffer stops overfitting. 
- The replay buffer is more of a que than a list - only a fixed number of recent experiences are stored... and when new information comes in, the old information is deleted. 

# Target Network. 

- loss function calculates squared difference between target and predicted value
- A separate neural network called a target network for calculating target. 

- Q network predicts Q values.
- Target network calculates target. 

- The target network is frozen for several time steps - and then the T-network weights are updated by copying the weights of actual Q network.

In [1]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected 
from collections import deque, Counter
import random
from datetime import datetime

In [2]:
env = gym.make("MsPacman-v0")
n_outputs = env.action_space.n

[2018-10-28 20:03:18,267] Making new env: MsPacman-v0


In [3]:
# function to preprocess input game screen - to reduce image size and convert to greyscale
color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):
    img = obs[1:176:2, ::2]
    img = img.mean(axis=2)
    img[img==color] = 0
    img = (img-128) / (128-1)
    return img.reshape(88,80,1)

In [4]:
# define Q network 
# Q network inputs: game state x
# The Q network architecture:
# - is built with convolutional layers with padding, followed by a fully connected layer. 

tf.reset_default_graph()

def q_network(X, name_scope):
    # intialise layers
    initializer = tf.contrib.layers.variance_scaling_initializer()
    
    with tf.variable_scope(name_scope) as scope:
        # initialize convolutional layers
        layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding="SAME", weights_initializer=initializer)
        tf.summary.histogram("layer1", layer_1)
        layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding="SAME", weights_initializer=initializer)
        tf.summary.histogram("layer2", layer_2)
        layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding="SAME", weights_initializer=initializer)
        tf.summary.histogram("layer3", layer_3)
        # flatten the result of layer_3 before feeding to the fully connected layer. 
        flat = flatten(layer_3)
        
        fc = fully_connected(flat, num_outputs=128, weights_initializer=initializer)
        tf.summary.histogram("fc", fc)
        output = fully_connected(fc, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram("output", output)
        
        # vars will store the parameters of the network such as weights
        vars2 = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
        return(vars2, output)
        
        

In [5]:
# Epsilon greedy function

epsilon = 0.5
eps_min = 0.05
eps_max = 1.0 
eps_decay_steps = 500000
def epsilon_greedy(action, step):
    p = np.random.random(1).squeeze()
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return(np.random.randint(n_outputs))
    else:
        return(action)

In [6]:
# Store agents experiences (state, action, reward) in experience replay buffer
# and sample minibatches of experiences for training network

def sample_memories(batch_size):
    perm_batch = np.random.permutation(len(exp_buffer))[:batch_size]
    mem = np.array(exp_buffer)[perm_batch]
    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]

In [7]:
# define hyper parameters
num_episodes = 800
batch_size = 48
input_shape = (None, 88, 80, 1)
learning_rate = 0.001 
X_shape = (None, 88, 80, 1)
discount_factor = 0.97 

global_step = 0
copy_steps = 100
steps_train = 4
start_steps = 2000
logdir = 'logs'

In [8]:
X = tf.placeholder(tf.float32, shape=X_shape)
in_training_mode = tf.placeholder(tf.bool)

In [9]:
mainQ, mainQ_outputs = q_network(X, "mainQ")

In [10]:
targetQ, targetQ_outputs = q_network(X, "targetQ")

In [11]:
X_action = tf.placeholder(tf.int32, shape=(None,))

In [12]:
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)

In [13]:
copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]


In [14]:
copy_target_to_main = tf.group(*copy_op)

In [15]:
y = tf.placeholder(tf.float32, shape=(None, 1))

In [16]:
loss = tf.reduce_mean(tf.square(y - Q_action))

In [17]:
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

In [18]:
loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [23]:
exp_buffer = []

In [25]:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    # for each episode
    for i in range(num_episodes):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []
        
        # while the state is not the terminal state: 
        while not done:
            # env.render()
            # get preprocessed game screen
            obs = preprocess_observation(obs)
            # feed the game screen and get the q values for each action
            actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})
            
            # get the action
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1
            
            # select the action using epsilon greedy policy
            action = epsilon_greedy(action, global_step)
            
            # perform the action and move to the next state + recieve reward
            next_obs, reward, done, _ = env.step(action)
            
            # store the transition as an experience in the replay buffer
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])
            
            # after certain steps, we train our Q network with samples from the experience replay buffer
            if global_step % steps_train ==0 and global_step > start_steps: 
                # sample experience
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)
                
                # 1. states
                o_obs = [x for x in o_obs]
                
                # 2. next_states
                o_next_obs = [x for x in o_next_obs]
                
                # 3. next actions 
                next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})
                
                # reward
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done)
                
                # merge all summaries and write to the file
                
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                
                file_writer.add_summary(mrg_summary, global_step)
                
                # train network and calculate loss
                
                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                
                episodic_loss.append(training_loss)
                
                # after some interval we copy our main Q network weights to target Q network
                if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                    copy_target_to_main.run()
                
                obs = next_obs
                epoch += 1
                global_step += 1
                episodic_reward += reward
            print("Epoch", epoch, "Reward", episodic_reward,)

Epoch 0 Reward 0


ValueError: cannot reshape array of size 1760 into shape (88,80,1)

In [26]:
# https://medium.com/@jonathan_hui/rl-dqn-deep-q-network-e207751f7ae4

In [27]:
# https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26