## Import Dependencies

In [2]:
import gym

import numpy as np
import tensorflow as tf
import tensorflow.keras as ks

## Create the Environment

You control a vehicle that can move sideways. A big mother ship circles overhead and continually deploys smaller drones. You must destroy these enemies and dodge their attacks.

In [3]:
# Build the raw Atari Assault environment; the preprocessing wrappers are applied in the next cell.
env = gym.make("ALE/Assault-v5")

In [4]:
# Atari preprocessing followed by frame stacking, applied as a single nested expression.
# Inner wrapper: unscaled grayscale frames without an extra channel axis.
# Outer wrapper: stacks 4 frames into every observation.
# NOTE: re-running this cell wraps `env` again; re-create the environment first.
env = gym.wrappers.FrameStack(
    gym.wrappers.AtariPreprocessing(
        env,
        noop_max=30,
        frame_skip=1,
        terminal_on_life_loss=False,
        grayscale_obs=True,
        grayscale_newaxis=False,
        scale_obs=False,
    ),
    4,
)

## Define Buffer class for Experience Replay

In this class we only have to define the buffer dimension, which limits the number of samples the buffer can hold. This bounds the amount of memory required by the program and avoids memory problems.

In [5]:
class Buffer:
    """Fixed-capacity experience-replay buffer.

    Transitions are stored as five parallel lists (state, action, reward,
    done, next_state). Once ``buffer_dim`` transitions are stored, each new
    transition overwrites the oldest one in place (circular buffer), so
    saving is O(1) and memory use stays bounded.
    """

    def __init__(self, buffer_dim, batch_size=32):
        """Create an empty buffer.

        Args:
            buffer_dim: maximum number of transitions kept in the buffer.
            batch_size: number of transitions returned by ``get_sample``.

        Raises:
            ValueError: if ``buffer_dim`` is smaller than 1.
        """
        if buffer_dim < 1:
            raise ValueError("buffer_dim must be at least 1")

        self.state = []
        self.action = []
        self.reward = []
        self.done = []
        self.next_state = []

        self.buffer_dim = buffer_dim
        self.batch_size = batch_size
        # Total number of transitions ever saved; keeps growing after the
        # buffer is full and is used to locate the oldest slot.
        self.idx = 0

    def save_sample(self, state, action, reward, done, next_state):
        """Store one transition, replacing the oldest one when the buffer is full."""
        columns = (self.state, self.action, self.reward, self.done, self.next_state)
        values = (state, action, reward, done, next_state)

        if len(self.state) < self.buffer_dim:
            # Still filling up: grow every column by one entry.
            for column, value in zip(columns, values):
                column.append(value)
        else:
            # Full: the slot at idx % buffer_dim always holds the oldest entry.
            slot = self.idx % self.buffer_dim
            for column, value in zip(columns, values):
                column[slot] = value

        self.idx += 1

    def get_sample(self):
        """Return a batch of ``batch_size`` transitions drawn uniformly at random.

        Indices are drawn with replacement from the stored transitions, so the
        batch is not a run of consecutive (strongly correlated) frames.

        Returns:
            Tuple ``(states, actions, rewards, dones, next_states)`` where
            ``states`` and ``next_states`` are numpy arrays, ``actions`` and
            ``rewards`` are lists, and ``dones`` is a float tensor
            (1.0 where the transition ended an episode).

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        stored = len(self.state)
        if stored < self.batch_size:
            raise ValueError(
                f"Not enough samples to draw a batch: {stored} stored, "
                f"{self.batch_size} required"
            )

        picks = np.random.randint(0, stored, size=self.batch_size)

        return (np.array([self.state[i] for i in picks]),
                [self.action[i] for i in picks],
                [self.reward[i] for i in picks],
                tf.convert_to_tensor([float(self.done[i]) for i in picks]),
                np.array([self.next_state[i] for i in picks]))

## Define Deep Q-Network

This network **learns an approximation of the Q-table**, which maps each state–action pair to the expected return of taking that action in that state. For every state the network outputs one Q-value per available action (`env.action_space.n` of them). The environment provides the state, and the action is chosen by selecting the largest of the Q-values predicted in the output layer.

In [20]:
# Observation shape of the wrapped environment: (stacked frames, height, width), i.e. channels-first
env.observation_space.shape

(4, 84, 84)

In [21]:
# One Q-value output per discrete action of the environment
num_actions = env.action_space.n

# Derive the network input shape from the environment instead of hard-coding it.
# The wrapped env reports channels-first (frames, height, width); the network is fed
# channels-last (height, width, frames), matching the np.transpose(..., [1, 2, 0])
# applied to every observation in the training and testing loops.
num_frames, frame_height, frame_width = env.observation_space.shape
input_shape = (frame_height, frame_width, num_frames)

In [22]:
def get_Q_model(input_shape, num_actions):
    """Build the convolutional Q-network.

    Three ReLU convolutions ('same' padding) are followed by a 512-unit ReLU
    dense layer and a linear output layer with one unit per action.

    Args:
        input_shape: shape of a single observation fed to the network.
        num_actions: number of output Q-values.

    Returns:
        An uncompiled ``ks.Model`` mapping observations to Q-values.
    """
    # (filters, kernel size, stride) for each convolutional layer, in order
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    frames = ks.layers.Input(input_shape)

    features = frames
    for filters, kernel, stride in conv_specs:
        conv = ks.layers.Conv2D(filters, kernel, strides=stride,
                                activation='relu', padding='same')
        features = conv(features)

    flat = ks.layers.Flatten()(features)
    hidden = ks.layers.Dense(512, activation='relu')(flat)
    q_values = ks.layers.Dense(num_actions, activation='linear')(hidden)

    return ks.Model(frames, q_values)

# Online network: trained at every step and used to choose actions
model = get_Q_model(input_shape, num_actions)

# Target network: provides the Q-value targets and is only refreshed periodically.
# Start it from the online weights so both networks agree until the first periodic
# sync (otherwise the first targets come from an unrelated random network).
model_target = get_Q_model(input_shape, num_actions)
model_target.set_weights(model.get_weights())

# Optimizer and Loss function
optimizer = ks.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = ks.losses.Huber()

## Training

The DQN algorithm can be described as follows:

1. **Initialize replay buffer**,

2. Pre-process the environment observation and **feed state S to DQN**, which will return the Q-values of all possible actions in that state.

3. **Select an action** using the epsilon-greedy policy: with probability epsilon we select a random action A, and with probability 1-epsilon we select the action that has the maximum Q-value, i.e. $A = \arg\max_{A} Q(S, A; \theta)$.

4. After selecting the action A, the agent **performs the chosen action** in state S, moves to a new state S’ and receives a reward R.

5. **Store transition** in replay buffer as <S,A,R,S’>.

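6. Next, **sample some random batches of transitions** from the replay buffer and calculate the loss using the formula $L(\theta) = \left(R + \gamma \max_{A'} Q(S', A'; \theta^{-}) - Q(S, A; \theta)\right)^2$, where $\theta$ are the weights of the actual network and $\theta^{-}$ the weights of the target network (the code below applies the Huber loss to the same difference instead of the plain square).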

7. **Perform gradient descent** with respect to actual network parameters in order to minimize this loss.

8. After every k steps, **copy our actual network weights to the target network weights**.

9. Repeat these steps for M number of episodes.

In [23]:
# PARAMETERS

# Number of episodes to run
episodes = 1_000

# Discount factor applied to the estimated future Q-value
gamma = 0.99

# Epsilon-greedy schedule: epsilon goes linearly from epsilon_start to
# epsilon_end over decade_period environment steps
epsilon_start = 1
epsilon_end = 0.1
decade_period = 1_000_000

# Number of steps between two copies of the online weights into the target network
update_period = 10_000

# Replay buffer capacity and size of each sampled batch
buffer_dim = 1_000_000
batch_size = 32

In [24]:
# DQN training loop: act epsilon-greedily, store every transition in the replay
# buffer, train the online network on a sampled batch at each step and refresh
# the target network every `update_period` steps.
epsilon = epsilon_start

buffer = Buffer(buffer_dim, batch_size)

episode_reward_history = []
average_reward = 0
step = 0
for episode in range(1, episodes+1):

    # The wrapped env returns channels-first frames; the network expects channels-last
    state = np.array(env.reset())
    state = np.transpose(state, [1, 2, 0])
    done = False
    episode_reward = 0

    while not done:

        #env.render()

        # Choose action (epsilon-greedy)
        if np.random.uniform() < epsilon:
            action = np.random.choice(num_actions)
        else:
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)

            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Epsilon decay: linear from epsilon_start to epsilon_end, then constant.
        # The builtin max() clamps the rate at 0; np.max(x, 0) would interpret the
        # 0 as an axis and let epsilon keep decreasing below epsilon_end.
        rate = max((decade_period - step) / decade_period, 0)
        epsilon = (epsilon_start - epsilon_end) * rate + epsilon_end

        # Apply the sampled action in our environment
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state)
        next_state = np.transpose(next_state, [1, 2, 0])

        episode_reward += reward

        # Save actions and states in replay buffer
        buffer.save_sample(state, action, reward, done, next_state)

        state = next_state
        step += 1

        # After batch_size steps we can start sampling
        if step > batch_size:
            state_sample, action_sample, reward_sample, done_sample, next_state_sample = buffer.get_sample()

            # Q-values estimates (TARGET MODEL) for the sampled future states.
            # Calling the model directly avoids the per-call overhead of predict()
            # for a single small batch.
            future_rewards = model_target(next_state_sample, training=False)

            # TD target: reward + discounted best Q-value of the next state
            updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1)

            # If final frame set the last value to -1
            # (the whole target, reward included, is replaced by -1 for terminal transitions)
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model(state_sample)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)

                # Calculate loss between the TD target and the predicted Q-value
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Update the target model
        if step % update_period == 0:

            # Set the new weights
            model_target.set_weights(model.get_weights())

            print(f"Average reward: {average_reward:.2f} at episode {episode}")

    # Running average over (at most) the last 100 episode rewards
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    average_reward = np.mean(episode_reward_history)
    
#env.close()
















KeyboardInterrupt



## Testing

In [25]:
# Greedy evaluation of the trained online network with rendering.
# try/finally guarantees the environment is closed even if the cell is
# interrupted (e.g. with KeyboardInterrupt) before all episodes finish.
try:
    for episode in range(1, episodes+1):

        state = np.array(env.reset())
        done = False
        episode_reward = 0

        while not done:

            env.render()

            # Channels-first observation -> channels-last network input
            state = np.transpose(state, [1, 2, 0])
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)

            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

            # Apply the chosen action in our environment
            state, reward, done, _ = env.step(action)
            state = np.array(state)

            # Accumulate the score of the current episode
            episode_reward += reward

        print(f"Test episode {episode}: reward {episode_reward}")
finally:
    env.close()

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x0000028FE2E9BE50>
Traceback (most recent call last):
  File "C:\Users\User\anaconda3\lib\weakref.py", line 371, in remove
    self = selfref()
KeyboardInterrupt: 


KeyboardInterrupt: 