# Deep Q-Network method

In DQN, the Q-learning algorithm is modified so that the Q-values are estimated by a neural network. The input to the network is the state, and the output is the Q-value of every action available in that state. This code was written using the following tutorial as a baseline for further modifications: https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c and closely follows the pseudocode found here: https://towardsdatascience.com/introduction-to-various-reinforcement-learning-algorithms-i-q-learning-sarsa-dqn-ddpg-72a5e0cb6287.

In [None]:
# Import the environment definitions from the PianoHandv1 notebook. The star
# import brings that notebook's public names into this namespace.
from ipynb.fs.full.PianoHandv1 import *
# Goal the agent is trained on; passed to PianoHandEnv when the environment is created.
train_key = 'C'

In [None]:
import time
import gym
import random
import os
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam, SGD

# Create the environment for the chosen training key.
env = PianoHandEnv(train_key)

# Hyperparameters and bookkeeping containers shared by the functions and the training loop below.
state_space        = env.observation_space.shape[0]   # Size of the first observation dimension; used as the network input width
action_space       = env.action_space.n               # Number of discrete actions; used as the network output width
hidden_layers      = 16                               # Number of units in the single hidden Dense layer (a unit count, not a layer count)
total_episodes     = 10                               # Number of training episodes
max_steps          = 100                              # Maximum number of environment steps per episode
sample_batch_size  = 32                               # Batch size for experience replay
memory             = deque(maxlen=128)                # Memory for experience replay; holds at most 128 transitions, oldest dropped first
learning_rate      = 0.01                             # Learning rate - the amount of change to the model during each update
gamma              = 0.95                             # Discount rate - how important are the immediate rewards vs later rewards
epsilon            = 0.9                              # Probability of exploring vs exploiting
exploration_rate   = 1.0                              # Exploration rate that the training loop decays after every step
exploration_min    = 0.01                             # Threshold at or below which exploration_rate is no longer decayed
exploration_decay  = 0.995                            # Multiplicative factor applied to exploration_rate on each decay
reward_array       = []                               # Reward of every step taken, accumulated across all episodes
time_array         = []                               # Wall-clock duration of each episode, in seconds

# This function builds the neural network model: a fully connected network that takes a state as input and
# outputs one Q-value per action for that state.
def build_model(state_size, action_size):
    """Build and compile the Q-network.

    Args:
        state_size: Number of features in a state vector (network input width).
        action_size: Number of actions (network output width, one Q-value each).

    Returns:
        The compiled Keras model (MSE loss, SGD optimizer). The model summary
        is printed as a side effect.
    """
    # Imported locally: the visible cells only import `Sequential` from
    # keras.models and never bind a `keras` name, so `keras.Model` could
    # raise NameError.
    from keras.models import Model

    # Size the network from the function arguments instead of the module-level
    # globals, so the signature is actually honoured.
    inputs = Input(shape=(state_size,))
    hidden = Dense(hidden_layers, activation='relu')(inputs)
    q_values = Dense(action_size, activation='linear')(hidden)   # Linear output: raw Q-value estimates

    model = Model(inputs=inputs, outputs=q_values)
    # NOTE(review): newer Keras releases spell this argument `learning_rate`;
    # `lr` is kept as in the original - confirm against the installed version.
    model.compile(loss='mse', optimizer=SGD(lr=learning_rate))
    model.summary()

    return model

# This function generates the action chosen using the epsilon greedy policy.
def e_greedy(state, model):
    """Choose an action for `state` with a decaying epsilon-greedy policy.

    Args:
        state: Current state, shaped as a single-row batch so it can be passed
            straight to `model.predict`.
        model: Q-network whose prediction row holds one Q-value per action.

    Returns:
        A random action sampled from the environment's action space when
        exploring, otherwise the index of the highest predicted Q-value.
    """
    # Compare against `exploration_rate`, which the training loop decays every
    # step. The constant `epsilon` never changes, so using it made the
    # "decaying epsilon greedy" schedule a no-op.
    if np.random.uniform(0, 1) < exploration_rate:
        return env.action_space.sample()                 # Exploration
    return np.argmax(model.predict(state)[0])            # Exploitation

def experience_replay(sample_batch_size):
    """Fit the global model on a random minibatch drawn from replay memory.

    For each sampled transition the Q-value of the action that was taken is
    replaced by its target (the reward alone for a terminal transition,
    otherwise reward plus the discounted best next-state Q-value), and the
    model is fitted on that single state for one epoch.

    Args:
        sample_batch_size: Number of transitions to sample from `memory`.
    """
    minibatch = random.sample(memory, sample_batch_size)
    for obs, act, rew, next_obs, terminal in minibatch:
        if terminal:
            q_target = rew
        else:
            best_next_q = np.amax(model.predict(next_obs)[0])
            q_target = rew + gamma * best_next_q

        # Only the taken action's entry is overwritten; the other entries keep
        # the model's own predictions.
        q_row = model.predict(obs)
        q_row[0][act] = q_target
        model.fit(obs, q_row, epochs=1, verbose=0)

        
# Build model
model = build_model(state_space, action_space)

# Main loop to be run over all episodes
for episode in range(total_episodes):
    ep_start = time.time()

    # Reset the environment and reshape the state into a single-row batch,
    # the shape passed to the model's predict/fit calls.
    state = env.reset()
    state = np.reshape(state, [1, state_space])
    done = False
    index = 0
    for step in range(max_steps):
        action = e_greedy(state, model)
        next_state, reward, done, _, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_space])

        # Store the transition in memory
        memory.append((state, action, reward, next_state, done))
        state = next_state
        index += 1
        reward_array.append(reward)

        # Experience replay to sample a random batch and update target
        if len(memory) >= sample_batch_size:
            experience_replay(sample_batch_size)

        # Decaying epsilon greedy
        if exploration_rate > exploration_min:
            exploration_rate *= exploration_decay

        # End the episode as soon as the environment reports it is finished.
        # Previously `done` was never checked, so the loop kept stepping a
        # finished episode until max_steps was reached.
        if done:
            break

    # Keep track of time
    ep_end = time.time()
    time_diff = ep_end - ep_start
    time_array.append(time_diff)

    # reward_array spans all episodes so far, so this is a running average.
    print("Episode {} Average rewards = {}".format(episode, np.mean(reward_array)), end = '\r')

# Save last best model
# weight_path = "./" + train_key + ".h5"
# model.save_weights(weight_path)

## Suboptimal training times
Training the agent with this algorithm took many hours and still produced sub-optimal results, even after reducing the state and action spaces and tuning the parameters. The Actor-Critic method produced better results under similar conditions, and hence Actor-Critic was chosen as the algorithm to move forward with.
The following code was used to plot the time taken per episode when training for a few episodes.

In [None]:
# Plot the wall-clock time of every training episode.
# NOTE(review): `plt` is not imported in the visible cells - presumably it is
# provided by the PianoHandv1 star import; confirm.
plt.figure()
# Plot the per-episode list `time_array`; `time_diff` is a scalar holding only
# the last episode's duration.
plt.plot(time_array)
plt.ylabel("Time taken per episode")
plt.xlabel("Episode")
plt.title("Time taken per episode vs episodes")