# Simple DQN on Keras
In this example we build a DQN (without screen states, i.e. it learns from the environment's state vector rather than from rendered frames) for the CartPole problem.

### References
* https://medium.com/mlreview/speeding-up-dqn-on-pytorch-solving-pong-in-30-minutes-81a1bd2dff55
* https://stackoverflow.com/questions/51960225/dqn-average-reward-decrease-after-training-for-a-period-of-time
* https://medium.freecodecamp.org/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682

In [1]:
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
from IPython import display
from IPython.display import clear_output

# Create the Cart-Pole game environment. This single `env` instance is reused
# by the replay-memory pre-fill, training and evaluation cells.
env = gym.make('CartPole-v0') 

### Define Hyperparameters

In [2]:
train_episodes = 600              # max number of episodes to learn from
max_steps = 200                   # max steps in an episode
gamma = 0.99                      # discount factor applied to future rewards

# Exploration parameters (epsilon-greedy). The training loop computes
#   explore_p = explore_stop + (explore_start - explore_stop) * exp(-decay_rate * step)
explore_start = 1.0               # exploration probability at start (purely exploratory)
explore_stop = 0.01               # minimum exploration probability
decay_rate = 0.0001               # exponential decay rate for exploration prob

# Network parameters
hidden_size = 16                  # number of units in each Q-network hidden layer
learning_rate = 0.001             # Q-network learning rate

# Memory parameters
memory_size = 10000               # replay memory capacity (number of experiences)
batch_size = 32                   # experience mini-batch size
pretrain_length = batch_size*10   # number of random-action experiences stored before training

consecutive_win_threshold = 8     # consecutive-win count used by the early-stop check in training

### Define Q-Network

In [3]:
class QNetwork:
    """Fully-connected Q-network mapping a state vector to one Q-value per action.

    The model is two hidden ReLU layers followed by a linear output layer,
    compiled with a mean-squared-error loss and the Adam optimizer.

    Args:
        env: environment whose ``action_space.n`` sets the number of outputs.
        learning_rate: learning rate handed to the Adam optimizer.
        state_size: length of the state vector (network input width).
        hidden_size: number of units in each hidden layer.
    """

    def __init__(self, env, learning_rate=0.01, state_size=4,
                 hidden_size=10):
        action_size = env.action_space.n
        # state inputs to the Q-network
        self.model = Sequential()

        self.model.add(Dense(hidden_size, activation='relu',
                             input_dim=state_size))
        self.model.add(Dense(hidden_size, activation='relu'))
        self.model.add(Dense(action_size, activation='linear'))

        # `learning_rate` is the supported keyword; `lr` is only a deprecated
        # alias that newer Keras releases no longer accept.
        self.optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(loss='mse', optimizer=self.optimizer)

    def __call__(self, state):
        """Return the greedy action (index of the largest Q-value) for `state`.

        Args:
            state: a single state, either flat ``(state_size,)`` or already
                batched as ``(1, state_size)``.

        Returns:
            Index of the action with the highest predicted Q-value.
        """
        # Add the batch axis only when it is missing, so both the raw
        # observation and the [1, state_size] form used elsewhere work.
        state = np.atleast_2d(state)
        # verbose=0 keeps predict() from printing a progress bar on every step
        Q_values = self.model.predict(state, verbose=0)[0]
        # Act greedily
        return np.argmax(Q_values)


# Initialize the DQN network with the hidden-layer width and learning rate
# from the hyperparameter cell (state_size keeps its default of 4).
mainQN = QNetwork(env, hidden_size=hidden_size, learning_rate=learning_rate)



### Memory Buffer

In [4]:
class Memory:
    """Bounded replay buffer that stores experiences in insertion order.

    Args:
        max_size: maximum number of experiences kept; once full, adding a new
            one discards the oldest.
    """

    def __init__(self, max_size=1000):
        # deque(maxlen=...) drops items from the opposite end when it overflows
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Store one experience at the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences picked at random.

        Raises:
            ValueError: (from ``np.random.choice``) if `batch_size` exceeds the
                number of stored experiences.
        """
        positions = np.arange(len(self.buffer))
        picked = np.random.choice(positions, size=batch_size, replace=False)
        batch = []
        for pos in picked:
            batch.append(self.buffer[pos])
        return batch

# Replay memory instance; holds at most `memory_size` experiences
memory = Memory(max_size=memory_size)

### Populate Replay Memory

In [5]:
# Begin a fresh episode (the observation returned by reset() is not used here)
env.reset()
# A single random step gets the cart and pole moving and yields the first state
state, reward, done, _ = env.step(env.action_space.sample())
state = np.reshape(state, [1, 4])

# Pre-fill the replay memory with transitions generated by random actions
for fill_idx in range(pretrain_length):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Pick a random action and advance the simulation by one step
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, 4])

    if done:
        # The simulation failed: an all-zero next state marks "no next state"
        next_state = np.zeros(state.shape)

    # Store the transition (terminal or not) in the replay memory
    memory.add((state, action, reward, next_state))

    if not done:
        # Episode still running: carry on from the state just reached
        state = next_state
        continue

    # Episode over: restart and take one random step to get things moving
    env.reset()
    state, reward, done, _ = env.step(env.action_space.sample())
    state = np.reshape(state, [1, 4])

### Training

In [6]:
# Train the Q-network with epsilon-greedy exploration and experience replay.
# Reads `state` (shape [1, 4]) left behind by the replay-memory pre-fill cell.
step = 0                 # global step counter driving the epsilon decay
num_consecutive_win = 0  # number of winning episodes in a row

# For each episode (upper bound is inclusive so exactly `train_episodes` run)
for ep in range(1, train_episodes + 1):
    total_reward = 0
    t = 0
    # For each iteration per episode.
    while t < max_steps:
        step += 1

        # Epsilon Greedy Decay
        explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step)

        # Epsilon Greedy
        if explore_p > np.random.rand():
            # Make a random action
            action = env.action_space.sample()
        else:
            # Get action from Q-network
            Qs = mainQN.model.predict(state, verbose=0)[0]
            action = np.argmax(Qs)

        # Take action, get new state and reward
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        total_reward += reward

        # Episode finish
        if done:
            # The episode ends so there is no next state; an all-zero state is
            # the sentinel the replay step uses to detect terminal transitions
            next_state = np.zeros(state.shape)
            # Force the while-loop to end after this iteration
            t = max_steps

            print('Episode: {}'.format(ep),
                  'Total reward: {}'.format(total_reward),
                  'Explore P: {:.4f}'.format(explore_p))

            # Early Stop
            if total_reward >= 199:
                num_consecutive_win += 1
                print('Consecutive Win...')
            else:
                num_consecutive_win = 0

            # Stop as soon as the configured number of consecutive wins is hit
            if num_consecutive_win >= consecutive_win_threshold:
                break

            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
            state = np.reshape(state, [1, 4])
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
            t += 1

        # Replay: sample experience from the replay memory and stack it so the
        # network is evaluated in two batched calls instead of once per sample
        minibatch = memory.sample(batch_size)
        states_b, actions_b, rewards_b, next_states_b = zip(*minibatch)
        inputs = np.vstack(states_b)
        next_inputs = np.vstack(next_states_b)
        actions_b = np.asarray(actions_b)
        rewards_b = np.asarray(rewards_b, dtype=float)

        # Terminal transitions were stored with an all-zero next state
        terminal_b = (next_inputs == 0).all(axis=1)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss
        targets = mainQN.model.predict(inputs, verbose=0)
        next_Qs = mainQN.model.predict(next_inputs, verbose=0)

        # Bellman target: r for terminal transitions, else r + gamma * max_a' Q(s', a')
        td_targets = np.where(terminal_b,
                              rewards_b,
                              rewards_b + gamma * next_Qs.max(axis=1))
        targets[np.arange(len(minibatch)), actions_b] = td_targets

        # Update the model (one gradient pass per environment step)
        mainQN.model.fit(inputs, targets, epochs=1, verbose=0)

    # Stop Earlier ...
    if num_consecutive_win >= consecutive_win_threshold:
        print('Number of consecutive wins high, stop earlier...')
        break

Episode: 1 Total reward: 9.0 Explore P: 0.9991
Episode: 2 Total reward: 13.0 Explore P: 0.9978
Episode: 3 Total reward: 69.0 Explore P: 0.9910
Episode: 4 Total reward: 17.0 Explore P: 0.9894
Episode: 5 Total reward: 41.0 Explore P: 0.9854
Episode: 6 Total reward: 24.0 Explore P: 0.9830
Episode: 7 Total reward: 15.0 Explore P: 0.9816
Episode: 8 Total reward: 18.0 Explore P: 0.9798
Episode: 9 Total reward: 21.0 Explore P: 0.9778
Episode: 10 Total reward: 33.0 Explore P: 0.9746
Episode: 11 Total reward: 34.0 Explore P: 0.9713
Episode: 12 Total reward: 18.0 Explore P: 0.9696
Episode: 13 Total reward: 11.0 Explore P: 0.9685
Episode: 14 Total reward: 33.0 Explore P: 0.9654
Episode: 15 Total reward: 17.0 Explore P: 0.9638
Episode: 16 Total reward: 30.0 Explore P: 0.9609
Episode: 17 Total reward: 10.0 Explore P: 0.9599
Episode: 18 Total reward: 20.0 Explore P: 0.9580
Episode: 19 Total reward: 51.0 Explore P: 0.9532
Episode: 20 Total reward: 15.0 Explore P: 0.9518
Episode: 21 Total reward: 37.0

Episode: 164 Total reward: 199.0 Explore P: 0.4258
Consecutive Win...
Episode: 165 Total reward: 199.0 Explore P: 0.4176
Consecutive Win...
Episode: 166 Total reward: 199.0 Explore P: 0.4095
Consecutive Win...
Episode: 167 Total reward: 136.0 Explore P: 0.4041
Episode: 168 Total reward: 199.0 Explore P: 0.3964
Consecutive Win...
Episode: 169 Total reward: 199.0 Explore P: 0.3888
Consecutive Win...
Episode: 170 Total reward: 199.0 Explore P: 0.3813
Consecutive Win...
Episode: 171 Total reward: 199.0 Explore P: 0.3740
Consecutive Win...
Episode: 172 Total reward: 197.0 Explore P: 0.3669
Episode: 173 Total reward: 199.0 Explore P: 0.3599
Consecutive Win...
Episode: 174 Total reward: 199.0 Explore P: 0.3530
Consecutive Win...
Episode: 175 Total reward: 172.0 Explore P: 0.3471
Episode: 176 Total reward: 199.0 Explore P: 0.3405
Consecutive Win...
Episode: 177 Total reward: 199.0 Explore P: 0.3340
Consecutive Win...
Episode: 178 Total reward: 199.0 Explore P: 0.3276
Consecutive Win...
Episode

### Test Learned Model

In [7]:
# Run one episode with the learned network, always taking the greedy action.
# Reset environment
state = env.reset()

try:
    while True:
        # Act greedily
        action = mainQN(state)

        # Render the visual state in a window (comment out to run headless)
        env.render()

        # Step through environment using chosen action
        state, reward, done, _ = env.step(action)
        if done:
            break
finally:
    # Release the render window even if the rollout is interrupted
    env.close()