# Deep Q-Learning Cartpole

Using what I learned in Lazy Programmer's Udemy class *Deep Reinforcement Learning in Python*, I'm going to try to create my own Deep Q-Network (DQN) that can solve OpenAI Gym's CartPole problem.



In [9]:
import gym
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import SGD, Adam, RMSprop
import random
from collections import deque

In [2]:
# Create DQN class.
class Agent:
    """Deep Q-Network agent with experience replay.

    A small fully connected network (two hidden layers of 24 ReLU units and
    a linear output layer) approximates Q[s, a]: it takes a state vector and
    outputs one value per action.

    Attributes:
        action_num: Number of discrete actions (size of the output layer).
        gamma: Discount rate applied to the bootstrapped next-state value.
        batch: Number of transitions trained on per call to ``UpdateQ``.
        experiences: Bounded deque of ``(s, a, r, s2, episode_over)`` tuples;
            the oldest entries are dropped once ``maxlen`` is reached.
        model: The compiled Keras model.
    """

    def __init__(self, state_num, action_num, gamma=0.95, batch=64,
                 memory_size=2000, learning_rate=0.001):
        """Build the replay memory and the Q-network.

        Args:
            state_num: Number of components in a state vector (input size).
            action_num: Number of discrete actions (output size).
            gamma: Discount rate.
            batch: Number of replayed experiences batched together.
            memory_size: Maximum number of transitions kept for replay.
            learning_rate: Step size passed to the Adam optimizer.
        """
        self.action_num = action_num
        self.gamma = gamma
        self.batch = batch
        self.experiences = deque(maxlen=memory_size)

        # Neural network acting as a function approximator for Q[s, a].
        self.model = Sequential()
        # First dense layer with relu activation.
        self.model.add(Dense(24, input_dim=state_num))
        self.model.add(Activation('relu'))
        # Second dense layer with relu activation.
        self.model.add(Dense(24))
        self.model.add(Activation('relu'))
        # Output layer with linear activation: one unbounded value per action.
        self.model.add(Dense(self.action_num))
        self.model.add(Activation('linear'))
        # Mean squared error between predicted and target Q-values. No
        # classification metric is tracked: the outputs are regression
        # targets, so an accuracy figure would carry no meaning here.
        self.model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

        self.model.summary()

    def ExploitAction(self, state):
        """Return the index of the action with the highest predicted value.

        Args:
            state: A single state shaped ``(1, state_num)``; only the first
                row of the prediction is inspected.
        """
        action_values = self.model.predict(state)
        return np.argmax(action_values[0])

    def ExploreAction(self):
        """Return a uniformly random action index in ``[0, action_num)``."""
        return np.random.choice(self.action_num, 1)[0]

    def AddExperience(self, s, a, r, s2, episode_over):
        """Append one transition to the replay memory."""
        self.experiences.append((s, a, r, s2, episode_over))

    def UpdateQ(self, s, a, r, s2, episode_over):
        """Store a transition and train on it together with replayed ones.

        Until the memory holds at least ``batch`` transitions the new one is
        only stored. After that, ``batch - 1`` stored transitions are sampled
        without replacement, the new transition is appended to them, and the
        network is fitted once on the whole batch.

        The batch is evaluated with two ``predict`` calls and a single
        ``fit`` call instead of one ``predict``/``fit`` round trip per
        transition, so every target in the batch is computed from the same
        network weights.

        Args:
            s: State the action was taken in.
            a: Index of the action taken.
            r: Reward received.
            s2: Resulting state.
            episode_over: True if ``s2`` ended the episode; no future value
                is bootstrapped for such transitions.
        """
        if len(self.experiences) < self.batch:
            self.AddExperience(s, a, r, s2, episode_over)
            return

        # Sample before storing so the newest transition appears exactly once.
        replay = random.sample(self.experiences, self.batch - 1)
        replay.append((s, a, r, s2, episode_over))
        self.AddExperience(s, a, r, s2, episode_over)

        # Stack per-transition fields into arrays with one row per transition.
        states = np.vstack([exp[0] for exp in replay])
        actions = np.array([exp[1] for exp in replay], dtype=int)
        rewards = np.array([exp[2] for exp in replay], dtype=float)
        next_states = np.vstack([exp[3] for exp in replay])
        finished = np.array([exp[4] for exp in replay], dtype=bool)

        # Target is r for terminal transitions, r + gamma * max_a' Q(s2, a')
        # otherwise.
        best_next = np.amax(self.model.predict(next_states), axis=1)
        targets = np.where(finished, rewards, rewards + self.gamma * best_next)

        # Only the value of the action actually taken should change, so start
        # from the current predictions and overwrite that single entry per row.
        target_all = np.array(self.model.predict(states))
        target_all[np.arange(len(replay)), actions] = targets

        # Update the network with one gradient step over the whole batch.
        self.model.fit(states, target_all, batch_size=len(replay),
                       epochs=1, verbose=0)

In [3]:
def PlayEpisode(env, agent, epsilon, max_steps=500, failure_penalty=300):
    """Play one episode with an epsilon-greedy policy, training as it goes.

    Each observation is reshaped to a single-row batch ``(1, n)`` before it
    is handed to the agent; ``n`` is taken from the observation itself, so
    the function is not tied to a 4-component state.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        agent: Object providing ``ExploreAction()``, ``ExploitAction(state)``
            and ``UpdateQ(s, a, r, s2, episode_over)``.
        epsilon: Probability of taking an exploratory action on each step.
        max_steps: Step count at which an episode ending is treated as a
            success rather than a failure.
        failure_penalty: Amount subtracted from the training reward when the
            episode ends before ``max_steps``.

    Returns:
        The sum of the rewards reported by the environment. The failure
        penalty is applied only to the reward passed to the agent and is not
        included in this total.
    """
    # Reset playing environment.
    s_t0 = np.reshape(env.reset(), [1, -1])

    total_episode_reward = 0
    time_steps = 0
    episode_over = False

    while not episode_over:
        # Determine whether to explore or exploit.
        if np.random.random() < epsilon:
            a_t0 = agent.ExploreAction()
        else:
            a_t0 = agent.ExploitAction(s_t0)

        # Perform action and move to next state.
        s_t1, reward, episode_over, _ = env.step(a_t0)
        time_steps += 1

        # Accumulate the raw reward before any shaping is applied.
        total_episode_reward += reward

        # Ending early means the episode failed: make that final transition
        # strongly negative for the learner.
        if episode_over and (time_steps < max_steps):
            reward -= failure_penalty

        s_t1 = np.reshape(s_t1, [1, -1])

        # Update Q.
        agent.UpdateQ(s_t0, a_t0, reward, s_t1, episode_over)

        # t1 becomes t0.
        s_t0 = s_t1

    return total_episode_reward

## Create Agent and Environment

In [6]:
# Build the CartPole-v1 environment, then size the agent's network from it:
# one input per observation component and one output per discrete action.
environment = gym.make('CartPole-v1')

rl_agent = Agent(
    state_num=environment.observation_space.shape[0],
    action_num=environment.action_space.n,
)

[2017-08-18 21:52:30,038] Making new env: CartPole-v1


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
activation_4 (Activation)    (None, 24)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 24)                600       
_________________________________________________________________
activation_5 (Activation)    (None, 24)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 50        
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


## Run Agent

In [11]:
# Training run: play `episode_num` episodes with a decaying exploration rate
# and record the reward of each one.
episode_num = 100
all_episode_r = np.zeros(episode_num)  # Reward of every episode, by index.
max_episode_r = 0                      # Highest single-episode reward so far.
best_100_avg = 0                       # Best mean over 100 consecutive episodes.

for i in range(episode_num):
    # Reduce epsilon over time: 1.0 on the first episode, 1/sqrt(i+1) after.
    eps = 1.0/np.sqrt(i+1)

    # Play episode.
    ep_reward = PlayEpisode(environment, rl_agent, eps)
    print(i, ep_reward)

    # Record the result so it remains available after the loop, including
    # when the run is interrupted part-way through.
    all_episode_r[i] = ep_reward
    max_episode_r = max(max_episode_r, ep_reward)

    # Only full 100-episode windows count toward the best average, so it
    # stays at 0 until 100 episodes have been played.
    if i >= 99:
        best_100_avg = max(best_100_avg, all_episode_r[i - 99:i + 1].mean())

0 11.0
1 25.0
2 87.0
3 20.0
4 61.0


KeyboardInterrupt: 