In [18]:
import gym
import numpy as np
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Activation
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.metrics import mse

In [28]:
class DQNAgent:
    #define hyper parameter
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 2000)
        self.gamma = 0.95 #reward discount rate
        self.epsilon = 1.0 #exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    #build Deep Q learning model
    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim = self.state_size))
        model.add(Dense(24, activation = 'relu'))
        model.add(Dense(self.action_size, activation = 'linear'))
        model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        return model
    
    
    #to explore new action, or exploit the action that lead to maxmize 
    def act(self, state):
        # explore
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        #return action that lead to biggest predicted reward
        act_values = self.model.predict(state)
        return np.argmax(act_values[0]) 
        
        
        
    #create replay memory
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
   
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            #calculate the target Q values
            if done == True:
                target_q = reward 
            else:
                target_q = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
        
            
            #calculate the output Q values
            output_q = self.model.predict(state)
            #replace with target action_state pair
            output_q[0][action] = target_q
    
            #fit the weights to the data
            self.model.fit(state, output_q, epochs = 1, verbose = 0)
        
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
        
        
        
        

Training the DQN agent

In [None]:
if __name__ == "__main__":
    #initialize environment and the agent
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 32
    
    
    #iterate the game
    for e in range(5000):
        #reset state 
        state = env.reset()
        state = np.reshape(state,[1,4])
    
        #iterate the time step
        for time_step in range(500):
            #env.render()
            action = agent.act(state)
            
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1,4])
            
            #put experience to the replay memory
            agent.remember(state, action, reward, next_state, done)
            
            #make next state the current state
            state = next_state
            
            if done:
                print("episode: {}/{}, score: {}".format(e, 5000, time_step))
                break
            
            if len(agent.memory) > batch_size: 
                agent.replay(batch_size)
                
            
            
        
        
        

[2019-06-12 14:53:41,206] Making new env: CartPole-v0


episode: 0/5000, score: 17
episode: 1/5000, score: 14
episode: 2/5000, score: 11
episode: 3/5000, score: 9
episode: 4/5000, score: 9
episode: 5/5000, score: 9
episode: 6/5000, score: 8
episode: 7/5000, score: 9
episode: 8/5000, score: 8
episode: 9/5000, score: 8
episode: 10/5000, score: 8
episode: 11/5000, score: 10
episode: 12/5000, score: 9
episode: 13/5000, score: 7
episode: 14/5000, score: 9
episode: 15/5000, score: 8
episode: 16/5000, score: 8
episode: 17/5000, score: 8
episode: 18/5000, score: 8
episode: 19/5000, score: 60
episode: 20/5000, score: 62
episode: 21/5000, score: 85
episode: 22/5000, score: 139
episode: 23/5000, score: 47
episode: 24/5000, score: 53
episode: 25/5000, score: 30
episode: 26/5000, score: 53
episode: 27/5000, score: 36
episode: 28/5000, score: 39
episode: 29/5000, score: 88
episode: 30/5000, score: 41
episode: 31/5000, score: 25
episode: 32/5000, score: 36
episode: 33/5000, score: 95
episode: 34/5000, score: 28
episode: 35/5000, score: 44
episode: 36/5000