In [1]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()
       
 #      if self.load_model:
 #            self.model.load_weights("./save_model/cartpole_dqn.h5")
              
       

    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # pick samples randomly from replay memory (with batch_size)
    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input) #update_input == s_t =~current_state
        target_val = self.target_model.predict(update_target) #Q_sa == target_val, update_target = s_t1 =~ next_state

        for i in range(self.batch_size):
            # Q Learning: get maximum Q value at s' from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        # and do the model fit!
        self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)

In [3]:


if __name__ == "__main__":
    # In case of CartPole-v1, maximum length of episode is 500
    env = gym.make('CartPole-v1')
    # get size of state and action from environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []
    
    EPISODES = 300

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            
            env.render()

            # get action for the current state and go one step in environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # if an action make the episode end, then gives penalty of -100
            reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(state, action, reward, next_state, done)
            # every time step do the training
            agent.train_model()
            score += reward
            state = next_state

            if done:
                # every episode update the target model to be same with model
                agent.update_target_model()

                # every episode, plot the play time
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
               # pylab.savefig("./save_graph/cartpole_dqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # if the mean of scores of last 10 episode is bigger than 490
                # stop training
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    sys.exit()

        # save the model
 #       if e % 50 == 0:
 #           agent.model.save_weights("./save_model/cartpole_dqn.h5")



[2017-12-27 21:04:26,488] Making new env: CartPole-v1


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_5 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_6 (Den

('episode:', 74, '  score:', 8.0, '  memory length:', 1052, '  epsilon:', 0.3490547993433876)
('episode:', 75, '  score:', 13.0, '  memory length:', 1066, '  epsilon:', 0.3441996694320795)
('episode:', 76, '  score:', 8.0, '  memory length:', 1075, '  epsilon:', 0.34111423472584396)
('episode:', 77, '  score:', 10.0, '  memory length:', 1086, '  epsilon:', 0.33738068325533116)
('episode:', 78, '  score:', 12.0, '  memory length:', 1099, '  epsilon:', 0.3330209538162239)
('episode:', 79, '  score:', 8.0, '  memory length:', 1108, '  epsilon:', 0.3300357260543739)
('episode:', 80, '  score:', 8.0, '  memory length:', 1117, '  epsilon:', 0.32707725812456445)
('episode:', 81, '  score:', 10.0, '  memory length:', 1128, '  epsilon:', 0.323497343674428)
('episode:', 82, '  score:', 13.0, '  memory length:', 1142, '  epsilon:', 0.3189977016914014)
('episode:', 83, '  score:', 12.0, '  memory length:', 1155, '  epsilon:', 0.3148755223844758)
('episode:', 84, '  score:', 24.0, '  memory length:

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
env.render()


In [21]:
#feelcomfortable
import gym
import random
import numpy as np
from collections import deque
env = gym.make('CartPole-v1')
state = env.reset()
print state
print state.shape
state = np.reshape(state, [1, state.size])
print state
print state.shape
print state.size
print env.observation_space
print env.observation_space.shape
print env.observation_space.shape[0]
action = random.randrange(env.action_space.n)
next_state, reward, done, info = env.step(action)
print next_state
next_state = np.reshape(state, [1, next_state.size])
print next_state

memory = deque(maxlen=5)
memory.append((state, action, reward, next_state, done))
print memory

[2017-12-27 18:03:42,276] Making new env: CartPole-v1


[ 0.01314833  0.00989805  0.0278539   0.03632513]
(4,)
[[ 0.01314833  0.00989805  0.0278539   0.03632513]]
(1, 4)
4
Box(4,)
(4,)
4
[ 0.0133463   0.20460973  0.0285804  -0.2474411 ]
[[ 0.01314833  0.00989805  0.0278539   0.03632513]]
deque([(array([[ 0.01314833,  0.00989805,  0.0278539 ,  0.03632513]]), 1, 1.0, array([[ 0.01314833,  0.00989805,  0.0278539 ,  0.03632513]]), False)], maxlen=5)
