# CART-POLE BALANCING THROUGH DQN
### Author: Suhan Shetty, suhan.n.shetty@gmail.com

In [1]:
# Import the dependencies
import tensorflow as tf      
import numpy as np          
import random                                
from collections import deque 
import os 
import random
import gym
import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore') 

In [2]:
# Instantiate the cart-pole environment.
# `env` is a notebook-level global: later cells read its observation/action
# spaces and the training loop calls reset()/step()/render() on it.
# NOTE(review): the logged scores in the training output never exceed 200, which
# suggests this environment ends episodes after 200 steps - confirm against the gym docs.
env = gym.make('CartPole-v0') 

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Problem dimensions, read once from the environment:
#   state_size  - length of the observation vector (first axis of its shape)
#   action_size - number of discrete actions
state_size, action_size = env.observation_space.shape[0], env.action_space.n
print("Size of State Space: ", state_size)
print("Number of Actions: ", action_size)

Size of State Space:  4
Number of Actions:  2


In [4]:
# Directory that receives the model checkpoints.
output_dir = "./cartpole_model"
# exist_ok=True makes the cell idempotent and avoids the race between a
# separate "exists?" check and the directory creation.
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Neural Network Architecture:
class DQNetwork:
    """Fully connected Q-network built with the TensorFlow 1.x graph API.

    Inside variable scope ``name`` the constructor creates:
      * placeholders for the states, the one-hot actions and the Q targets,
      * two hidden ReLU layers of 24 units each,
      * a linear output layer with one Q-value per action,
      * a mean-squared-error loss between target and predicted Q-value of the
        action that was taken, plus an Adam training op.

    Args:
        state_size: Length of the state vector.
        action_size: Number of discrete actions (also the output layer width).
        learning_rate: Learning rate handed to ``tf.train.AdamOptimizer``.
        name: Variable scope under which all ops and variables are created.
    """
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Placeholders. All of them carry a singleton middle axis:
            # (batch, 1, state_size) for states, (batch, 1, action_size) otherwise.
            self.inputs_ = tf.placeholder(tf.float32, [None, 1, self.state_size], name="inputs")
            # One-hot encoding of the action taken in each sample.
            self.actions_ = tf.placeholder(tf.float32, [None, 1, self.action_size], name="actions_")

            # target_Q = R(s,a) + gamma * max_a' Q(s', a').
            # Only the entry selected by `actions_` contributes to the loss.
            self.target_Q = tf.placeholder(tf.float32, [None, 1, self.action_size], name="target")

            self.fc1 = tf.layers.dense(inputs=self.inputs_,
                                       units=24,
                                       activation=tf.nn.relu,
                                       name="fc1")

            self.fc2 = tf.layers.dense(inputs=self.fc1,
                                       units=24,
                                       activation=tf.nn.relu,
                                       name="fc2")

            # One Q-value per action. The width follows `action_size` instead of a
            # hard-coded 2, and `activation=None` is the documented way to request
            # a linear (identity) output layer.
            self.output = tf.layers.dense(inputs=self.fc2,
                                          units=self.action_size,
                                          activation=None,
                                          name="output")

            # Mask with the one-hot action and sum over the action axis: this keeps
            # only the predicted / target Q-value of the action actually taken.
            self.Q_reduced = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=2)
            self.target_Q_reduced = tf.reduce_sum(tf.multiply(self.target_Q, self.actions_), axis=2)
            self.square_loss = tf.square(self.target_Q_reduced - self.Q_reduced)
            self.loss = tf.reduce_mean(self.square_loss)

            # Training op: one Adam step on the mean squared error.
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [6]:
# Replay Memory
class Memory():
    """Fixed-capacity experience store backed by a deque.

    Once `max_size` items are held, appending a new one drops the oldest.
    """
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences, drawn at random, as a list.

        Draws without replacement, so numpy raises ValueError when
        `batch_size` exceeds the number of stored experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        picked = []
        for pos in chosen:
            picked.append(self.buffer[pos])
        return picked

In [7]:
def predict_action(decay_rate, decay_step, state, action_size,
                   epsilon_init=1.0, epsilon_final=0.01):
    """Choose an action with an epsilon-greedy policy.

    With probability epsilon a uniformly random action is taken; otherwise
    a_t = argmax_a Q(s_t, a) is taken, with Q evaluated by the network.
    Epsilon decays linearly with the step count and is floored at `epsilon_final`:
        epsilon = max(epsilon_init - decay_rate * decay_step, epsilon_final)

    The greedy branch reads the notebook-level globals `sess` (an open
    tf.Session) and `DQNetwork` (the network instance).

    Args:
        decay_rate: Amount subtracted from epsilon per step.
        decay_step: Number of steps taken so far.
        state: Current state as a numpy array; a leading batch axis is added
            before it is fed to the network.
        action_size: Number of discrete actions.
        epsilon_init: Exploration probability at step 0.
        epsilon_final: Lower bound on the exploration probability.

    Returns:
        (action, epsilon): `action` is a one-hot integer vector of length
        `action_size`; `epsilon` is the exploration probability that was used.
    """
    epsilon_ = max(epsilon_init - decay_rate * decay_step, epsilon_final)

    # One-hot container sized from action_size (previously hard-coded to 2 entries,
    # which broke for any environment with more than two actions).
    action = np.zeros(action_size, dtype=int)

    if np.random.rand() < epsilon_:
        # Exploration: uniformly random action.
        action_choice = random.randrange(action_size)
    else:
        # Exploitation: evaluate Q(s, .) and take the best action.
        Qs = sess.run(DQNetwork.output,
                      feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})
        # A single state is fed, so the flat argmax is the action index.
        action_choice = int(np.argmax(Qs))

    action[action_choice] = 1
    return action, epsilon_

In [8]:
### MEMORY HYPERPARAMETERS
memory_size = 2000 # Maximum number of experiences the replay Memory keeps (oldest are dropped first)

# Instantiate the replay memory shared by the training loop
memory = Memory(max_size = memory_size)

### MODEL HYPERPARAMETERS
            
learning_rate =  0.001 # Learning rate passed to the network's Adam optimizer (author's note: 0.0001 works well)

### TRAINING HYPERPARAMETERS
total_episodes = 2000 # Number of episodes the training loop runs
episode_size = 1000  # Step cap per episode (the training loop stops an episode after 1000 steps)
batch_size = 32          # Number of experiences sampled from memory per weight update

# Exploration parameters for epsilon greedy strategy
# NOTE(review): explore_start and explore_stop are not read by any visible cell;
# predict_action uses its own start value of 1 and floor of 0.01.
explore_start = 1.0  # exploration probability at start
explore_stop = 0.01 # minimum exploration probability 
decay_rate = 0.00005 # amount subtracted from epsilon per environment step (linear decay, not exponential)

# Q learning hyperparameters
gamma = 0.95 # Discounting rate applied to max Q(s', a') in the training target

# Echo the memory capacity as a quick sanity check
print(memory_size)

2000


In [9]:
# Reset the graph so the network's variables are created in a fresh default graph
tf.reset_default_graph()
# This cell rebinds the name `DQNetwork` from the class to an instance, and the
# other cells use the instance through that name. Recover the class first so that
# re-running the cell rebuilds the network instead of failing with
# "'DQNetwork' object is not callable".
_network_cls = DQNetwork if isinstance(DQNetwork, type) else type(DQNetwork)
DQNetwork = _network_cls(state_size, action_size, learning_rate)

In [10]:
saver = tf.train.Saver() # Need this to save our model
# Checkpoint file inside the directory created earlier in the notebook.
checkpoint_path = os.path.join(output_dir, "model.ckpt")

with tf.Session() as sess:

    # Initialize the variables
    sess.run(tf.global_variables_initializer())
    # saver.restore(sess, checkpoint_path)  # uncomment to resume from a saved checkpoint

    decay_step = 0  # environment steps taken so far; drives the epsilon schedule
    loss = None     # last minibatch loss; stays None until the first weight update

    for episode in range(total_episodes):
        # Initialize the environment
        state = env.reset().reshape(1, state_size)
        done = False
        step_ = 0

        # Run an episode
        while (not done) and (step_ < episode_size):
            step_ += 1
            decay_step += 1
            env.render()
            # Predict which action to apply (exploration vs. exploitation)
            action, explore_probability = predict_action(decay_rate, decay_step, state, action_size)
            # `action` is one-hot; its argmax is the discrete action index
            # (equal to action[1] when there are exactly two actions).
            next_state, reward, done, _ = env.step(int(np.argmax(action)))
            next_state = next_state.reshape(1, state_size)
            action = action.reshape(1, action_size)
            # Reward: the environment's reward for a regular step, -10 on the step that ends the episode.
            # NOTE(review): the logged scores never exceed 200, which suggests the
            # environment itself ends episodes at 200 steps; if so, those time-outs
            # also receive the -10 penalty - confirm whether that is intended.
            reward = reward if not done else -10
            # Add the transition to the replay memory
            memory.add((state, action, reward, next_state, done))
            state = np.copy(next_state)

            if done:
                # print the episode's score and agent's epsilon
                print("episode: {}, score: {}, epsilon: {:.2}".format(episode, step_, explore_probability))

        # One weight update per episode (this block sits outside the step loop):
        # sample a random minibatch from memory and take one optimizer step.
        if len(memory.buffer) > batch_size:
            batch = memory.sample(batch_size)
            # Distinct *_b names keep the episode's own state/action/done untouched.
            states_b, actions_b, rewards_b, next_states_b, dones_b = zip(*batch)

            states_batch = np.array(states_b).reshape(batch_size, 1, state_size)
            actions_batch = np.array(actions_b).reshape(batch_size, 1, action_size)
            next_states_batch = np.array(next_states_b).reshape(batch_size, 1, state_size)
            rewards_batch = np.array(rewards_b, dtype=np.float32)
            terminal_batch = np.array(dones_b, dtype=bool)

            # Q(s', a') for the whole minibatch in a single forward pass
            # (instead of one session call per sample).
            Q_next = sess.run(DQNetwork.output,
                              feed_dict={DQNetwork.inputs_: next_states_batch})
            max_Q_next = Q_next.reshape(batch_size, action_size).max(axis=1)

            # target = reward                              if s' is terminal
            #        = reward + gamma * max_a' Q(s', a')   otherwise
            Q_targets = np.where(terminal_batch,
                                 rewards_batch,
                                 rewards_batch + gamma * max_Q_next)

            # The loss masks targets with the one-hot action, so only the taken
            # action's slot matters; the remaining slots are left at zero.
            targets_batch = actions_batch * Q_targets.reshape(batch_size, 1, 1)

            # Update weights to minimize the loss
            loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                               feed_dict={DQNetwork.inputs_: states_batch,
                                          DQNetwork.target_Q: targets_batch,
                                          DQNetwork.actions_: actions_batch})

        # Checkpoint and report every 10th episode once past the first `batch_size` episodes
        if episode % 10 == 0 and episode > batch_size:
            save_path = saver.save(sess, checkpoint_path)
            if loss is not None:  # no update has happened yet -> nothing to report
                print('Episode: {}'.format(episode),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore Prob: {:.4f}'.format(explore_probability))

    print("Training is done")


episode: 0, score: 11, epsilon: 1.0
episode: 1, score: 13, epsilon: 1.0
episode: 2, score: 12, epsilon: 1.0
episode: 3, score: 11, epsilon: 1.0
episode: 4, score: 25, epsilon: 1.0
episode: 5, score: 18, epsilon: 1.0
episode: 6, score: 28, epsilon: 0.99
episode: 7, score: 19, epsilon: 0.99
episode: 8, score: 24, epsilon: 0.99
episode: 9, score: 22, epsilon: 0.99
episode: 10, score: 18, epsilon: 0.99
episode: 11, score: 26, epsilon: 0.99
episode: 12, score: 10, epsilon: 0.99
episode: 13, score: 22, epsilon: 0.99
episode: 14, score: 31, epsilon: 0.99
episode: 15, score: 17, epsilon: 0.98
episode: 16, score: 34, epsilon: 0.98
episode: 17, score: 14, epsilon: 0.98
episode: 18, score: 10, epsilon: 0.98
episode: 19, score: 26, epsilon: 0.98
episode: 20, score: 18, epsilon: 0.98
episode: 21, score: 41, epsilon: 0.98
episode: 22, score: 18, epsilon: 0.98
episode: 23, score: 14, epsilon: 0.98
episode: 24, score: 21, epsilon: 0.97
episode: 25, score: 10, epsilon: 0.97
episode: 26, score: 41, epsi

episode: 191, score: 19, epsilon: 0.78
episode: 192, score: 15, epsilon: 0.78
episode: 193, score: 20, epsilon: 0.78
episode: 194, score: 15, epsilon: 0.78
episode: 195, score: 13, epsilon: 0.78
episode: 196, score: 23, epsilon: 0.78
episode: 197, score: 30, epsilon: 0.77
episode: 198, score: 14, epsilon: 0.77
episode: 199, score: 17, epsilon: 0.77
episode: 200, score: 17, epsilon: 0.77
Episode: 200 Training loss: 18.2467 Explore Prob: 0.7722
episode: 201, score: 34, epsilon: 0.77
episode: 202, score: 34, epsilon: 0.77
episode: 203, score: 18, epsilon: 0.77
episode: 204, score: 9, epsilon: 0.77
episode: 205, score: 13, epsilon: 0.77
episode: 206, score: 14, epsilon: 0.77
episode: 207, score: 18, epsilon: 0.77
episode: 208, score: 10, epsilon: 0.76
episode: 209, score: 17, epsilon: 0.76
episode: 210, score: 14, epsilon: 0.76
Episode: 210 Training loss: 8.2081 Explore Prob: 0.7631
episode: 211, score: 10, epsilon: 0.76
episode: 212, score: 16, epsilon: 0.76
episode: 213, score: 18, epsil

episode: 377, score: 28, epsilon: 0.56
episode: 378, score: 25, epsilon: 0.56
episode: 379, score: 17, epsilon: 0.56
episode: 380, score: 12, epsilon: 0.56
Episode: 380 Training loss: 0.8643 Explore Prob: 0.5609
episode: 381, score: 23, epsilon: 0.56
episode: 382, score: 13, epsilon: 0.56
episode: 383, score: 15, epsilon: 0.56
episode: 384, score: 14, epsilon: 0.56
episode: 385, score: 14, epsilon: 0.56
episode: 386, score: 15, epsilon: 0.56
episode: 387, score: 12, epsilon: 0.56
episode: 388, score: 11, epsilon: 0.56
episode: 389, score: 28, epsilon: 0.55
episode: 390, score: 14, epsilon: 0.55
Episode: 390 Training loss: 13.7650 Explore Prob: 0.5530
episode: 391, score: 13, epsilon: 0.55
episode: 392, score: 27, epsilon: 0.55
episode: 393, score: 17, epsilon: 0.55
episode: 394, score: 16, epsilon: 0.55
episode: 395, score: 13, epsilon: 0.55
episode: 396, score: 34, epsilon: 0.55
episode: 397, score: 16, epsilon: 0.55
episode: 398, score: 18, epsilon: 0.55
episode: 399, score: 11, epsi

episode: 562, score: 50, epsilon: 0.23
episode: 563, score: 46, epsilon: 0.23
episode: 564, score: 33, epsilon: 0.23
episode: 565, score: 29, epsilon: 0.22
episode: 566, score: 26, epsilon: 0.22
episode: 567, score: 42, epsilon: 0.22
episode: 568, score: 54, epsilon: 0.22
episode: 569, score: 19, epsilon: 0.22
episode: 570, score: 37, epsilon: 0.22
Episode: 570 Training loss: 0.5330 Explore Prob: 0.2161
episode: 571, score: 28, epsilon: 0.21
episode: 572, score: 35, epsilon: 0.21
episode: 573, score: 41, epsilon: 0.21
episode: 574, score: 30, epsilon: 0.21
episode: 575, score: 28, epsilon: 0.21
episode: 576, score: 34, epsilon: 0.21
episode: 577, score: 40, epsilon: 0.2
episode: 578, score: 36, epsilon: 0.2
episode: 579, score: 44, epsilon: 0.2
episode: 580, score: 27, epsilon: 0.2
Episode: 580 Training loss: 0.4869 Explore Prob: 0.1989
episode: 581, score: 44, epsilon: 0.2
episode: 582, score: 43, epsilon: 0.19
episode: 583, score: 37, epsilon: 0.19
episode: 584, score: 33, epsilon: 0

episode: 746, score: 51, epsilon: 0.01
episode: 747, score: 92, epsilon: 0.01
episode: 748, score: 74, epsilon: 0.01
episode: 749, score: 77, epsilon: 0.01
episode: 750, score: 85, epsilon: 0.01
Episode: 750 Training loss: 11.7727 Explore Prob: 0.0100
episode: 751, score: 119, epsilon: 0.01
episode: 752, score: 88, epsilon: 0.01
episode: 753, score: 86, epsilon: 0.01
episode: 754, score: 80, epsilon: 0.01
episode: 755, score: 78, epsilon: 0.01
episode: 756, score: 76, epsilon: 0.01
episode: 757, score: 91, epsilon: 0.01
episode: 758, score: 66, epsilon: 0.01
episode: 759, score: 125, epsilon: 0.01
episode: 760, score: 71, epsilon: 0.01
Episode: 760 Training loss: 0.2258 Explore Prob: 0.0100
episode: 761, score: 190, epsilon: 0.01
episode: 762, score: 97, epsilon: 0.01
episode: 763, score: 77, epsilon: 0.01
episode: 764, score: 82, epsilon: 0.01
episode: 765, score: 71, epsilon: 0.01
episode: 766, score: 64, epsilon: 0.01
episode: 767, score: 46, epsilon: 0.01
episode: 768, score: 41, e

episode: 930, score: 99, epsilon: 0.01
Episode: 930 Training loss: 0.1973 Explore Prob: 0.0100
episode: 931, score: 96, epsilon: 0.01
episode: 932, score: 125, epsilon: 0.01
episode: 933, score: 120, epsilon: 0.01
episode: 934, score: 70, epsilon: 0.01
episode: 935, score: 54, epsilon: 0.01
episode: 936, score: 68, epsilon: 0.01
episode: 937, score: 43, epsilon: 0.01
episode: 938, score: 64, epsilon: 0.01
episode: 939, score: 53, epsilon: 0.01
episode: 940, score: 54, epsilon: 0.01
Episode: 940 Training loss: 0.3684 Explore Prob: 0.0100
episode: 941, score: 64, epsilon: 0.01
episode: 942, score: 89, epsilon: 0.01
episode: 943, score: 58, epsilon: 0.01
episode: 944, score: 89, epsilon: 0.01
episode: 945, score: 114, epsilon: 0.01
episode: 946, score: 200, epsilon: 0.01
episode: 947, score: 165, epsilon: 0.01
episode: 948, score: 181, epsilon: 0.01
episode: 949, score: 69, epsilon: 0.01
episode: 950, score: 63, epsilon: 0.01
Episode: 950 Training loss: 7.1511 Explore Prob: 0.0100
episode

episode: 1110, score: 200, epsilon: 0.01
Episode: 1110 Training loss: 18.1408 Explore Prob: 0.0100
episode: 1111, score: 200, epsilon: 0.01
episode: 1112, score: 166, epsilon: 0.01
episode: 1113, score: 123, epsilon: 0.01
episode: 1114, score: 81, epsilon: 0.01
episode: 1115, score: 87, epsilon: 0.01
episode: 1116, score: 73, epsilon: 0.01
episode: 1117, score: 76, epsilon: 0.01
episode: 1118, score: 66, epsilon: 0.01
episode: 1119, score: 57, epsilon: 0.01
episode: 1120, score: 48, epsilon: 0.01
Episode: 1120 Training loss: 0.2137 Explore Prob: 0.0100
episode: 1121, score: 41, epsilon: 0.01
episode: 1122, score: 41, epsilon: 0.01
episode: 1123, score: 40, epsilon: 0.01
episode: 1124, score: 34, epsilon: 0.01
episode: 1125, score: 37, epsilon: 0.01
episode: 1126, score: 33, epsilon: 0.01
episode: 1127, score: 39, epsilon: 0.01
episode: 1128, score: 43, epsilon: 0.01
episode: 1129, score: 45, epsilon: 0.01
episode: 1130, score: 33, epsilon: 0.01
Episode: 1130 Training loss: 45.5472 Expl

episode: 1288, score: 200, epsilon: 0.01
episode: 1289, score: 200, epsilon: 0.01
episode: 1290, score: 200, epsilon: 0.01
Episode: 1290 Training loss: 0.1036 Explore Prob: 0.0100
episode: 1291, score: 200, epsilon: 0.01
episode: 1292, score: 200, epsilon: 0.01
episode: 1293, score: 200, epsilon: 0.01
episode: 1294, score: 200, epsilon: 0.01
episode: 1295, score: 200, epsilon: 0.01
episode: 1296, score: 200, epsilon: 0.01
episode: 1297, score: 200, epsilon: 0.01
episode: 1298, score: 200, epsilon: 0.01
episode: 1299, score: 200, epsilon: 0.01
episode: 1300, score: 200, epsilon: 0.01
Episode: 1300 Training loss: 0.0971 Explore Prob: 0.0100
episode: 1301, score: 200, epsilon: 0.01
episode: 1302, score: 200, epsilon: 0.01
episode: 1303, score: 200, epsilon: 0.01
episode: 1304, score: 200, epsilon: 0.01
episode: 1305, score: 200, epsilon: 0.01
episode: 1306, score: 145, epsilon: 0.01
episode: 1307, score: 113, epsilon: 0.01
episode: 1308, score: 82, epsilon: 0.01
episode: 1309, score: 71, 

episode: 1465, score: 200, epsilon: 0.01
episode: 1466, score: 118, epsilon: 0.01
episode: 1467, score: 111, epsilon: 0.01
episode: 1468, score: 200, epsilon: 0.01
episode: 1469, score: 134, epsilon: 0.01
episode: 1470, score: 98, epsilon: 0.01
Episode: 1470 Training loss: 16.3158 Explore Prob: 0.0100
episode: 1471, score: 89, epsilon: 0.01
episode: 1472, score: 66, epsilon: 0.01
episode: 1473, score: 90, epsilon: 0.01
episode: 1474, score: 72, epsilon: 0.01
episode: 1475, score: 85, epsilon: 0.01
episode: 1476, score: 82, epsilon: 0.01
episode: 1477, score: 72, epsilon: 0.01
episode: 1478, score: 73, epsilon: 0.01
episode: 1479, score: 95, epsilon: 0.01
episode: 1480, score: 73, epsilon: 0.01
Episode: 1480 Training loss: 4.9542 Explore Prob: 0.0100
episode: 1481, score: 99, epsilon: 0.01
episode: 1482, score: 86, epsilon: 0.01
episode: 1483, score: 200, epsilon: 0.01
episode: 1484, score: 132, epsilon: 0.01
episode: 1485, score: 200, epsilon: 0.01
episode: 1486, score: 200, epsilon: 0

episode: 1641, score: 200, epsilon: 0.01
episode: 1642, score: 200, epsilon: 0.01
episode: 1643, score: 190, epsilon: 0.01
episode: 1644, score: 200, epsilon: 0.01
episode: 1645, score: 200, epsilon: 0.01
episode: 1646, score: 200, epsilon: 0.01
episode: 1647, score: 200, epsilon: 0.01
episode: 1648, score: 200, epsilon: 0.01
episode: 1649, score: 200, epsilon: 0.01
episode: 1650, score: 200, epsilon: 0.01
Episode: 1650 Training loss: 0.0813 Explore Prob: 0.0100
episode: 1651, score: 200, epsilon: 0.01
episode: 1652, score: 200, epsilon: 0.01
episode: 1653, score: 200, epsilon: 0.01
episode: 1654, score: 200, epsilon: 0.01
episode: 1655, score: 200, epsilon: 0.01
episode: 1656, score: 200, epsilon: 0.01
episode: 1657, score: 200, epsilon: 0.01
episode: 1658, score: 200, epsilon: 0.01
episode: 1659, score: 200, epsilon: 0.01
episode: 1660, score: 200, epsilon: 0.01
Episode: 1660 Training loss: 0.1114 Explore Prob: 0.0100
episode: 1661, score: 200, epsilon: 0.01
episode: 1662, score: 200

episode: 1818, score: 200, epsilon: 0.01
episode: 1819, score: 200, epsilon: 0.01
episode: 1820, score: 200, epsilon: 0.01
Episode: 1820 Training loss: 0.0614 Explore Prob: 0.0100
episode: 1821, score: 200, epsilon: 0.01
episode: 1822, score: 200, epsilon: 0.01
episode: 1823, score: 200, epsilon: 0.01
episode: 1824, score: 200, epsilon: 0.01
episode: 1825, score: 200, epsilon: 0.01
episode: 1826, score: 200, epsilon: 0.01
episode: 1827, score: 200, epsilon: 0.01
episode: 1828, score: 200, epsilon: 0.01
episode: 1829, score: 200, epsilon: 0.01
episode: 1830, score: 200, epsilon: 0.01
Episode: 1830 Training loss: 0.1306 Explore Prob: 0.0100
episode: 1831, score: 200, epsilon: 0.01
episode: 1832, score: 200, epsilon: 0.01
episode: 1833, score: 200, epsilon: 0.01
episode: 1834, score: 200, epsilon: 0.01
episode: 1835, score: 200, epsilon: 0.01
episode: 1836, score: 200, epsilon: 0.01
episode: 1837, score: 200, epsilon: 0.01
episode: 1838, score: 200, epsilon: 0.01
episode: 1839, score: 200

episode: 1993, score: 200, epsilon: 0.01
episode: 1994, score: 200, epsilon: 0.01
episode: 1995, score: 200, epsilon: 0.01
episode: 1996, score: 200, epsilon: 0.01
episode: 1997, score: 200, epsilon: 0.01
episode: 1998, score: 200, epsilon: 0.01
episode: 1999, score: 200, epsilon: 0.01
Training is done
