In [1]:
import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf

In [2]:
# Identifier of the gym environment the agent is trained on.
ENV_ID = "CartPole-v0"
env = gym.make(ENV_ID)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Network dimensions are derived from the environment's spaces. Reading the
# observation space avoids resetting the environment just to measure one
# observation, and does not depend on what `env.reset()` returns.
num_input_neurons = env.observation_space.shape[0]
num_output_neurons = env.action_space.n
# Misspelled alias kept because other cells refer to this name.
num_ouptut_neurons = num_output_neurons
hidden_layer_dimensions = [16, 32, 32]

# Step size handed to the Adam optimizer of each network.
learning_rate = 0.0001

# Discount factor applied to the bootstrapped next-state value.
gamma = 0.95

# training logic configurations
max_num_episodes = 700  # upper bound on the number of training episodes
buffer_size = 10000  # maximum number of transitions kept for replay
mini_batch_size = 64  # transitions sampled per training step
steps_per_target_update = 7  # environment steps between target-network syncs
# Length of the reward window averaged for the stopping criterion.
consecutive_successful_episodes_to_stop = 20
# The windowed average reward must exceed this value for training to stop.
min_average_reward_for_stopping = 195

In [4]:
class DQN:
    """Fully connected Q-value network built with the TensorFlow 1.x graph API.

    All graph nodes are created inside `tf.variable_scope(scope_name)` so that
    the variables of one network can be selected by scope (see
    `create_copy_operations`).
    """

    def __init__(self,
                 session,
                 scope_name,
                 input_size,
                 hidden_layer_sizes,
                 output_size,
                 learning_rate):
        """Build the network graph.

        param: session:             `tf.Session` used to run every operation
        param: scope_name:          variable scope that holds this network
        param: input_size:          width of the input placeholder
        param: hidden_layer_sizes:  iterable with the unit count of each
                                    ReLU hidden layer, in order
        param: output_size:         width of the linear output layer
        param: learning_rate:       learning rate for the Adam optimizer
        """
        self.session = session
        self.scope_name = scope_name
        self.input_size = input_size
        self.hidden_layer_sizes = hidden_layer_sizes
        self.output_size = output_size
        self.learning_rate = learning_rate

        with tf.variable_scope(self.scope_name):
            self.input = tf.placeholder(shape=[None, self.input_size],
                                        dtype=tf.float32)

            net = self.input
            for layer_size in self.hidden_layer_sizes:
                net = tf.layers.dense(net,
                                      layer_size,
                                      activation=tf.nn.relu)

            # Linear output layer: one value per output unit.
            self.output = tf.layers.dense(net,
                                          self.output_size)

            # Placeholder for expected q-values
            self.y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)

            # Using the loss method provided by tf directly
            self.loss = tf.losses.mean_squared_error(self.y, self.output)

            # NOTE: despite the attribute name, this holds the training op
            # returned by `minimize`, not the optimizer object itself.
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)

    def predict(self, state):
        """Run the network forward.

        param: state:  a single state or a batch of states; it is reshaped to
                       `[-1, input_size]` before being fed
        return: the evaluated output layer, one row per fed state
        """
        return self.session.run(self.output,
                                feed_dict={self.input: np.reshape(state, [-1, self.input_size])})

    def update(self, state, y):
        """Run one optimization step.

        param: state:  batch fed to the input placeholder
        param: y:      batch of expected q-values fed to the `y` placeholder
        return: the result of running `[loss, training op]`
        """
        return self.session.run([self.loss, self.optimizer],
                                feed_dict={
                                    self.input: state,
                                    self.y: y
                                })

    @staticmethod
    def create_copy_operations(source_scope, dest_scope):
        """Create ops that copy the network weights of one scope into another.

        Only trainable variables are paired. The global-variables collection
        would also pick up optimizer state (for example Adam slot variables),
        which is not part of the network being copied.

        Every call adds new assign ops to the graph, so callers that sync
        repeatedly should build the list once and re-run it.

        param: source_scope:  scope name of the network to copy from
        param: dest_scope:    scope name of the network to copy into
        return: list of assign operations, one per variable pair
        """
        source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=source_scope)
        dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope)

        # Variables are paired by position, so both scopes must hold the same
        # number of them.
        assert len(source_vars) == len(dest_vars), \
            "source and destination scopes hold a different number of variables"

        return [dest_var.assign(source_var.value())
                for source_var, dest_var in zip(source_vars, dest_vars)]

In [5]:
def train_dqn(main_dqn, target_dqn, mini_batch):
    """
    Fit `main_dqn` on one batch of transitions using Q-learning targets.

    param: main_dqn:    Network being trained; queried through `predict` and
                        fitted through `update`
    param: target_dqn:  Network whose `predict` output supplies the
                        next-state values used for bootstrapping
    param: mini_batch:  A list of experiences in the form of
                        `(state, action, reward, next_state, done)`
    return: the loss value reported by `main_dqn.update`
    """
    (batch_states, batch_actions, batch_rewards,
     batch_next_states, batch_done) = zip(*mini_batch)

    state_matrix = np.vstack(batch_states)
    next_state_matrix = np.vstack(batch_next_states)

    action_idx = np.array(batch_actions)
    reward_vec = np.array(batch_rewards)
    terminal = np.array(batch_done)

    # Bootstrapped target: the `(1 - terminal)` factor zeroes the next-state
    # term for transitions flagged as done.
    best_next_q = np.max(target_dqn.predict(next_state_matrix), axis=1)
    bellman_targets = reward_vec + gamma * best_next_q * (1 - terminal)

    # Start from the network's own predictions and overwrite only the entry of
    # the action that was taken, so the other entries add nothing to the loss.
    q_targets = main_dqn.predict(state_matrix)
    row_idx = np.arange(len(state_matrix))
    q_targets[row_idx, action_idx] = bellman_targets

    batch_loss, _ = main_dqn.update(state_matrix, q_targets)

    return batch_loss

In [7]:
# Replay memory of transitions; once `buffer_size` is reached the oldest
# entries are dropped automatically.
replay_buffer = deque(maxlen=buffer_size)
# Rewards of the most recent episodes, averaged for the stopping criterion.
last_n_rewards = deque(maxlen=consecutive_successful_episodes_to_stop)

with tf.Session() as sess:
    main_dqn = DQN(session=sess,
                   scope_name="q_main",
                   input_size=num_input_neurons,
                   hidden_layer_sizes=hidden_layer_dimensions,
                   output_size=num_ouptut_neurons,
                   learning_rate=learning_rate)

    target_dqn = DQN(session=sess,
                     scope_name="q_target",
                     input_size=num_input_neurons,
                     hidden_layer_sizes=hidden_layer_dimensions,
                     output_size=num_ouptut_neurons,
                     learning_rate=learning_rate)

    # Build the weight-copy ops a single time. Creating them again on every
    # sync would keep adding nodes to the graph for the whole training run.
    sync_target_ops = DQN.create_copy_operations("q_main", "q_target")

    sess.run(tf.global_variables_initializer())

    # Make them identical to begin with
    sess.run(sync_target_ops)

    for ep_num in range(max_num_episodes):

        state = env.reset()
        done = False
        episode_reward, steps = 0, 0

        # epsilon decay
        epsilon = 1. / ((ep_num / 10) + 1)

        while not done:
            # select the action (epsilon-greedy)
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(main_dqn.predict(state))

            # execute the action
            next_state, reward, done, _ = env.step(action)

            # The step that ends the episode is stored with a reward of -1.
            if done:
                reward = -1

            # add experience to the buffer
            replay_buffer.append((state, action, reward, next_state, done))

            # sample from the buffer and train
            if len(replay_buffer) > mini_batch_size:
                mini_batch = random.sample(replay_buffer, mini_batch_size)
                # Feeds samples to the networks, computes the loss, and updates
                # the weights
                loss = train_dqn(main_dqn, target_dqn, mini_batch)

            # Copy weights every `steps_per_target_update` iterations
            if steps % steps_per_target_update == 0:
                sess.run(sync_target_ops)

            episode_reward += reward
            steps += 1
            state = next_state

        last_n_rewards.append(episode_reward)
        last_n_avg_reward = np.mean(last_n_rewards)

        # Stopping criteria: the reward window is full and its average
        # exceeds the configured threshold.
        if len(last_n_rewards) == consecutive_successful_episodes_to_stop \
                and last_n_avg_reward > min_average_reward_for_stopping:
            print("Solved after {} episodes".format(ep_num))
            break

Solved after 348 episodes
