In [17]:
%matplotlib inline

import gym

import itertools
import numpy as np
import random
import tensorflow as tf
import os
import sys

from gym.wrappers import Monitor
from collections import deque, namedtuple
from lib.plotting import EpisodeStats

In [18]:
ENV = gym.envs.make("Breakout-v0")

In [19]:
class StateProcessor:
    TF_SCOPE_NAME = 'state_processor'
    
    def __init__(self):
        with tf.variable_scope(StateProcessor.TF_SCOPE_NAME):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)
            self.output = tf.image.resize_images(self.output, [84, 84], tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)
    
    def process(self, session, state):
        return session.run(self.output, {self.input_state: state})

In [52]:
class Estimator:
    def __init__(self, valid_actions, scope='estimator', summaries_dir=None):
        self.valid_actions = valid_actions
        self.scope = scope
        self.summary_writer = None
        with tf.variable_scope(scope):
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, 'summaries_{}'.format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name='X')
        self.Y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name='Y')
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name='actions')

        X = tf.to_float(self.X_pl) / 255.
        batch_size = tf.shape(self.X_pl)[0]

        conv1 = tf.contrib.layers.conv2d(X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(conv2, 64, 3, 1, activation_fn=tf.nn.relu)
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)

        self.predictions = tf.contrib.layers.fully_connected(fc1, self.valid_actions.n)
        
        gather_indices = tf.range(batch_size) * self.valid_actions.n + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        self.losses = tf.squared_difference(self.Y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_optimization = self.optimizer.minimize(self.loss, global_step=tf.train.get_global_step())

        self.summaries = tf.summary.merge([
            tf.summary.scalar('loss', self.loss),
            tf.summary.histogram('loss_hist', self.losses),
            tf.summary.histogram('q_values_hist', self.predictions),
            tf.summary.scalar('max_q_value', tf.reduce_max(self.predictions))
        ])

    def predict(self, session, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 4, 84, 84]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 
          action values.
        """
        return session.run(self.predictions, {self.X_pl: s})

    def update(self, session, s, a, targets):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 4, 84, 84]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = {self.X_pl: s, self.Y_pl: targets, self.actions_pl: a}
        summaries, global_step, _, loss = session.run(
            [self.summaries, tf.train.get_global_step(), self.train_optimization, self.loss],
            feed_dict
        )
        
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        
        return loss

In [53]:
tf.reset_default_graph()
global_step = tf.Variable(0, name='global_step', trainable=False)

e = Estimator(ENV.action_space, scope='test')
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    s = ENV.reset()
    s_p = sp.process(sess, s)
    s = np.stack([s_p] * 4, axis=2)
    s = np.array([s] * 2)
    
    print(e.predict(sess, s))
    
    targets = np.array([10., 10.])
    a = np.array([1, 3])
    print(e.update(sess, s, a, targets))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[0.         0.00090302 0.         0.02340581]
 [0.         0.00090302 0.         0.02340581]]
99.75719


In [54]:
def copy_model_parameters(session, source, target):
    params_1 = [v for v in tf.trainable_variables() if v.name.startwith(source.scope)]
    params_1 = sorted(params_1, key=lambda v: v.name)
    params_2 = [v for v in tf.trainable_variables() if v.name.startwith(target.scope)]
    params_2 = sorted(params_2, key=lambda v: v.name)
    
    update_operations = []
    for v1, v2 in zip(params_1, params_2):
        update_operations.append(v2.assign(v1))
        
    session.run(update_operations)

In [55]:
def make_epsilon_greedy_policy(estimator, nA):
    def policy_function(session, s, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(session, np.expand_dims(s, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_function

In [None]:
def dqn(session, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir,
        replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000,
        discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32,
        record_video_every=50):
    Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])

    replay_memory = []

    stats = EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)
    )

    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.train.get_global_step())
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    policy = make_epsilon_greedy_policy(q_estimator, env.action_space.n)

    print("Populating replay memory...")
    s = env.reset()
    s = state_processor.process(sess, s)
    s = np.stack([s] * 4, axis=2)
    for i in range(replay_memory_init_size):
        a_probs = policy(sess, s, epsilons[0])
        a = np.random.choice(np.arange(len(a_probs)), p=a_probs)
        s_prime, reward, done, _ = env.step(a)
        s_prime = state_processor.process(sess, s_prime)
        s_prime = np.stack([s, s, s, s_prime], axis=2)
        replay_memory.append(Transition(state=s, action=a, reward=reward, next_state=s_prime, done=done))
    
    env = Monitor(env, directory=monitor_path, resume=True, 
                  video_callable=lambda count: count % record_video_every == 0)
    
    for i_episode in range(num_episodes):
        saver.save(tf.get_default_session(), checkpoint_path)
        
        s = env.reset()
        s = state_processor.process(sess, s)
        s = np.stack([s] * 4, axis=2)
        loss = None
        
        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag='epsilon')
            
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                
            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()
            
            a_probs = policy(sess, s, epsilon)
            a = np.random.choice(np.arange(len(a_probs)), p=a_probs)
            s_prime, reward, done, _ = env.step(a)
            s_prime = state_processor.process(sess, s_prime)
            s_prime = np.stack([s[1], s[2], s[3], s_prime], axis=2)
            
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)
            replay_memory.append(Transition(state=s, action=a, reward=reward, next_state=s_prime, done=done))
            
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            transition_sample = np.random.choice(replay_memory, batch_size)
            state_batch = np.array([t.state for t in transition_sample])
            action_batch = np.array([t.action for t in transition_sample])
            targets = target_estimator.predict(sess, state_batch)
            
            for i in range(len(targets)):
                if transition_sample[i].done:
                    targets[i] = reward
                else:
                    targets[i] = reward + discount_factor * targets[i]
            
            q_estimator.update(sess, state_batch, action_batch, targets)
            
            if done:
                break
            
            s = s_prime
            total_t += 1
