In [1]:
import os
import gym
import tqdm
import pylab
import random
import numpy as np
import tensorflow as tf

from collections import deque
from tensorflow.contrib import slim

  return f(*args, **kwds)


## Task: fill in the blanks in the following agent code

In [2]:
class IFC:
    """Small fully connected regression network built in the default TF graph.

    The network has two hidden layers of 25 units followed by a linear output
    layer, and is trained with Adam on a mean-squared-error loss.

    Args:
        name: Variable scope under which the layers and optimizer are created.
        inputs: Width of one input row.
        outputs: Width of one output row.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, name, inputs, outputs, lr):
        self.name, self.inputs, self.outputs, self.lr = name, inputs, outputs, lr

        self._build()

    def fit(self, sess, x, y):
        """Run one optimizer step on inputs ``x`` against targets ``y``."""
        feed = {self.x: x, self.y: y}
        sess.run(self.optimizer, feed_dict=feed)

    def predict(self, sess, x):
        """Return the network output for inputs ``x`` evaluated in ``sess``."""
        feed = {self.x: x}
        return sess.run(self.y_pred, feed_dict=feed)

    def _build(self):
        """Create placeholders, layers, loss and the training op."""
        # Placeholders live outside the variable scope; the batch dimension is free.
        self.x = tf.placeholder(tf.float32, [None, self.inputs])
        self.y = tf.placeholder(tf.float32, [None, self.outputs])
        with tf.variable_scope(self.name):
            hidden = self.x
            # Two hidden layers of 25 units each (slim's default activation).
            for _ in range(2):
                hidden = slim.fully_connected(hidden, 25)
            # Linear output layer: one value per output column.
            self.y_pred = slim.fully_connected(hidden, self.outputs, activation_fn=None)
            mse = tf.losses.mean_squared_error(self.y, self.y_pred)
            self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(mse)

In [3]:
class DeepQAgent:
    """Deep Q-learning agent with a replay buffer and a separate target network.

    The agent owns two identically shaped ``IFC`` networks: ``model`` (trained
    on every step) and ``target_model`` (used to compute bootstrap targets and
    refreshed from ``model`` by ``update_target_model``). Actions are selected
    epsilon-greedily, with epsilon decaying linearly as transitions are stored.

    Args:
        state_size: Number of features in one state row.
        action_size: Number of actions the agent can choose from.
        render: Flag read by the training loop to decide whether to render.
    """

    def __init__(self, state_size, action_size, render=False):
        self.state_size = state_size
        self.action_size = action_size
        self.render = render

        # Hyper-parameters
        self.discount_factor = 0.99
        self.lr = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        # Linear decay: epsilon reaches epsilon_min after 50000 stored transitions.
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        # No training happens until the buffer holds this many transitions.
        self.train_start = 1000
        self.memory = deque(maxlen=10000)

        # Models
        # NOTE(review): both networks are added to the current default graph.
        # Building a second agent in the same graph will probably clash on the
        # 'model' / 'target_model' scopes; call tf.reset_default_graph() first.
        self.model = IFC('model', state_size, action_size, self.lr)
        self.target_model = IFC('target_model', state_size, action_size, self.lr)
        # Built once so that syncing the target network never adds graph ops.
        self._target_sync_op = self._build_target_sync_op()

        # Default TF pack
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)
        self.saver = tf.train.Saver()

    @staticmethod
    def _build_target_sync_op():
        """Return a single op that copies every 'model/' variable to 'target_model/'."""
        # The trailing slash makes the prefix match exact: tf.get_collection
        # filters names with re.match, so a bare 'model' would also match any
        # other scope whose name merely starts with 'model'.
        model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'model/')
        tmodel_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'target_model/')
        # Both networks are built identically, so the two lists pair up by position.
        assigns = [tf.assign(dst, src) for src, dst in zip(model_vars, tmodel_vars)]
        return tf.group(*assigns)

    def update_target_model(self):
        """Copy the current weights of ``model`` into ``target_model``."""
        self.sess.run(self._target_sync_op)

    def get_action(self, state):
        """Pick an action index for ``state`` using the epsilon-greedy policy.

        Args:
            state: Batch containing a single state row; only the first row of
                the prediction is used.

        Returns:
            int: A random action index with probability ``epsilon``, otherwise
            the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(self.sess, state)[0]
        # Plain int so both branches return the same type.
        return int(np.argmax(q_values))

    def replay_memory(self, state, action, reward, next_state, done):
        """Store the transition <s, a, r, s', done> and decay epsilon one step.

        ``action`` may be given as the environment action id: id 2 is stored as
        agent action index 1; every other value is stored unchanged.
        """
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def train_replay(self):
        """Train ``model`` on one random mini-batch drawn from the replay buffer.

        Does nothing while the buffer is empty or holds fewer than
        ``train_start`` transitions. For every sampled transition the target of
        the taken action is ``reward`` when the episode ended, otherwise
        ``reward + discount_factor * max_a Q_target(next_state, a)``; the
        targets of the other actions are the model's own predictions.
        """
        if not self.memory or len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        batch_shape = [batch_size, self.state_size]
        states = np.reshape([t[0] for t in mini_batch], batch_shape)
        actions = np.array([t[1] for t in mini_batch], dtype=int)
        rewards = np.array([t[2] for t in mini_batch], dtype=np.float64)
        next_states = np.reshape([t[3] for t in mini_batch], batch_shape)
        dones = np.array([t[4] for t in mini_batch], dtype=bool)

        # Two batched forward passes instead of two session calls per sample.
        targets = np.array(self.model.predict(self.sess, states), dtype=np.float64)
        next_q_max = np.max(self.target_model.predict(self.sess, next_states), axis=1)

        # Terminal transitions have no bootstrap term.
        taken_action_targets = np.where(
            dones, rewards, rewards + self.discount_factor * next_q_max)
        targets[np.arange(batch_size), actions] = taken_action_targets

        self.model.fit(self.sess, states, targets)

    def load_model(self, base='./checkpoints', model=None):
        """Restore weights from a checkpoint, or start from fresh weights.

        Args:
            base: Directory that holds the checkpoints.
            model: Checkpoint name inside ``base``; when ``None`` the latest
                checkpoint found in ``base`` is used.

        Returns:
            bool: ``True`` if a checkpoint was restored, ``False`` if none was
            found or restoring failed and all variables were re-initialized.
        """
        if model is not None:
            last_chk_path = os.path.join(base, model)
        else:
            # Returns None when the directory holds no checkpoint.
            last_chk_path = tf.train.latest_checkpoint(checkpoint_dir=base)

        if last_chk_path is not None:
            try:
                self.saver.restore(self.sess, save_path=last_chk_path)
                return True
            except (ValueError, tf.errors.OpError) as err:
                # Only checkpoint problems are handled here; anything else
                # (including KeyboardInterrupt) propagates to the caller.
                print('Could not restore checkpoint %r (%s); '
                      'starting from fresh weights.' % (last_chk_path, err))

        # Best-effort fallback: a failed restore must not leave variables in a
        # half-restored state.
        self.sess.run(tf.global_variables_initializer())
        return False

    def save_model(self, base='./checkpoints', model='model'):
        """Write a checkpoint named ``model`` into ``base``, creating it if needed.

        Returns:
            Whatever the saver's ``save`` call returns.
        """
        # The saver refuses to write into a directory that does not exist.
        os.makedirs(base, exist_ok=True)
        return self.saver.save(self.sess, save_path=os.path.join(base, model))

In [4]:
# Create the MountainCar environment and derive the network input/output sizes.
env = gym.make('MountainCar-v0')
# Number of components in one observation vector.
state_size = env.observation_space.shape[0]
# The agent chooses between only two actions; the training loop maps agent
# action 0 to environment action 0 and agent action 1 to environment action 2.
action_size = 2  # hard-coded on purpose, not read from env.action_space
state_size, action_size

(2, 2)

In [5]:
# Build the agent, then try to resume from the most recent checkpoint.
agent = DeepQAgent(state_size, action_size)
agent.load_model()

# Per-episode bookkeeping filled in by the training loop.
scores = []
episodes = []

# Number of episodes to train for.
N_EPISODES = 200

INFO:tensorflow:Restoring parameters from ./checkpoints/model


In [6]:
# Environment action executed for each agent action index. MountainCar's
# "do nothing" action (1) is deliberately skipped, so the agent only ever
# pushes left (0) or right (2).
ENV_ACTION_FOR = {0: 0, 1: 2}
ACTION_REPEAT = 10  # environment steps for which a selected action is held
LOG_EVERY = 5       # episodes between progress print-outs
SAVE_EVERY = 20     # episodes between checkpoints

for e in range(N_EPISODES):
    done = False
    score = 0
    state = np.reshape(env.reset(), [1, state_size])

    # Environment action currently being repeated; set on the first step.
    fake_action = None
    # Steps taken since the action was last (re)selected.
    action_count = 0

    while not done:
        if agent.render:
            env.render()

        # Ask the agent for an action on the very first step and then once
        # every ACTION_REPEAT steps; in between, the previous action is held.
        if action_count % ACTION_REPEAT == 0:
            fake_action = ENV_ACTION_FOR[agent.get_action(state)]
            action_count = 0
        action_count += 1

        # Take 1 step with the selected action
        next_state, reward, done, info = env.step(fake_action)
        next_state = np.reshape(next_state, [1, state_size])

        # Save <s, a, r, s'> to replay memory (the agent converts the
        # environment action id back to its own action index).
        agent.replay_memory(state, fake_action, reward, next_state, done)
        # Continue to learn every time step
        agent.train_replay()
        score += reward
        state = next_state

    # Episode finished. The environment is reset at the top of the next
    # iteration, so no extra reset is needed here.
    # Copy the learning model to the target model once per episode.
    agent.update_target_model()

    # Record the total reward collected in this episode.
    scores.append(score)
    episodes.append(e)
    if e % LOG_EVERY == 0:
        print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
              "  epsilon:", agent.epsilon)

    # Checkpoint every SAVE_EVERY episodes, and after the final episode so the
    # last weights are not lost.
    if e % SAVE_EVERY == 0 or e == N_EPISODES - 1:
        agent.save_model()

episode: 0   score: -200.0   memory length: 200   epsilon: 0.9960200000000077
episode: 5   score: -200.0   memory length: 1177   epsilon: 0.9765777000000452
episode: 10   score: -200.0   memory length: 2122   epsilon: 0.9577722000000815
episode: 15   score: -200.0   memory length: 3075   epsilon: 0.938807500000118
episode: 20   score: -200.0   memory length: 4075   epsilon: 0.9189075000001564
episode: 25   score: -200.0   memory length: 5073   epsilon: 0.8990473000001947
episode: 30   score: -200.0   memory length: 6073   epsilon: 0.8791473000002331
episode: 35   score: -200.0   memory length: 7020   epsilon: 0.8603020000002695
episode: 40   score: -200.0   memory length: 7974   epsilon: 0.8413174000003061
episode: 45   score: -200.0   memory length: 8857   epsilon: 0.82374570000034
episode: 50   score: -200.0   memory length: 9857   epsilon: 0.8038457000003784
episode: 55   score: -200.0   memory length: 10000   epsilon: 0.7846223000004154
episode: 60   score: -200.0   memory length: 

In [7]:
# Close the TensorFlow session owned by the agent now that training is done.
agent.sess.close()