In [None]:
import os
import random
from collections import deque

import gym
import numpy as np
import pylab
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tflearn

%matplotlib inline
from IPython import display

## Task: fill in the blanks in the agent code below

In [None]:
class Model:
    """Fully connected network that maps a state vector to one value per action.

    Construction creates two placeholders -- ``X`` for a batch of states
    (``[None, state_size]``) and ``y`` for a batch of target values
    (``[None, action_size]``) -- and then builds the layers, the loss and the
    training op inside a variable scope named ``name``.
    """

    def __init__(self, name, state_size, action_size, learning_rate=0.001):
        self.name, self.learning_rate = name, learning_rate
        self.state_size, self.action_size = state_size, action_size

        # Network input (states) and regression targets (one value per action).
        self.X = tf.placeholder(tf.float32, [None, state_size])
        self.y = tf.placeholder(tf.float32, [None, action_size])

        self.y_pred, self.optimizer = self._build_model()

    def _build_model(self):
        """Build the network under this model's variable scope.

        Returns:
            tuple: ``(y_pred, optimizer)`` where ``y_pred`` is the output
            tensor with ``action_size`` units and no activation, and
            ``optimizer`` is the Adam op minimising the mean squared error
            between ``self.y`` and ``y_pred``.
        """
        with tf.variable_scope(self.name):
            hidden = self.X
            # Two hidden layers of 20 units each, activation left at slim's default.
            for _ in range(2):
                hidden = slim.fully_connected(hidden, 20)
            outputs = slim.fully_connected(hidden, self.action_size, activation_fn=None)
            mse = tf.losses.mean_squared_error(self.y, outputs)
            train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(mse)
        return outputs, train_op

    def predict_reward(self, sess, state):
        """Run the network on ``state`` and return the first row of its output."""
        predictions = sess.run(self.y_pred, feed_dict={self.X: state})
        return predictions[0]

    def predict_action(self, sess, state):
        """Return the index of the largest value predicted for ``state``."""
        values = self.predict_reward(sess, state)
        return np.argmax(values)

    def train(self, sess, X, y):
        """Run one optimizer step on inputs ``X`` with targets ``y``."""
        feed = {self.X: X, self.y: y}
        sess.run(self.optimizer, feed_dict=feed)
    
class DeepQAgent:
    """Q-learning agent with an online network, a target network and replay memory.

    The agent owns two ``Model`` instances (scopes ``'model'`` and
    ``'target_model'``), a bounded replay buffer, a linearly decaying
    exploration rate and the TensorFlow session used for every graph call.

    Args:
        state_size: Length of one state vector.
        action_size: Number of actions the agent chooses between.
        render: Stored on ``self.render``; not used by the agent itself.
    """

    # Q target written for the taken action of a transition stored with done=True.
    # NOTE(review): if the environment also reports done when a step limit is
    # hit, those transitions receive this value too -- confirm that is intended.
    TERMINAL_TARGET = 1000

    def __init__(self, state_size, action_size, render=True):
        self.render = render
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        # Amount subtracted from epsilon for every stored transition.
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        # No training happens until the memory holds this many transitions.
        self.train_start = 1000

        # replay memory
        self.memory = deque(maxlen=10000)

        self.model = Model('model', state_size, action_size)
        self.target_model = Model('target_model', state_size, action_size)

        # Built once so that syncing the target network never adds ops to the graph.
        self._target_sync_op = self._build_target_sync_op()

        self.sess = tf.InteractiveSession()
        self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)
        self.saver = tf.train.Saver()

    def _build_target_sync_op(self):
        """Create a single op copying the online network's weights into the target network."""
        # The trailing slash keeps the scope filter from matching other scopes
        # that merely start with the same characters.
        online_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.model.name + '/')
        target_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_model.name + '/')
        if len(online_vars) != len(target_vars):
            raise ValueError(
                'online and target networks have different numbers of variables: '
                '{} vs {}'.format(len(online_vars), len(target_vars)))
        # Both networks are built by the same code, so the collections are
        # expected to list corresponding variables in the same order.
        copies = [tf.assign(target_var, online_var)
                  for target_var, online_var in zip(target_vars, online_vars)]
        return tf.group(*copies)

    def update_target_model(self):
        """Copy the current weights of the online network into the target network."""
        self.sess.run(self._target_sync_op)

    def get_action(self, state):
        """Choose an action with the epsilon-greedy policy of the online network.

        With probability ``epsilon`` a uniformly random action index is
        returned, otherwise the index with the largest predicted value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        return self.model.predict_action(self.sess, state)

    def replay_memory(self, state, action, reward, next_state, done):
        """Save <s, a, r, s'> to replay memory and decay epsilon by one step.

        An ``action`` equal to 2 is stored as index 1, so the stored value can
        be used to index the ``action_size``-wide network output.
        """
        if action == 2:
            action = 1

        self.memory.append((state, action, reward, next_state, done))

        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def train_replay(self):
        """Train the online network on a random mini-batch from replay memory.

        Does nothing while the memory holds fewer than ``train_start``
        transitions. For each sampled transition the target equals the online
        network's current prediction, except at the taken action, where it is
        ``TERMINAL_TARGET`` for terminal transitions and
        ``reward + discount_factor * max(Q_target(next_state))`` otherwise.
        """
        if len(self.memory) < self.train_start:
            return

        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        states, actions, rewards, next_states, dones = zip(*mini_batch)
        states = np.reshape(states, (batch_size, self.state_size))
        next_states = np.reshape(next_states, (batch_size, self.state_size))
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # One session call per network for the whole batch instead of two per sample.
        # The copy keeps the fetched array separate from the one being edited below.
        targets = np.array(
            self.sess.run(self.model.y_pred, feed_dict={self.model.X: states}))
        # The bootstrap value comes from the target network, not the online one.
        next_q = self.sess.run(
            self.target_model.y_pred, feed_dict={self.target_model.X: next_states})

        bootstrapped = rewards + self.discount_factor * np.max(next_q, axis=1)
        targets[np.arange(batch_size), actions] = np.where(
            dones, self.TERMINAL_TARGET, bootstrapped)

        self.model.train(self.sess, states, targets)

    def load_model(self):
        """Restore the latest checkpoint from ./checkpoints/, or initialise all variables.

        Any failure to find or restore a checkpoint is reported and followed
        by a fresh variable initialisation; interrupts are not swallowed.
        """
        print('Trying to restore last checkpoint...')
        last_chk_path = None
        failure = None
        try:
            # latest_checkpoint returns None when the directory holds no checkpoint.
            last_chk_path = tf.train.latest_checkpoint(checkpoint_dir="./checkpoints/")
            if last_chk_path is not None:
                self.saver.restore(self.sess, save_path=last_chk_path)
        except Exception as err:
            failure = err

        if last_chk_path is not None and failure is None:
            print('Restored checkpoint from:', last_chk_path)
            return

        print('Failed to restore checkpoint. Initializing variables instead.')
        print('Reason:', failure if failure is not None else 'no checkpoint found')
        self.sess.run(tf.global_variables_initializer())

    def save_model(self, name):
        """Write a checkpoint named ``name`` into the checkpoints directory."""
        print('Saving model...')
        # Make sure the target directory exists before handing the path to the saver.
        os.makedirs('checkpoints', exist_ok=True)
        self.saver.save(self.sess, save_path='checkpoints/' + name)

In [None]:
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]
# The rest of the notebook expects a two-value observation; fail here rather
# than later inside the network if that is not the case.
assert state_size == 2, 'expected a 2-dimensional observation, got {}'.format(state_size)

# The agent chooses between two action indices only; the training loop sends
# them to the environment as actions 0 and 2.
ACTION_SIZE = 2

In [None]:
# Close the session of an agent left over from an earlier run of the notebook.
# On a fresh kernel no agent exists yet, so there is nothing to close.
if 'agent' in globals():
    agent.sess.close()

In [None]:
# Clear the default graph before the agent builds its networks in it.
tf.reset_default_graph()

agent = DeepQAgent(state_size, ACTION_SIZE)
# Restores the latest checkpoint, or initialises the variables if that fails.
agent.load_model()

# Total reward of each finished episode and the matching episode index.
scores = []
episodes = []

N_EPISODES = 200

In [None]:
# Turn rendering off: the training loop only calls env.render() while this flag is set.
agent.render = False

In [None]:
# The agent is asked for a new action once every ACTION_REPEAT environment steps;
# in between, the previous action keeps being applied.
ACTION_REPEAT = 6
# A checkpoint is written whenever the episode index is a multiple of this.
SAVE_EVERY = 50

for e in range(N_EPISODES):
    done = False
    score = 0
    state = np.reshape(env.reset(), [1, state_size])
    # Round the first observation like every later one so that all states
    # stored in replay memory share the same precision.
    state = np.round(state, 3)

    # Action actually sent to the environment. The agent picks index 0 or 1,
    # which are translated to environment actions 0 and 2 so that action 1 is
    # never used. Until the agent's first decision the episode runs with 0.
    fake_action = 0

    # Environment steps taken since the agent last chose an action.
    action_count = 0

    while not done:
        if agent.render:
            env.render()

        # Select an action in the current state and proceed to a step
        action_count = action_count + 1

        if action_count == ACTION_REPEAT:
            action = agent.get_action(state)
            action_count = 0

            if action == 0:
                fake_action = 0
            elif action == 1:
                fake_action = 2

        # Take 1 step with the selected action
        next_state, reward, done, info = env.step(fake_action)
        next_state = np.reshape(next_state, [1, state_size])
        next_state = np.round(next_state, 3)

        # Save <s, a, r, s'> to replay memory (the agent stores action 2 as index 1)
        agent.replay_memory(state, fake_action, reward, next_state, done)
        # Continue to learn every time step
        agent.train_replay()

        score += reward
        state = next_state

        if done:
            # No env.reset() here: the next episode resets the environment itself.
            # Copy the learning model to the target model at the end of each episode
            agent.update_target_model()

            # Record the total reward collected in this episode
            scores.append(score)
            episodes.append(e)
            print('episode:', e, '  score:', score, '  memory length:', len(agent.memory), '  epsilon:', agent.epsilon)

    # Save the model every SAVE_EVERY episodes
    if e % SAVE_EVERY == 0:
        agent.save_model('model')