# Cross Entropy Mountain Car #

## Imports ##

In [58]:
import gym
import tensorflow as tf
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import time

## Environment ##

In [59]:
# Create the Mountain-Car game environment.
# `env` is a module-level object: the training cell drives it through
# env.reset(), env.step(), env.render() and env.close().
env = gym.make('MountainCar-v0')

def compute_modified_reward(next_state):
    """Return a shaped reward derived from the first component of `next_state`.

    The reward is `next_state[0] + 0.5`, clipped below at zero and squared.
    A bonus of 1.0 is added once `next_state[0]` is at least 0.5.
    """
    # presumably the car's horizontal position -- confirm against the env docs
    position = next_state[0]
    shaped = np.square(max(0, position + 0.5))
    reached_threshold = position >= 0.5
    return shaped + 1.0 if reached_threshold else shaped

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


## Hyperparameters ##

The network has to be complex enough, and the batch size has to be large enough.

In [60]:
# Environment parameters
state_size = 2                 # width of the state vector fed to the network
action_size = 3                # number of discrete actions (network outputs)

# Training parameters
max_episodes = 501             # episodes sampled per training iteration
max_steps = 200                # max number of steps in an episode
gamma = 0.99                   # future reward discount (not referenced by the training loop)
percentile = 70                # episodes ranked below this reward percentile are discarded

# Policy network parameters
hidden_layer_size = 128        # number of neurons in the hidden layer
learning_rate = 0.01           # learning rate of the Adam optimizer

# The QNN constructor names this hyperparameter `first_layer_size`; expose it
# under that name as well so the lookup does not raise a NameError.
first_layer_size = hidden_layer_size

## Neural network ##

In [61]:
# first hidden layer 128 neurons

class QNN:
    """Softmax policy network for the cross-entropy method.

    Despite its name, the network does not estimate Q-values: it maps a state
    to one logit per action, and the softmax of those logits is the
    distribution `get_action` samples from. `train` fits the network to
    observed (state, action) pairs by minimising the softmax cross-entropy
    between the logits and the one-hot encoded actions.

    Both `get_action` and `train` run their ops through the module-level
    TensorFlow session `sess`, so they must be called while it is open.

    Parameters
    ----------
    state_size : width of the state input placeholder.
    action_size : number of discrete actions (width of the output layer).
    first_layer_size : number of units in the hidden layer.
    learning_rate : learning rate handed to the Adam optimizer.
    name : variable scope under which the graph is built.
    """

    def __init__(self,
                 state_size = state_size,
                 action_size = action_size,
                 first_layer_size = hidden_layer_size,
                 learning_rate = learning_rate,
                 name = 'qnn'):

        with tf.variable_scope(name):

            # Input layer: a batch of states
            self.state = tf.placeholder(tf.float32,
                                        [None, state_size])

            # Hidden layer, ReLU activation (the fully_connected default)
            self.first_layer = tf.contrib.layers.fully_connected(self.state,
                                                                 first_layer_size)

            # Output layer, linear activation: one logit per action
            self.logits = tf.contrib.layers.fully_connected(self.first_layer,
                                                            action_size,
                                                            activation_fn = None)

            # Action distribution pi(a | s)
            self.probabilities = tf.nn.softmax(self.logits)

            # Actions taken in the training trajectories
            self.actions = tf.placeholder(tf.int32,
                                          [None])

            # One-hot encoded actions, e.g. with action_size = 3:
            #
            # action 0 -> [1, 0, 0]
            # action 1 -> [0, 1, 0]
            # action 2 -> [0, 0, 1]
            self.one_hot_actions = tf.one_hot(self.actions,
                                              action_size)

            # Per-sample cross-entropy between the policy and the taken action
            self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits = self.logits,
                labels = self.one_hot_actions)

            # Cost: mean cross-entropy over the batch
            self.cost = tf.reduce_mean(self.cross_entropy)

            # Optimizer
            self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

    def get_action(self, state):
        """Sample an action index from the policy distribution for `state`."""
        feed_dict = { self.state : np.array([state]) }
        probabilities = sess.run(self.probabilities, feed_dict = feed_dict)[0]

        # The softmax output is float32 and may miss summing to 1 by more than
        # np.random.choice tolerates; renormalise in float64 before sampling.
        probabilities = probabilities.astype(np.float64)
        probabilities /= probabilities.sum()

        # Use the network's own output width rather than the global action_size
        # so instances built with a different action_size sample correctly.
        return np.random.choice(len(probabilities), p = probabilities)

    def train(self, batch):
        """Run one optimizer step on `batch`, a sequence of (state, action) pairs."""
        # zip(*batch) cannot be unpacked for an empty batch; nothing to learn from.
        if len(batch) == 0:
            return

        states, actions = zip(*batch)

        feed_dict = {
            self.state : np.array(states),
            self.actions : np.array(actions)
        }

        sess.run(self.optimizer, feed_dict = feed_dict)
        

## Training ##

In [62]:
# Clear the default graph before building the network.
tf.reset_default_graph()

# `hidden_layer_size` is the name the hyperparameter cell actually defines.
qnn = QNN(name = 'qnn',
          first_layer_size = hidden_layer_size,
          learning_rate = learning_rate)

In [63]:
import random
import bisect
import time

# Mini-batch size for the supervised policy update. The notebook never defined
# it; 512 is a reviewer-chosen value -- TODO tune.
batch_size = 512

# Cross-entropy method: repeatedly sample episodes with the current policy,
# keep the best-rewarded ones and fit the policy to their (state, action) pairs.
with tf.Session() as sess:

    # Initialize variables
    sess.run(tf.global_variables_initializer())

    # Index of the first "elite" episode in the reward-sorted episode list
    start_index = int(max_episodes * percentile / 100)

    while True:

        # Parallel lists kept sorted by ascending total reward
        total_reward_list = []
        trajectory_list = []

        # NOTE(review): episodes are ranked by the raw environment reward;
        # compute_modified_reward is defined in the environment cell but never
        # used -- confirm whether the shaped reward was meant to be used here.
        for episode in range(max_episodes):
            total_reward = 0.0
            trajectory = []
            state = env.reset()
            for step in range(max_steps):
                action = qnn.get_action(state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                trajectory.append((state, action))
                state = next_state
                if done:
                    break

            # Sorted insert so that the best episodes end up at the tail
            index = bisect.bisect(total_reward_list, total_reward)
            total_reward_list.insert(index, total_reward)
            trajectory_list.insert(index, trajectory)

        # Flatten the elite trajectories (everything from start_index on)
        state_action_pairs = [state_action_pair
                              for trajectory in trajectory_list[start_index:]
                              for state_action_pair in trajectory]

        # shuffle to avoid correlations between adjacent states
        random.shuffle(state_action_pairs)
        n = len(state_action_pairs)
        batches = [state_action_pairs[k:k + batch_size] for k in range(0, n, batch_size)]

        for batch in batches:
            qnn.train(batch)

        # test agent
        state = env.reset()
        env.render()
        time.sleep(0.05)
        total_reward = 0.0
        for step in range(max_steps):
            action = qnn.get_action(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            env.render()
            time.sleep(0.05)
            if done:
                break

        env.close()
        print("Total reward:", total_reward)

        # Stop as soon as the test episode scores anything other than -200
        if total_reward != -200:
            break
        