## Imports

In [1]:
import bisect
import random
import time

import gym
import numpy as np
import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


## Environment

In [2]:
# Instantiate the Gym Mountain Car task. The resulting `env` object is the
# one the rollouts later in the notebook reset, step and render.
env = gym.make("MountainCar-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


## Hyperparameters

In [3]:
# --- Problem dimensions ------------------------------------------------------
state_size = 2    # length of one observation vector fed to the network
action_size = 3   # number of discrete actions the policy chooses between

# --- Network and optimiser ---------------------------------------------------
hidden_layer_size = 128   # units in the single hidden layer
learning_rate = 0.01      # step size handed to the Adam optimizer
batch_size = 25           # state-action pairs consumed per training step

# --- Sampling schedule -------------------------------------------------------
max_episodes = 100   # episodes sampled before each training pass
max_steps = 200      # upper bound on steps taken within one episode
# Episodes are ranked by total reward; only those from index
# int(max_episodes * percentile / 100) upwards (the best ones) are trained on.
percentile = 70

## Neural network

In [4]:
class Net:
    """Softmax policy network with one hidden layer (TensorFlow 1.x graph mode).

    The graph has two parts:

    * prediction: `states` -> hidden layer -> `logits` -> `probabilities`
    * training:   `actions` -> one-hot labels -> mean softmax cross-entropy
                  against `logits`, minimised with Adam.

    Training therefore pushes the policy towards reproducing the actions it
    is shown for the given states.
    """

    def __init__(self, 
                 state_size = state_size, 
                 action_size = action_size, 
                 hidden_layer_size = hidden_layer_size,
                 learning_rate = learning_rate, 
                 name = 'net'):
        """Build the prediction and training ops under variable scope `name`.

        Args:
            state_size: length of one state vector (width of the input layer).
            action_size: number of discrete actions (width of the output layer).
            hidden_layer_size: number of units in the hidden layer.
            learning_rate: learning rate passed to the Adam optimizer.
            name: variable scope under which all ops/variables are created.
        """
        # Keep the action count of *this* network so that sampling in
        # `get_action` matches the width of the output layer even when the
        # instance was built with a non-default `action_size`.
        self.action_size = action_size

        with tf.variable_scope(name):
        
            ### Prediction part
        
            # Input layer: a batch of states, one row per state
            self.states = tf.placeholder(
                tf.float32, 
                [None, state_size])
            
            # Hidden layer (fully_connected's default activation, ReLU)
            self.hidden_layer = tf.contrib.layers.fully_connected(
                self.states, 
                hidden_layer_size)
            
            # Output layer, linear activation: one unnormalised logit per action
            self.logits = tf.contrib.layers.fully_connected(
                self.hidden_layer, 
                action_size,
                activation_fn = None)
            
            # Softmax over the logits yields a probability distribution over actions
            self.probabilities = tf.nn.softmax(self.logits)
    
            ### Training part 
    
            # Actions to imitate, one integer action index per state in the batch
            self.actions = tf.placeholder(
                tf.int32, 
                [None])
            
            # One-hot encoded actions, `action_size` entries per row.
            #
            # e.g. with action_size = 3:
            #   action 0 -> [1, 0, 0]
            #   action 2 -> [0, 0, 1]
            self.one_hot_actions = tf.one_hot(
                self.actions, 
                action_size)

            # Per-sample cross entropy between the policy and the given actions
            self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits = self.logits, 
                labels = self.one_hot_actions)
            
            # Scalar cost: mean cross entropy over the batch
            self.cost = tf.reduce_mean(self.cross_entropy)
            
            # Training op: one Adam step on the cost
            self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)

    @staticmethod
    def _resolve_session(session = None):
        """Pick the TensorFlow session to run ops in.

        Order of preference: the explicitly passed `session`, then the
        default session (the one installed by `with tf.Session() ...:`), and
        finally the notebook-level global `sess` for backward compatibility.
        """
        if session is not None:
            return session
        default_session = tf.get_default_session()
        if default_session is not None:
            return default_session
        # Legacy fallback: older cells rely on a global named `sess`.
        # Raises NameError if no such global exists, as the original code did.
        return sess
            
    def get_action(self, state, session = None):
        """Sample an action index from the current stochastic policy.

        Args:
            state: a single state vector (not a batch).
            session: optional session to run the graph in; see
                `_resolve_session` for the fallback order.

        Returns:
            An integer in [0, action_size) drawn according to the network's
            softmax output for `state`.
        """
        session = self._resolve_session(session)
        # The graph expects a batch, so wrap the single state in a batch of one.
        feed_dict = { self.states : np.array([state]) } 
        probabilities = session.run(self.probabilities, feed_dict = feed_dict)
        
        return np.random.choice(self.action_size, p = probabilities[0])
    
    def train(self, batch, session = None):
        """Run one optimizer step on a batch of (state, action) pairs.

        Args:
            batch: sequence of `(state, action)` tuples. An empty batch is a
                no-op.
            session: optional session to run the graph in; see
                `_resolve_session` for the fallback order.
        """
        # Nothing to learn from; also avoids the unpacking error that
        # `zip(*batch)` would raise on an empty sequence.
        if len(batch) == 0:
            return

        session = self._resolve_session(session)

        states, actions = zip(*batch)
        states = np.array(states)
        actions = np.array(actions)
        
        feed_dict = {
            self.states : states,
            self.actions : actions
        }
        
        session.run(self.optimizer, feed_dict = feed_dict)

## Training

In [26]:
# Start from an empty graph so re-running this cell does not collide with
# variables created by a previous run, then build the policy network.
tf.reset_default_graph()
net = Net(name = 'net',
          hidden_layer_size = hidden_layer_size,
          learning_rate = learning_rate)


def shaped_reward(state):
    """Return the per-step score used to rank and report episodes.

    The environment's own reward is ignored; instead each step is scored as
    |state[0]| + 10 * |state[1]| - 1, where state[0] is the position and
    state[1] the second observation component (velocity in MountainCar-v0).
    """
    return abs(state[0]) + abs(state[1] * 10) - 1


def run_episode(env, net, max_steps, render = False):
    """Roll out one episode with actions sampled from `net`.

    Args:
        env: Gym environment to reset and step.
        net: policy exposing `get_action(state)`.
        max_steps: maximum number of steps to take.
        render: if True, draw the environment after reset and after each step.

    Returns:
        A tuple `(total_reward, trajectory, last_step, flag_captured, visited)`:
        total_reward: sum of `shaped_reward` over the states acted from.
        trajectory: list of `(state, action)` pairs, in order.
        last_step: 0-based index of the last step taken.
        flag_captured: True if the episode reported `done` before the final
            allowed step, i.e. it ended for a reason other than running out
            of steps.
        visited: distinct values of int(position * 10) reached, in order of
            first visit.
    """
    trajectory = []
    visited = []
    total_reward = 0.0
    flag_captured = False
    last_step = -1

    state = env.reset()
    if render:
        env.render()

    for step in range(max_steps):
        last_step = step
        action = net.get_action(state)
        # The environment's reward is deliberately discarded (see shaped_reward).
        next_state, _env_reward, done, _info = env.step(action)

        # Score and record the state the action was taken *from*, so the
        # score lines up with the (state, action) pairs used for training.
        total_reward += shaped_reward(state)
        trajectory.append((state, action))

        # Coarse position bucket; * 10 gives ten buckets per unit of position.
        position = int(next_state[0] * 10)
        if position not in visited:
            visited.append(position)

        state = next_state
        if render:
            env.render()
        if done:
            flag_captured = step < max_steps - 1
            break

    return total_reward, trajectory, last_step, flag_captured, visited


with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    # Episodes are kept sorted by ascending reward; everything from this
    # index onward is the elite set that the network is trained on.
    start_index = int(max_episodes * percentile / 100)

    # Repeat sample -> select elite -> train -> evaluate until an evaluation
    # episode finishes early (or the cell is interrupted).
    while True:

        total_reward_list = []
        trajectory_list = []

        # Sample a population of episodes, inserting each one so that both
        # lists stay sorted by total reward.
        for _episode in range(max_episodes):
            total_reward, trajectory, _last, _captured, _visited = run_episode(
                env, net, max_steps)

            index = bisect.bisect(total_reward_list, total_reward)
            total_reward_list.insert(index, total_reward)
            trajectory_list.insert(index, trajectory)

        # Keep the elite episodes, that is, throw out the bad ones, and
        # flatten them into one list of (state, action) pairs.
        state_action_pairs = [
            state_action_pair
            for trajectory in trajectory_list[start_index:]
            for state_action_pair in trajectory
        ]
        # Shuffle to avoid correlations between adjacent states
        random.shuffle(state_action_pairs)

        for k in range(0, len(state_action_pairs), batch_size):
            net.train(state_action_pairs[k:k + batch_size])

        # Test agent: one rendered episode. The viewer is closed even if the
        # rollout is interrupted, so no render window is left behind.
        try:
            total_reward, _trajectory, s, flagCaptured, beenHere = run_episode(
                env, net, max_steps, render = True)
        finally:
            env.close()

        print("Total reward:", total_reward)

        if flagCaptured:
            print("flag captured")
            print(s)
            break
        else:
            print(beenHere)

Total reward: -70.9612719090086
[-4, -5, -6, -3, -7, -2]
Total reward: -79.96719031356223
[-4, -5, -6]
Total reward: -92.4528982466886
[-5, -4]
Total reward: -86.58178991590161
[-4, -5, -6, -3]
Total reward: -89.94591679990641
[-5, -4]
Total reward: -87.2551698457722
[-5, -4]
Total reward: -83.84203792506888
[-5, -4, -6]
Total reward: -76.13703829930783
[-5, -6, -4]
Total reward: -82.14178069678485
[-5, -6, -4]
Total reward: -82.61028613237418
[-5, -4, -6]
Total reward: -68.33161569608025
[-5, -6, -4, -3, -7]
Total reward: -84.98128536765427
[-5]
Total reward: -79.56593656951215
[-5, -6, -4]
Total reward: -74.6839769413467
[-4, -5, -6]
Total reward: -78.57074284170127
[-5, -6, -4]
Total reward: -76.44526363779507
[-5, -6, -4, -7]
Total reward: -82.09869898727241
[-5, -6]
Total reward: -53.89338893339351
[-5, -6, -4, -3, -7, -8, -2, -1, -9]
Total reward: -69.02376476794123
[-4, -5, -6, -7, -3]
Total reward: -69.75473959808511
[-4, -5, -6, -7]
Total reward: -82.67732937033085
[-5, -6]
To

KeyboardInterrupt: 