# Cart Pole - Policy Gradient Method
## Vanilla Policy Gradients
* Borrows heavily from the structure of: [Vanilla Policy Gradient](https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb)
* To tell positive rewards from negative ones, each reward must be associated with some direction of the pole

In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim  # Easy model building, not quite sure yet how to build nodes by hand
import numpy as np
import matplotlib.pyplot as plt
import gym
%matplotlib inline

In [3]:
# Discount base; apply_gamma raises it to the step index to weight rewards.
gamma = 0.99

# Build the CartPole environment and show what its spaces look like.
env = gym.make('CartPole-v0')
print(env.observation_space)  # prints Box(4,): a 4-component observation
print(env.action_space)       # prints Discrete(2): two possible actions

[2017-04-22 22:04:43,472] Making new env: CartPole-v0


Box(4,)
Discrete(2)


In [4]:
class rl_agent(object):
    """Softmax policy network trained with batched policy gradients.

    The constructor builds a TensorFlow graph made of one hidden ReLU layer
    followed by a softmax output layer, a loss of the form
    ``-mean(log(pi(a|s)) * reward)``, and an optimizer step that is fed
    gradients accumulated on the Python side (``grad_buffer``).

    Typical call order: ``set_up_gradient_holder`` once after variable
    initialisation, ``choose_action`` for every environment step,
    ``update_gradients`` at the end of an episode, and
    ``update_batch_gradients`` whenever the accumulated gradients should be
    applied to the weights.
    """

    def __init__(self,
                 environment,
                 state_space_size,
                 action_space_size,         # For CartPole-v0
                 history_size,
                 learning_rate,             # Step size handed to the optimizer
                 epsilon,
                 gradient_descent_function):
        """Build the policy network and its training ops in the default graph.

        Args:
            environment: object whose ``step(action)`` returns
                ``(new_state, reward, done, info)``.
            state_space_size: number of components in one observation.
            action_space_size: number of discrete actions (network outputs).
            history_size: number of units in the hidden layer.
            learning_rate: passed to the optimizer as ``learning_rate``.
            epsilon: stored on the instance; not used when building the graph.
            gradient_descent_function: optimizer class/factory called as
                ``gradient_descent_function(learning_rate=...)``.
        """
        self.environment = environment
        self.history_size = history_size
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.GDFunction = gradient_descent_function

        # Batch of observations, one row per step.
        self.state = tf.placeholder(shape=[None, self.state_space_size], dtype=tf.float32)

        # Hidden layer: ReLU units, no bias term.
        self.hidden_layer = slim.fully_connected(self.state,
                                                 self.history_size,
                                                 biases_initializer=None,
                                                 activation_fn=tf.nn.relu)
        # Output layer: one softmax probability per action, no bias term.
        self.output_layer = slim.fully_connected(self.hidden_layer,
                                                 self.action_space_size,
                                                 biases_initializer=None,
                                                 activation_fn=tf.nn.softmax)
        # Greedy action (highest probability) for each row of the batch.
        self.current_action = tf.argmax(self.output_layer, 1)
        self.reward_tensor = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_tensor = tf.placeholder(shape=[None], dtype=tf.int32)

        # Row indices 0..batch-1 of the output matrix.
        output_range = tf.range(0, tf.shape(self.output_layer)[0])
        # Position of (row, chosen action) inside the flattened output matrix.
        self.indexes = output_range * tf.shape(self.output_layer)[1] + self.action_tensor
        # Probability the network assigned to the action that was actually taken.
        self.output_tensor = tf.gather(tf.reshape(self.output_layer, [-1]), self.indexes)
        # Reward-weighted log-likelihood, averaged over the batch into a scalar.
        self.loss_function = -tf.reduce_mean(tf.log(self.output_tensor) * self.reward_tensor)

        self.trainable_variables = tf.trainable_variables()

        # One placeholder per trainable variable; they receive the gradients
        # accumulated in grad_buffer when the optimizer step is run.
        self.gradient_holders = []
        for idx, var in enumerate(self.trainable_variables):
            temp = tf.placeholder(tf.float32, name=str(idx) + '_holder')
            self.gradient_holders.append(temp)

        self.gradients = tf.gradients(self.loss_function, self.trainable_variables)

        optimizer = self.GDFunction(learning_rate=self.learning_rate)
        self.updated_weights = optimizer.apply_gradients(
            zip(self.gradient_holders, self.trainable_variables))

    @staticmethod
    def _sample_action(probabilities):
        """Draw an action index from a 1-D vector of action probabilities."""
        # Sample the index itself. Sampling a probability value and searching
        # for it afterwards always resolves ties to the first matching action.
        return int(np.random.choice(len(probabilities), p=probabilities))

    def set_up_gradient_holder(self, session):
        """Create ``grad_buffer``: one zero array per trainable variable.

        The current variable values are fetched only to learn their shapes.
        """
        self.grad_buffer = [np.zeros_like(value)
                            for value in session.run(self.trainable_variables)]

    def update_gradients(self, session, feed_dict):
        """Evaluate the loss gradients for ``feed_dict`` and accumulate them.

        Note the subtraction: ``grad_buffer`` holds the *negated* sum of the
        loss gradients, and that is what is later fed to the optimizer.
        """
        tempgradients = session.run(self.gradients, feed_dict=feed_dict)
        for index, gradient in enumerate(tempgradients):
            self.grad_buffer[index] -= gradient

    def update_batch_gradients(self, session):
        """Apply the accumulated gradients to the weights, then zero the buffer."""
        feed_dict = dict(zip(self.gradient_holders, self.grad_buffer))
        session.run(self.updated_weights, feed_dict=feed_dict)
        self.grad_buffer = [np.zeros_like(gradient) for gradient in self.grad_buffer]

    def choose_action(self, session, s):
        """Sample an action for state ``s`` and take one environment step.

        Returns:
            A pair ``([s, action, new_state, reward], done)`` where ``done`` is
            the termination flag reported by the environment.
        """
        # Probabilistically pick an action given our network outputs.
        action_probabilities = session.run(self.output_layer, feed_dict={self.state: [s]})
        action = self._sample_action(action_probabilities[0])
        new_state, reward, done, info = self.environment.step(action)
        return [s, action, new_state, reward], done
            

In [5]:
def apply_gamma(list_of_rewards, discount=None):
    """Weight each reward by the discount factor raised to its step index.

    Element ``t`` of the result is ``discount ** t * list_of_rewards[t]``.
    This is a per-step weighting only; rewards are not summed into a
    cumulative (reward-to-go) return.

    Args:
        list_of_rewards: sequence of per-step rewards, in time order.
        discount: discount factor; when omitted, the module-level ``gamma``
            is used.

    Returns:
        NumPy array of the weighted rewards (empty for empty input).
    """
    if discount is None:
        discount = gamma
    steps = np.arange(len(list_of_rewards))
    gammas = np.power(discount, steps)
    return np.multiply(gammas, list_of_rewards)
    

In [None]:
# Clear the default graph before building the agent's network.
tf.reset_default_graph()
repeats = 999      # upper bound on the number of steps played in one game
max_games = 5000   # total number of games to play
agent = rl_agent(
    environment=env,
    state_space_size=4,
    action_space_size=2,
    history_size=8,
    learning_rate=0.01,
    epsilon=1e-8,
    gradient_descent_function=tf.train.AdamOptimizer)

init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    i = 0
    rewards = []            # total reward of every finished game
    time_above_ground = []  # index of the last step of every finished game
    agent.set_up_gradient_holder(session)
    while i < max_games:
        state = env.reset()
        reward_for_game = 0
        history = []  # one [state, action, new_state, reward] entry per step
        # range works on both Python 2 and 3 (xrange is Python 2 only).
        for j in range(repeats):
            history_element, done = agent.choose_action(session, state)
            history.append(history_element)
            state = history_element[-2]
            reward_for_game += history_element[-1]
            if done:
                # Split the per-step records into columns. This avoids packing
                # arrays and scalars into one ragged NumPy array.
                states, actions, next_states, step_rewards = zip(*history)
                feed_dict = {agent.reward_tensor: apply_gamma(step_rewards),
                             agent.action_tensor: actions,
                             agent.state: np.vstack(states)}
                agent.update_gradients(session, feed_dict)
                # Apply the accumulated gradients on every fifth game.
                if i % 5 == 0 and i != 0:
                    agent.update_batch_gradients(session)
                rewards.append(reward_for_game)
                time_above_ground.append(j)
                break
        # A game that is not done within `repeats` steps contributes no
        # gradient and no entry in `rewards`.
        if i % 100 == 0:
            # Mean total reward over the (up to) 100 most recent finished games.
            print(np.mean(rewards[-100:]))
        i += 1

25.0
24.67
35.93
40.8
50.63
54.08
68.49
72.35
81.17
115.07
128.46
151.35
150.37
153.82
152.45
147.16
145.61
159.52
165.11
180.95
187.16
180.48
185.74
183.32
185.3
