# Breakout - Policy Gradient Method w/ Custom Loss

## Policy Gradients Vanilla type
* Borrowed heavily from structure of: [Vanilla Policy Gradient](https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb)
* In order to determine positive and negative rewards we must associate it with some direction of the pole

In [1]:
import tensorflow as tf
from keras.utils import np_utils, to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, convolutional, pooling, Flatten, Dropout, Input, BatchNormalization
from keras.optimizers import Adam
from keras import backend as Bk
from scipy.misc import imread, imshow
import math
import numpy as np
import matplotlib.pyplot as plt
import gym
%matplotlib inline

Using TensorFlow backend.


In [2]:
env = gym.make('Breakout-v0')
print(env.observation_space)  # 210v x 160w x 3color Box (sparse tensor!!!)
print(env.action_space)       # 6x1 Discrete
gamma = 0.99
try: 
    xrange(1)
except:
    xrange = range

[2017-05-30 22:34:26,868] Making new env: Breakout-v0


Box(210, 160, 3)
Discrete(6)


In [13]:
class rl_agent():
    def __init__(self,  
                 environment, 
                 hidden_units = 1024,
                 epsilon = 1e-8,                  # Error - error for types of gradient descent or random choice
                 learning_rate = .01,
                 decay_rate = 1e-7,
                 random_choice_threshold = -2,
                 optimizer = Adam,
                 dropout_rate = 0.5,
                 frames = 3,
                 input_dim = (80,80,1),
                 output_dim = 6,
                 weight_iteration = 1
                ):
        self.environment = environment
        self.hidden_units = hidden_units # hidden neurons
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.optimizer = optimizer(lr=learning_rate, decay=decay_rate)
        self.dropout_rate = dropout_rate
        self.frames = frames
        self.input_dim = input_dim
        self.render = False
        self.prev_processed_state = None
        self.random_choice_threshold = random_choice_threshold
        self.output_dim = output_dim
        self.weight_i = weight_iteration
        self.__define_model(self.output_dim)
        self.__build_train_fn() # creates self.train_fn()
        
    def __define_model(self, output_dim):
        # Keras vars
        Bk.set_learning_phase(1) #set learning phase
        self.model = Sequential()
        # Default is channels last
        self.model.add(convolutional.Conv2D(filters=32, kernel_size = (2,2), strides=(2,2), padding='same', input_shape = (80,80, 1)))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))
        self.model.add(convolutional.Conv2D(filters=64, kernel_size = (2,2), strides=(2,2), padding='valid'))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))
        self.model.add(convolutional.Conv2D(filters=128, kernel_size = (3,3), strides=(2,2), padding='valid'))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))
        self.model.add(convolutional.Conv2D(filters=256, kernel_size = (3,3), strides=(2,2), padding='valid'))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))
        print(self.model.layers[-1].input_shape)
        print(self.model.layers[-1].output_shape)
        self.model.add(Flatten())
        self.model.add(Dense(self.hidden_units//2))   # Karpathy suggested N=200 in the hidden layer
        self.model.add(Dropout(self.dropout_rate))      # But since we are doing dropout to avoid overfitting... increase
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))
        self.model.add(Dense(self.hidden_units//2))   # Karpathy suggested N=200 in the hidden layer
        self.model.add(Dropout(self.dropout_rate))      # But since we are doing dropout to avoid overfitting... increase
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))
        print(self.model.layers[-1].input_shape)
        print(self.model.layers[-1].output_shape)
        self.model.add(Dense(self.output_dim, activation='softmax'))  # Actions are a 6x1 vector
        
        try:
             self.model.load_weights('saved_weights-'+str(self.weight_i)+'.h5')
        except:
            print("Training model without old weights")
            
        # State space size will be a 210x160x1 dim
        self.state = Input(shape=[80,80,1])
        
        print(self.model.summary())
            
    def __build_train_fn(self):
        # 6x1 tensor
        action_prob_placeholder = self.model.output
        # 6x1
        action_placeholder = Bk.placeholder(shape=(None, self.environment.action_space.n),
                                                  name="action_vector")
        # nx1
        discounted_reward_placeholder = Bk.placeholder(shape=(None,),
                                                    name="discount_reward")
        # ?? vectorize mult and sum, why not just do action_prob_placeholder transpose * action_onehot??
        action_prob = Bk.sum(action_prob_placeholder * action_placeholder, axis=1)
        # action_prob = action_prob_placeholder.transpose() * action_placeholder
        # log! 
        log_action_prob = Bk.log(action_prob)

        # -loss => because Theta = Theta + alpha * gradient * (gamma * r)
        loss = -log_action_prob * discounted_reward_placeholder
        loss = Bk.mean(loss)
        
        # This way we can access the weights
        updates = self.optimizer.get_updates(params=self.model.trainable_weights,
                                   constraints=[],
                                   loss=loss)
        
        # Feed into the model like the feed_dict in TF
        self.train_fn = Bk.function(inputs=[self.model.input,
                                           action_placeholder,
                                           discounted_reward_placeholder],
                                   outputs=[],
                                   updates=updates)
        
    def fit(self, S,A,R):
        action_onehot = to_categorical(A, num_classes=self.output_dim)
        discounted_normed_reward = apply_gamma(R)
        self.train_fn([S, action_onehot, discounted_normed_reward]) 
        # call what was built in build_train_fn which takes as input:
        # a list of [input states, categorical actions, and discounted rewards]
    
    def start_episode(self):
        S = []
        A = []
        R = []
        self.state = self.environment.reset()
        total_episode_reward = 0
        done = False
        while not done:
            s, a = self.choose_action() #overwrite s with difference of new state from the past
            s_next, r, done, info = self.environment.step(a)
            total_episode_reward += r
            S.append(s) # previous state
            A.append(a) # action from that state
            R.append(r) # reward from action from that state
            self.state = s_next
            if done:
                self.fit(S,A,R) # takes all the items of the episode from S_0,A_1,R_1,...,S_t-1,A_t,R_t
                return total_episode_reward
            
    def training(self):
        div = 200
        episode_number = 0
        group_of_rewards = []
        while True:
            reward_total_for_episode = self.start_episode()
            group_of_rewards.append(reward_total_for_episode)
            if episode_number % div == 0 and episode_number != 0:
                self.saveweights()
                mean_rewards = np.mean(group_of_rewards)
                print("Mean rewards for "+ str(div) + " episodes: "+ str(mean_rewards))
                group_of_rewards = []
            episode_number += 1
    
            
    def savemodel(self):
        with open('saved_model-'+str(self.weight_i)+'.json', 'w') as modelfile:
            modelfile.write(self.model.to_json())
            
    def saveweights(self):
        self.model.save_weights('saved_weights-'+str(self.weight_i)+'.h5', overwrite=True)
        
    def choose_action(self):
        #Probabilistically pick an action given our network outputs.
        self.state = self.preprocess(self.state)
        s = self.state
        if self.prev_processed_state is None:
            # prev_processed_state does not exist yet, so we take the difference from 0s but maybe we should be taking the difference from 1s
            self.prev_processed_state = np.ones((80, 80, 1), dtype='float32')/2  # Average field
        difference_processed = (s - self.prev_processed_state) # fit and predict off the difference values between steps
        self.prev_processed_state = s # update for next iteration calculation
        # Create a flattened np array of the result of the prediction a 6x1 vector from softmax
        self.action_probabilities = self.model.predict(difference_processed.reshape([-1,80,80,1])).flatten()
        # Introduce exploration based on a gaussian distribution, we use a very low probability for exploration
        # Such as testing if e < -.5,
        e = np.random.normal() # range from [~-3.5, ~3.5] where a distribution is according the CDF of the gaussian
        if (e < self.random_choice_threshold):
            action = np.random.choice(self.environment.action_space.n, 1)[0]
        else:
            # Choose according to the policy gradient (the prediction's probabilities become the distribution)
            action = np.random.choice(self.environment.action_space.n, 1, p=self.action_probabilities)[0]
        # Make the step and return the results to the train function (caller)
        return difference_processed, action
    
    def preprocess(self, state):
        # take a 210x160x3 image and convert to 210x160x1 as 210*160 length vector flattened F = float32 in dim
        # From https://docs.scipy.org/doc/scipy/reference/generated/scipy.misc.imread.html
        #r, g, b = state[:, :, 0], state[:, :, 1], state[:, :, 2]
        #F = r * 299.0/1000 + g * 587.0/1000 + b * 114.0/1000
        # Now using Karpathy 0/1 conversion of a majority of the surface area of the screen down to 80x80
        F = state
        F = F[35:195] # crop vertical rows to 160
        F = F[::2,::2,0] # downsample by factor of 2 to 80x80 just keeps the first index of the 3rd dim
        F[F != 0] = 255 # everything else (paddles, ball) just set to 1
        F[F != 255] = 0
        return F.reshape([80,80,1])

In [None]:
def apply_gamma(r):
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    # normalize and return
    return (discounted_r - discounted_r.mean()) / (discounted_r.std()+1e-9)

In [None]:
agent = rl_agent(
    learning_rate = .001,
    decay_rate = 1e-8,
    random_choice_threshold = -2,
    optimizer = Adam,
    dropout_rate = 0.5,
    environment=env,
    hidden_units=1024,
    epsilon=1e-8,
    weight_iteration=2
    )
agent.training()

(None, 4, 4, 256)
(None, 4, 4, 256)
(None, 512)
(None, 512)
Training model without old weights
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 40, 40, 32)        160       
_________________________________________________________________
batch_normalization_19 (Batc (None, 40, 40, 32)        128       
_________________________________________________________________
activation_19 (Activation)   (None, 40, 40, 32)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 20, 20, 64)        8256      
_________________________________________________________________
batch_normalization_20 (Batc (None, 20, 20, 64)        256       
_________________________________________________________________
activation_20 (Activation)   (None, 20, 20, 64)        0         
_______________________________________________



Mean rewards for 200 episodes: 1.19900497512
Mean rewards for 200 episodes: 1.15
Mean rewards for 200 episodes: 1.02
Mean rewards for 200 episodes: 1.125
Mean rewards for 200 episodes: 1.215
Mean rewards for 200 episodes: 1.305
Mean rewards for 200 episodes: 1.375
Mean rewards for 200 episodes: 0.99
Mean rewards for 200 episodes: 1.34
Mean rewards for 200 episodes: 1.02
Mean rewards for 200 episodes: 1.235
Mean rewards for 200 episodes: 1.175
Mean rewards for 200 episodes: 1.255
Mean rewards for 200 episodes: 1.255
Mean rewards for 200 episodes: 1.21
Mean rewards for 200 episodes: 1.48
Mean rewards for 200 episodes: 1.15
Mean rewards for 200 episodes: 1.44
Mean rewards for 200 episodes: 1.305
Mean rewards for 200 episodes: 1.275
Mean rewards for 200 episodes: 1.18
Mean rewards for 200 episodes: 1.195
Mean rewards for 200 episodes: 1.075
Mean rewards for 200 episodes: 1.17
Mean rewards for 200 episodes: 1.035
Mean rewards for 200 episodes: 1.115
Mean rewards for 200 episodes: 1.01
Mean 

In [None]:
plt.imshow(agent.preprocess(agent.state))

In [23]:
plt.hist(agent.reward_tensor)

[200  72  72]
