# Breakout - Policy Gradient Method
## Vanilla Policy Gradients
* Borrows heavily from the structure of: [Vanilla Policy Gradient](https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb)
* In order to determine positive and negative rewards, we must associate each reward with the direction in which the paddle was moved

In [1]:
import tensorflow as tf
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, convolutional, pooling, Flatten, Dropout, Input
from keras.optimizers import Adam
from keras import backend as Bk
from scipy.misc import imread, imshow
import math
import numpy as np
import matplotlib.pyplot as plt
import gym
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Build the Breakout environment and print its observation/action spaces.
env = gym.make('Breakout-v0')
print(env.observation_space)  # 210v x 160w x 3color Box
print(env.action_space)       # Discrete, 6 actions
s = env.reset()
print(s.shape)

# Discount factor read by apply_gamma().
gamma = 0.99

# Python 2/3 compatibility: Python 3 has no xrange, so alias it to range.
# Only NameError is expected here; anything else should surface.
try:
    xrange(1)
except NameError:
    xrange = range

[2017-05-06 00:20:58,055] Making new env: Breakout-v0


Box(210, 160, 3)
Discrete(6)
(210, 160, 3)


In [3]:
class rl_agent():
    """Policy-gradient agent that plays a gym environment with a Keras CNN policy.

    The agent feeds the difference between consecutive frames to a small
    convolutional network whose softmax output is used as the action
    distribution. After every finished game the network is fitted on the
    one-hot actions that were taken, scaled by the normalised discounted
    return of the step they were taken in.
    """

    # Weights file written by saveweights(); also read by define_model() when
    # no `load_file_weights` was given.
    WEIGHTS_FILE = 'saved_weights5.h5'
    # File written by savemodel().
    MODEL_FILE = 'saved_model5.json'

    def __init__(self,
                 environment,
                 state_space_size,
                 action_space_size,
                 hidden_units = 1096,
                 learning_rate = 1e-2,
                 epsilon = 1e-8,
                 random_choice_threshold = -.72,
                 optimizer = Adam,
                 dropout_rate = 0.4,
                 load_file_weights = None,
                 verbose = 0,
                 frames = 3
                ):
        """Store the configuration; the network itself is built by define_model().

        Args:
            environment: gym environment; reset(), step(), render() and
                action_space.n are used.
            state_space_size: shape the network input (and every frame) is
                reshaped to, e.g. (3, 210, 160).
            action_space_size: number of units of the softmax output layer.
            hidden_units: width of the dense hidden layer.
            learning_rate: `lr` passed to the optimizer.
            epsilon: `epsilon` passed to the optimizer; also added to the
                standard deviation when returns are normalised.
            random_choice_threshold: a standard-normal sample below this value
                triggers a uniformly random action (about 24% of steps for the
                default of -0.72).
            optimizer: optimizer class, called with Adam-style keyword
                arguments (lr, beta_1, beta_2, epsilon, decay).
            dropout_rate: rate of the dropout layer after the hidden layer.
            load_file_weights: weights file to load in define_model(); falls
                back to WEIGHTS_FILE when None.
            verbose: when >= 1, define_model() prints the model summary.
            frames: stored, but not read by any method of this class.
        """
        self.environment = environment
        self.hidden_units = hidden_units # hidden neurons
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.optimizer = optimizer
        self.dropout_rate = dropout_rate
        self.load_file_weights = load_file_weights
        self.verbose = verbose
        self.frames = frames
        self.input_dim = 210*160
        self.prev_processed_state = None
        self.past_differences = []
        self.render = False
        self.random_choice_threshold = random_choice_threshold

    def define_model(self):
        """Build and compile the CNN policy, then try to restore saved weights."""
        # NOTE(review): in the recorded run (state_space_size=(3, 210, 160)) the
        # model summary lists conv2d_1 with output (None, 3, 210, 32), i.e. Keras
        # read the input as height=3, width=210, channels=160 (channels-last).
        # The layout is kept so existing weight files still load, but it is
        # probably not what was intended -- confirm before relying on it.
        input_shape = tuple(self.state_space_size)
        self.model = Sequential()
        self.model.add(convolutional.Conv2D(filters=32, kernel_size=(5, 5), strides=(1, 1), padding='same',
                                            input_shape=input_shape, activation='relu'))
        self.model.add(pooling.MaxPooling2D(pool_size=(2, 2), padding='same'))
        self.model.add(convolutional.Conv2D(filters=64, kernel_size=(5, 5),
                                            strides=(1, 1), padding='same', activation='relu'))
        self.model.add(pooling.MaxPooling2D(pool_size=(2, 2), padding='same'))
        self.model.add(Flatten())
        self.model.add(Dense(self.hidden_units, activation='relu'))   # Karpathy suggested N=200 in the hidden layer
        self.model.add(Dropout(self.dropout_rate))      # But since we are doing dropout to avoid overfitting... increase
        # One softmax output per action.
        self.model.add(Dense(self.action_space_size, activation='softmax'))
        optimizer = self.optimizer(lr=self.learning_rate, beta_1=0.9, beta_2=0.999,
                                   epsilon=self.epsilon, decay=0.0)
        self.model.compile(
            optimizer=optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        # Restoring weights is best effort (missing file, incompatible shapes, ...),
        # but Exception is used so Ctrl-C / SystemExit are no longer swallowed.
        weights_file = self.load_file_weights or self.WEIGHTS_FILE
        try:
            self.model.load_weights(weights_file)
        except Exception:
            print("Training model without old weights")

        # Placeholder tensors; they are not connected to self.model, and
        # self.state is replaced by an observation as soon as training() runs.
        self.state = Input(shape=self.state_space_size)
        self.state1 = Input(shape=self.state_space_size)
        self.reward_tensor = Input(shape=(1,), dtype='float32')
        self.action_tensor = Input(shape=(1,), dtype='int32')

        if self.verbose >= 1:
            # summary() prints by itself; wrapping it in print() adds a stray "None".
            self.model.summary()

    def training(self, max_games = 100000):
        """Play up to `max_games` games, fitting the policy after each one.

        Steps are accumulated in a buffer that is cleared every 20 games; at
        that point the weights are saved and the mean per-step reward of the
        buffer is printed and recorded. Relies on the module-level
        apply_gamma() to discount rewards.

        Args:
            max_games: number of finished games after which training stops.

        Returns:
            The list of mean rewards recorded every 20 games.
        """
        mean_rewards_list = []
        # Start from a clean buffer so features line up with actions and rewards.
        self.past_differences = []
        self.prev_processed_state = None
        self.state = self.environment.reset()
        chosen_vectors, rewards, returns = [], [], []
        episode_start = 0   # index into `rewards` where the current game began
        games_played = 0
        while games_played < max_games:
            history_element, done = self.choose_action()
            chosen_vectors.append(history_element[4])
            rewards.append(history_element[3])
            if not done:
                continue

            games_played += 1
            # Discount each game on its own so that rewards of a later game
            # do not leak back into the final steps of an earlier one.
            returns.append(apply_gamma(np.vstack(rewards[episode_start:])))
            episode_start = len(rewards)

            all_episode_features = np.vstack(self.past_differences)
            all_episode_chosen_vectors = np.vstack(chosen_vectors)
            all_episode_rewards = np.vstack(rewards)
            discounted_episode_rewards = np.vstack(returns)
            # Standard normal feature scaling; epsilon guards against a zero std.
            std_dev = np.std(discounted_episode_rewards) + self.epsilon
            discounted_episode_rewards = (
                discounted_episode_rewards - np.mean(discounted_episode_rewards)) / std_dev
            # Scale every one-hot action row by the return of its step:
            # (N, 1) * (N, actions) broadcasts row-wise.
            targets = discounted_episode_rewards * all_episode_chosen_vectors
            self.model.fit(all_episode_features.reshape((-1,) + tuple(self.state_space_size)),
                           targets, epochs=3, verbose=0, shuffle=True)

            self.prev_processed_state = None
            mean_rewards = np.mean(all_episode_rewards)
            self.state = self.environment.reset()
            if games_played % 20 == 0:
                self.past_differences = []
                chosen_vectors, rewards, returns = [], [], []
                episode_start = 0
                self.saveweights()
                mean_rewards_list.append(mean_rewards)
                print("MEAN REWARDS:" + str(mean_rewards))
        return mean_rewards_list

    def savemodel(self):
        """Write the model architecture as JSON to MODEL_FILE."""
        # The file must be opened for writing; the default mode is read-only.
        with open(self.MODEL_FILE, 'w') as modelfile:
            modelfile.write(self.model.to_json())

    def saveweights(self):
        """Write the current weights to WEIGHTS_FILE, replacing an existing file."""
        self.model.save_weights(self.WEIGHTS_FILE, overwrite=True)

    def choose_action(self):
        """Take one step in the environment using the current policy.

        The network input is the difference between the current and the
        previous frame; the flattened difference is appended to
        self.past_differences.

        Returns:
            A pair ([frame, action, next_state, reward, one_hot_action], done).
        """
        shape = tuple(self.state_space_size)
        # Cast before subtracting: once both operands are uint8 frames the
        # difference would wrap around (e.g. 0 - 142 -> 114).
        # NOTE(review): reshape only reinterprets the buffer, it does not move
        # the colour axis of a (210, 160, 3) frame -- see define_model().
        s = np.asarray(self.state, dtype='float32').reshape(shape)
        if self.prev_processed_state is None:
            self.prev_processed_state = np.zeros(shape, dtype='float32')
        difference_processed = s - self.prev_processed_state
        self.prev_processed_state = s
        self.past_differences.append(difference_processed.flatten())
        self.action_probabilities = self.model.predict(difference_processed[np.newaxis]).flatten()

        n_actions = self.environment.action_space.n
        # Exploration: a standard-normal draw below the threshold picks an action
        # uniformly at random; otherwise sample from the policy output.
        e = np.random.normal()
        if e < self.random_choice_threshold:
            action = np.random.choice(n_actions, 1)[0]
        else:
            # Renormalise so the probabilities sum to 1 within numpy's tolerance.
            probabilities = self.action_probabilities / np.sum(self.action_probabilities)
            action = np.random.choice(n_actions, 1, p=probabilities)[0]
        self.state, reward, done, info = self.environment.step(action)
        if self.render:
            self.environment.render()
        chosen_vector = np.zeros([n_actions])
        chosen_vector[action] = 1
        return [s, action, self.state, reward, chosen_vector], done

    def preprocess(self, state):
        """Convert an (H, W, 3) RGB image to a 2-D (H, W) luminance image.

        Uses the weights 0.299 R + 0.587 G + 0.114 B (the formula documented
        for scipy.misc.imread's flatten option). Not called by the other
        methods of this class.
        """
        # From https://docs.scipy.org/doc/scipy/reference/generated/scipy.misc.imread.html
        r, g, b = state[:, :, 0], state[:, :, 1], state[:, :, 2]
        F = r * 299.0/1000 + g * 587.0/1000 + b * 114.0/1000
        return F

In [4]:
def apply_gamma(r, discount=None):
    """Return the discounted cumulative rewards of a reward sequence.

    Element t of the result is r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...

    Args:
        r: array-like of per-step rewards, 1-D or a column of shape (N, 1).
        discount: discount factor; defaults to the module-level `gamma`.

    Returns:
        A float array with the same shape as `r`.
    """
    if discount is None:
        discount = gamma
    # Work in floats: an integer reward array would otherwise truncate every
    # discounted value when it is written back.
    flat_rewards = np.asarray(r, dtype=float).ravel()
    discounted_r = np.empty_like(flat_rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted future return.
    for t in reversed(range(flat_rewards.size)):
        running_add = running_add * discount + flat_rewards[t]
        discounted_r[t] = running_add
    return discounted_r.reshape(np.shape(r))

In [5]:
# Configure the agent for the Breakout environment created above.
agent = rl_agent(
    env,                # environment
    (3, 210, 160),      # state_space_size
    6,                  # action_space_size
    verbose=1,          # print the model summary when the network is built
    optimizer=Adam,
    epsilon=1e-8,
    hidden_units=1096,
)
# Build the Keras network, then start the training loop.
agent.define_model()
agent.training()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 3, 210, 32)        128032    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 105, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 2, 105, 64)        51264     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 1, 53, 64)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3392)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1096)              3718728   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1096)              0         
__________

MEAN REWARDS:0.00562136117246
MEAN REWARDS:0.00555555555556
MEAN REWARDS:0.00428658909982
MEAN REWARDS:0.00616332819723
MEAN REWARDS:0.00537118741608
MEAN REWARDS:0.00702070207021
MEAN REWARDS:0.00544396470809
MEAN REWARDS:0.00678692623683
MEAN REWARDS:0.00607287449393
MEAN REWARDS:0.00768156424581
MEAN REWARDS:0.00554387306442
MEAN REWARDS:0.0061773255814
MEAN REWARDS:0.00441412520064
MEAN REWARDS:0.00419903422213
MEAN REWARDS:0.00506996552423
MEAN REWARDS:0.0065717415115
MEAN REWARDS:0.00573122529644
MEAN REWARDS:0.00648628613788
MEAN REWARDS:0.00601970083911
MEAN REWARDS:0.00516647531573
MEAN REWARDS:0.00561692566935
MEAN REWARDS:0.00486125177233
MEAN REWARDS:0.00519480519481
MEAN REWARDS:0.00478373529998
MEAN REWARDS:0.00563654033042
MEAN REWARDS:0.00604996096799
MEAN REWARDS:0.00628092881614
MEAN REWARDS:0.00683945284377
MEAN REWARDS:0.0044998977296
MEAN REWARDS:0.00646950092421
MEAN REWARDS:0.00676397294411
MEAN REWARDS:0.00620767494357
MEAN REWARDS:0.00552380952381
MEAN REWARDS:

KeyboardInterrupt: 