The idea of this code is a barebones implementation of Vanilla Policy Gradient (a.k.a. REINFORCE / Monte Carlo Policy Gradient) for the CartPole-v0 environment. It solves the problem in approx. 200 episodes.

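This code has not been fully unit tested and leaves out extras such as saving videos, in order to remove as much complexity as possible; only basic TensorBoard logging of the loss, weights, gradients and episode return is kept.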

Code with the above bells and whistles will be held in a separate file.

In [1]:
import time
import sys
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
import gym

sys.path.append('../../../01_vrp/src/vrp/utils/')
import sm_functions as sm

In [2]:
class VanillaPolicyGradient:
    '''Vanilla Policy Gradient (REINFORCE) agent with a softmax policy network.

    The agent stores the (state, action, reward) transitions of one episode,
    then performs a single gradient step on the REINFORCE loss
    ``-sum_t G_t * log pi(a_t | s_t)`` when `learn` is called.

    Args:
        gamma: Discount factor applied to future rewards.
        learning_rate: Learning rate handed to the Adam optimizer.
    '''

    def __init__(self, gamma, learning_rate):
        self.gamma = gamma
        self.learning_rate = learning_rate
        # Per-episode buffers; filled by store_transitions, cleared by learn.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Discounted returns of the most recently learned episode.
        self.discounted_return_memory = []
        self.model = self._build_model()
        # compile() is only used to attach the optimizer to the model; the
        # loss is computed by hand in learn().
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))

    def compute_action(self, state):
        r'''Sample an action from the policy $\pi(a \mid s)$.

        Args:
            state: A single observation (the network expects 4 features).

        Returns:
            The sampled action index as a NumPy integer scalar.
        '''
        # Wrap in a list to add the batch dimension; use float32 so that the
        # input dtype matches what learn() feeds to the network.
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        probs = self.model(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        return action.numpy()[0]

    def learn(self, episode, writer=None):
        '''Run one policy-gradient update on the stored episode.

        Args:
            episode: Episode index, used as the TensorBoard step.
            writer: Optional summary writer to log to. When omitted, the
                module-level ``tensorboard_writer`` is used, which must exist
                by the time this method is called.

        Returns:
            The list of discounted returns, one per stored transition (an
            empty list if nothing was stored).
        '''
        self.discounted_return_memory = self._discount(self.reward_memory)

        # Nothing to learn from: avoid feeding an empty batch to the network.
        if not self.state_memory:
            self._reset_memory()
            return self.discounted_return_memory

        if writer is None:
            # Backward compatible with callers that only pass `episode`.
            writer = tensorboard_writer

        # Batch the whole episode so the network runs one forward pass
        # instead of one per timestep.
        states = tf.convert_to_tensor(np.asarray(self.state_memory, dtype=np.float32))
        actions = tf.convert_to_tensor(np.asarray(self.action_memory, dtype=np.int32))
        returns = tf.convert_to_tensor(
            np.asarray(self.discounted_return_memory, dtype=np.float32))

        # Only the forward pass and the loss need to be recorded on the tape.
        with tf.GradientTape() as tape:
            probs = self.model(states)
            action_probs = tfp.distributions.Categorical(probs=probs)
            log_probs = action_probs.log_prob(actions)
            total_loss = -tf.reduce_sum(returns * log_probs)

        grads = tape.gradient(total_loss, self.model.trainable_weights)
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))

        with writer.as_default():
            tf.summary.scalar('Loss', total_loss, step=episode)

            for weights, gradient in zip(self.model.trainable_weights, grads):
                tf.summary.histogram(weights.name, weights, step=episode)
                tf.summary.histogram(weights.name + '_grads', gradient, step=episode)

            writer.flush()

        # REMEMBER TO RESET MEMORY!!
        self._reset_memory()

        return self.discounted_return_memory

    def store_transitions(self, state, action, reward):
        '''Store individual state, action, reward transitions.'''
        self.state_memory.append(state)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def _reset_memory(self):
        '''Clear the per-episode transition buffers.'''
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

    def _discount(self, reward_memory):
        '''Compute the discounted return G_t for every timestep.

        ``G_t = r_t + gamma * G_{t+1}``, accumulated from the last reward
        backwards. The input sequence is not modified.

        Args:
            reward_memory: Rewards in the order they were received.

        Returns:
            A new list with the discounted return for each timestep, in the
            same order as `reward_memory`.
        '''
        cum_reward = 0
        discounted_return_memory = []

        # Walk backwards without reversing the caller's list in place.
        for reward in reversed(reward_memory):
            cum_reward = reward + self.gamma * cum_reward
            discounted_return_memory.append(cum_reward)
        discounted_return_memory.reverse()

        return discounted_return_memory

    def _build_model(self):
        '''Build the policy network: 4 inputs -> 256 -> 256 -> softmax over 2 actions.'''
        inputs = tf.keras.Input(shape=(4,))
        x = tf.keras.layers.Dense(256, activation='relu')(inputs)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        outputs = tf.keras.layers.Dense(2, activation='softmax')(x)

        model = tf.keras.Model(inputs=inputs, outputs=outputs, name='VPG')
        return model

In [4]:
FLAG_SAVE = False
FOLDER_DESCRIPTION = 'cartpole'
FLAG_PROD = False
PATH_SAVE_RUNS = '../../runs/'

# Save all configs, scripts and logs (if desired) so that experiments are repeatable.
FOLDER_PATH_FULL = sm.generate_runs_folder(flag_save=FLAG_SAVE,
                                           folder_description=FOLDER_DESCRIPTION,
                                           flag_prod=FLAG_PROD,
                                           path_save_runs=PATH_SAVE_RUNS,
                                           path_config=None,
                                           path_train=None,
                                           path_env=None,
                                           path_policy=None)

start = time.time()
gamma = 0.99
learning_rate = 0.0005

env = gym.make('CartPole-v0')
agent = VanillaPolicyGradient(gamma=gamma, learning_rate=learning_rate)
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit an extra "None" line).
agent.model.summary()

# Module-level writer: VanillaPolicyGradient.learn logs to this name.
tensorboard_writer = tf.summary.create_file_writer(FOLDER_PATH_FULL)

n_episodes = 1000

try:
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        returns = 0

        # Play an episode
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, info = env.step(action)
            agent.store_transitions(state, action, reward)
            state = next_state
            returns += reward

        # Learn from said episode
        agent.learn(episode)

        with tensorboard_writer.as_default():
            tf.summary.scalar('Return', returns, step=episode)
finally:
    # Close the environment once, after training (or on error), rather than
    # after every episode, and make sure the last summaries reach disk.
    tensorboard_writer.flush()
    env.close()

print(f'This took {time.time() - start} seconds to run.')

Model: "VPG"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               1280      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 514       
Total params: 67,586
Trainable params: 67,586
Non-trainable params: 0
_________________________________________________________________
None
This took 493.2263717651367 seconds to run.
