# Reinforcement Learning Part 2: Policy Gradients

## Advantage Actor-Critic Policy Gradients

In [None]:
%matplotlib inline

import numpy as np
import gym
from keras.engine import training
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation
import tensorflow as tf
from sys import stdout
from os import path, makedirs
import json
from itertools import count
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG; the action sampling further down draws from it
# through np.random.choice.
# NOTE(review): the gym environment and the Keras weight initializers
# presumably keep their own random state, so runs may still differ - confirm.
np.random.seed(0)

In [None]:
# Environment shared by every cell below
ENV = gym.make("LunarLander-v2")

# Length of one observation vector, read off a first reset
NUM_INPUT = ENV.reset().shape[0]

# Discrete action count and the matching index array 0 .. NUM_ACTION-1
NUM_ACTION = ENV.action_space.n
ACTIONS = np.arange(NUM_ACTION)

In [None]:
# Number of episodes played by one call of advantage_actor_critic
NUM_EPISODE = 50

# Hyper-parameter search space; every value is constant over one epoch
STEP_MEM_RANGE = np.arange(1, 7)
NUM_HIDDEN_NEURON_RANGE = [256]
INITIALIZATIONS = [
    'uniform', 'lecun_uniform', 'normal', 'identity', 'orthogonal',
    'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',
]
ACTIVATIONS = [
    'softplus', 'softsign', 'relu', 'tanh',
    'sigmoid', 'hard_sigmoid', 'linear',
]

# Discount factor applied to future rewards
GAMMA = 0.99

# Directory for result files, created on first use
SAVE_PATH = path.join(path.expanduser("~"), "RL_Saves")
try:
    makedirs(SAVE_PATH)
except OSError:
    # Re-raise unless the directory is there after all
    if not path.isdir(SAVE_PATH):
        raise

## Policy Gradient Models: A Methodical Overview

At every step, the agent faces the decision of taking one of the available actions. When applying a policy gradient model, the agent estimates which move is most likely to lead to a good end / a high reward: It assigns probabilities to every action. In order to accomplish this, we need a first ANN block, which outputs probabilities / uses a softmax at the output layer. It is called the *policy estimator*.

After this, it samples an action from this probability distribution (so more probable actions are drawn more often), and observes the consequences of the action. 

The implemented advantage actor-critic policy gradient model updates the policy in batch mode after having terminated an episode. It so to speak does a retrospection on a completed episode, with ex-post knowledge of the episode's outcome and all its moves and direct consequences.

The agent therefore needs to keep track of the transition ```s_t, a_t, r_t``` as well as the probability of the action taken, ```probs[a_t]```, for every step ```t``` of the episode. 

After having finished, the agent takes this episode experience and walks through it again step by step, computing the relevant metrics at every step ```t```.

Policy gradient models operate with non-standard gradients. In general, we need a ```target``` and a ```baseline``` in order to compute the so called ```advantage```. 

The ```target``` component differs for the different types of policy gradient models: for advantage actor-critic models, it is the cumulated, discounted (again, with discount factor ```GAMMA```) reward for the remaining steps, at step ```t```. Computing this ex post is not difficult!

The ```baseline``` is the *estimate* of the cumulated, discounted reward for the remaining steps, at step ```t```. For this, we need a second ANN block, the so called *value estimator*, which outputs a regression estimate and is trained with a mean squared error loss function.

The ```advantage``` is nothing more than the signed difference between the ```target``` and its estimate, the ```baseline```: ```advantage = target - baseline```.

The loss at ```t``` eventually is the cross-entropy of the action taken, i.e. ```-log(probs[a_t])```, multiplied by the ```advantage``` at ```t```.

In order to train the model on this loss in Keras, I needed to implement a helper function ```get_trainable_params```, as proposed by the helpful minds at https://github.com/fchollet/keras/issues/3062. A big thank you for that! Getting the gradients subsequently relies directly on TensorFlow, applying ```tf.gradients()```.

Having said all this, it is time to implement the model and testing routines:

In [None]:
def _create_network(model_type):
    """Build and compile the value estimator or the policy estimator.

    Both networks take ``STEP_MEM * NUM_INPUT`` inputs and have one hidden
    layer of ``NUM_HIDDEN_NEURON`` units. Layer sizes, initializers and
    activations are read from the module-level settings at call time.

    Args:
        model_type: ``'value'`` for the single-output regression network
            (compiled with an MSE loss), or ``'policy'`` for the network with
            a softmax over ``NUM_ACTION`` outputs.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: if ``model_type`` is neither ``'value'`` nor ``'policy'``.
            Previously such a call silently returned an empty, uncompiled model.
    """
    # Reject unknown types up front instead of handing back an empty model
    if model_type not in ('value', 'policy'):
        raise ValueError(
            "model_type must be 'value' or 'policy', got {!r}".format(model_type))

    model = Sequential()

    if model_type == 'value':
        # Regression function estimate for calculating the advantage
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_V, input_dim=STEP_MEM*NUM_INPUT))
        model.add(Activation(ACTIVATION_V))
        model.add(Dense(1, init=INITIALIZATION_V))
        model.compile(optimizer='rmsprop', loss='mse')
    else:
        # Probability estimate over the available actions
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_P, input_dim=STEP_MEM*NUM_INPUT))
        model.add(Activation(ACTIVATION_P))
        model.add(Dense(NUM_ACTION, init=INITIALIZATION_P))
        model.add(Activation('softmax'))
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def get_trainable_params(model):
    """Return one flat list with the trainable weights of every layer of ``model``.

    The weights are gathered layer by layer, in the order of ``model.layers``,
    via ``training.collect_trainable_weights``.
    """
    return [
        weight
        for layer in model.layers
        for weight in training.collect_trainable_weights(layer)
    ]

In [None]:
def _discounted_returns(rewards, gamma):
    """Compute the discounted reward-to-go for each step of one episode.

    Entry ``t`` of the result equals ``sum(gamma**i * rewards[t + i])`` over
    the remaining steps. It is accumulated in a single backward pass, so the
    whole episode costs O(n) instead of re-summing the tail for every step.
    """
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def advantage_actor_critic(model_v, model_p, render=False):
    """Play ``NUM_EPISODE`` episodes and run the update pass after each one.

    During an episode, actions are sampled from the probabilities predicted by
    ``model_p``. Once the episode ends, every recorded step is replayed: the
    discounted return of the remaining steps is the target, the prediction of
    ``model_v`` is the baseline, their difference is the advantage, and
    ``model_v`` is fitted on the target.

    Args:
        model_v: value estimator; its ``predict`` and ``fit`` methods are used.
        model_p: policy estimator; ``predict`` must return one probability per
            entry of ``ACTIONS``.
        render: when true, ``ENV.render()`` is called before every step.

    Returns:
        dict mapping the episode index to
        ``[sum of rewards, steps with r_t >= 100, steps with r_t >= 200]``.
    """
    # Init epoch stats
    stats = {}

    # The list of policy weights is the same for every step; collect it once
    network_params = get_trainable_params(model_p)

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode transitions & stats
        transitions = []
        stats[episode] = [0]*3

        # Init observations: the first state repeats the first observation
        # STEP_MEM times
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():

            # Optionally draw the current frame
            if render: ENV.render()

            # Estimate probabilities for each possible action
            probs = model_p.predict(s_t[np.newaxis])[0]

            # Take an action: Sample an action from the probabilities distribution
            a_t = np.random.choice(ACTIONS, p=probs)

            # Observe after action a_t
            x_t, r_t, done, info = ENV.step(a_t)

            # Create state at t1: Prepend the new observation, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Keep track of the episode transition and the probability of the action taken
            transitions.append((s_t, a_t, r_t, probs[a_t]))

            # Update statistics: Sum up rewards, count successes, solved
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1

            # Visualize
            stdout.write("\r{} | Step {} @ Episode {}/{}".format(
                EPOCH, step, episode, NUM_EPISODE))

            if done or step > 10000: break

            # Update state
            s_t = s_t1

        # Targets for all steps of the episode
        # -> Total discounted reward after each step during the episode
        targets = _discounted_returns([tr[2] for tr in transitions], GAMMA)

        # Update after each episode
        for transition, target in zip(transitions, targets):
            state = transition[0][np.newaxis]

            # Estimate baseline for action taken. This happens BEFORE the value
            # estimator is fitted on this target; predicting afterwards would
            # compare the target with a network just trained on that very
            # target and shrink the advantage.
            baseline = model_v.predict(state)[0][0]

            # Calculate advantage for action taken
            advantage = target - baseline

            # Update value estimator
            model_v.fit(state, np.asarray([target]), verbose=0)

            # Calculate loss
            loss = -np.log(transition[3]) * advantage

            # Update policy estimator
            # NOTE(review): `loss` is a plain NumPy value here, not a tensor
            # built from the policy weights, and the returned gradients are
            # never applied - so this pass presumably leaves model_p unchanged.
            # A symbolic loss plus an optimizer step is still needed; confirm.
            param_grad = tf.gradients(loss, network_params)

    return stats

In [None]:
# Settings of this single run; the functions above read them as globals
STEP_MEM = 1
NUM_HIDDEN_NEURON = 300
INITIALIZATION_V = 'glorot_uniform'
INITIALIZATION_P = 'glorot_uniform'
ACTIVATION_V = 'relu'
ACTIVATION_P = 'relu'

# Create model id
EPOCH = '_'.join([repr(STEP_MEM), str(NUM_HIDDEN_NEURON), INITIALIZATION_V, INITIALIZATION_P, ACTIVATION_V, ACTIVATION_P])

# Init epoch overview unless an earlier cell already created it; without this
# the "Memorize" step below fails with a NameError on a fresh kernel
try:
    epoch_account
except NameError:
    epoch_account = {}

# Initialize models
model_v, model_p = _create_network('value'), _create_network('policy')

# Train model / Play epoch
stats = advantage_actor_critic(model_v, model_p)

# Calculate cumulative epoch statistics
highest_reward = max(v[0] for v in stats.values())
success_episodes = sum(v[1] for v in stats.values())
solved_episodes = sum(v[2] for v in stats.values())

# Visualize
stdout.write("\rEpoch {}, Maximum Reward {}, Successful Episodes {}, Solved Episodes {}".format(
    EPOCH, highest_reward, success_episodes, solved_episodes))

# Memorize
if success_episodes > 0:
    epoch_account[EPOCH] = [highest_reward, success_episodes, solved_episodes]
    # NOTE(review): append mode puts one JSON document after another when this
    # runs more than once; json.load cannot parse such a file - confirm whether
    # overwriting ("w") is what is wanted here.
    with open(path.join(SAVE_PATH, "AAC_Stats.json"), "a") as outfile:
        json.dump(epoch_account, outfile)
            

In [None]:
'''
# Init epoch overview
epoch_account = {}

# Apply brute force
for STEP_MEM in STEP_MEM_RANGE:
    for NUM_HIDDEN_NEURON in NUM_HIDDEN_NEURON_RANGE:
        for INITIALIZATION_V in INITIALIZATIONS:
            for INITIALIZATION_P in INITIALIZATIONS:
                for ACTIVATION_V in ACTIVATIONS:
                    for ACTIVATION_P in ACTIVATIONS:
                        # Initialize models
                        try: model_v, model_p = _create_network('value'), _create_network('policy') 
                        except Exception: raise
                        
                        # Create model id
                        EPOCH = '_'.join([repr(STEP_MEM), str(NUM_HIDDEN_NEURON), INITIALIZATION_V, INITIALIZATION_P, ACTIVATION_V, ACTIVATION_P])
                                     
                        # Train model / Play epoch
                        stats = advantage_actor_critic(model_v, model_p) 

                        # Calculate cumulative epoch statistics
                        highest_reward = max([ v[0] for v in stats.values() ])
                        success_episodes = sum([ v[1] for v in stats.values() ])
                        solved_episodes = sum([ v[2] for v in stats.values() ])
                                
                        # Visualize
                        stdout.write("\rEpoch {}, Maximum Reward {}, Successful Episodes {}, Solved Episodes {}".format(\
                            EPOCH, highest_reward, success_episodes, solved_episodes))
                        
                        # Memorize
                        if success_episodes > 0: 
                            epoch_account[EPOCH] = [highest_reward, success_episodes, solved_episodes]
                            with open(path.join(SAVE_PATH, "AAC_Stats.json"), "a") as outfile: json.dump(epoch_account, outfile) 
            
            '''

TODO Cannot be tested on Win server, since TensorFlow presently cannot be run on Windows.

## Plot Learning Progress

In [None]:
from itertools import chain
from os import listdir

# x axis: episode numbers 1 .. NUM_EPISODE
episodes = list(range(1, NUM_EPISODE + 1))
titles = ['Sum Reward', 'Steps', '# Successes', '# Solved']
fontsize = 14

# One figure per stats file found in SAVE_PATH, one subplot per title
file_count = 0
for file_name in listdir(SAVE_PATH):
    if "AAC_Stats" not in file_name:  # only AAC ###TODO
        continue
    file_count += 1

    # Close the file as soon as it has been parsed
    with open(path.join(SAVE_PATH, file_name)) as json_file:
        data = json.load(json_file)

    # NOTE(review): this expects every top-level value to be a mapping whose
    # values are per-episode lists with one entry per title. The run cell in
    # this file dumps `epoch_account`, whose values are flat 3-element lists -
    # confirm the schema before relying on these plots.
    rows = list(chain.from_iterable(v.values() for v in data.values()))

    plt.figure(file_count, figsize=(18,6), dpi=320)
    for i, title in enumerate(titles):
        plt.subplot(2,2,i+1)
        plt.plot(episodes, [row[i] for row in rows], color='blue')
        plt.xlabel("Episodes", fontsize=fontsize)
        plt.ylabel("AAC: "+title, fontsize=fontsize)