# Reinforcement Learning Part 2: Policy Gradients

In [1]:
%matplotlib inline

import json
from itertools import count, product
from os import path, makedirs, listdir
from sys import stdout

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras.engine import training
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.models import model_from_json

Using TensorFlow backend.


In [2]:
# Seed NumPy's global random generator, which np.random.choice draws from when
# the agents sample their actions.
np.random.seed(0)

In [3]:
# Build the environment once and derive the problem dimensions from it.
ENV = gym.make("LunarLander-v2")

# Length of a single observation vector, read off a freshly reset environment.
NUM_INPUT = ENV.reset().shape[0]

# Size of the discrete action space and the action ids 0 .. NUM_ACTION-1 to sample from.
NUM_ACTION = ENV.action_space.n
ACTIONS = np.arange(NUM_ACTION)

[2016-11-07 22:11:21,066] Making new env: LunarLander-v2


In [4]:
# Episodes played per epoch (one epoch = one hyper-parameter combination).
# TODO include again, set to 10**7
NUM_EPISODE = 1000

# Hyper-parameter search space; every value below is constant over one epoch.
STEP_MEM_RANGE = np.arange(1, 31)
NUM_HIDDEN_NEURON_RANGE = [2**exponent for exponent in np.arange(11)]
INITIALIZATIONS = [
    'uniform', 'lecun_uniform', 'normal', 'identity', 'orthogonal',
    'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',
]
ACTIVATIONS = [
    'softplus', 'softsign', 'relu', 'tanh',
    'sigmoid', 'hard_sigmoid', 'linear',
]

# Discount factor applied to future rewards.
GAMMA = 0.99

# Result files are stored in a folder below the user's home directory.
SAVE_PATH = path.join(path.expanduser("~"), "RL_Saves")
try:
    makedirs(SAVE_PATH)
except OSError:
    # makedirs also fails when the folder is already there; that case is fine,
    # anything else (e.g. a regular file with that name) is re-raised.
    folder_exists = path.isdir(SAVE_PATH)
    if not folder_exists:
        raise

In [5]:
def _create_network(model_type):
    """Build and compile a Keras model with a single hidden layer.

    The architecture is controlled by module-level hyper-parameters that must
    be set before calling: NUM_HIDDEN_NEURON, STEP_MEM and NUM_INPUT for both
    types, INITIALIZATION_V / ACTIVATION_V for the value model and
    INITIALIZATION_P / ACTIVATION_P (plus NUM_ACTION) for the policy model.

    Args:
        model_type: 'value' for a regressor with one linear output trained
            with mean squared error, or 'policy' for a softmax classifier over
            NUM_ACTION actions trained with categorical cross-entropy.

    Returns:
        The compiled Sequential model.

    Raises:
        ValueError: if model_type is neither 'value' nor 'policy'. Previously
            an empty, uncompiled model was returned silently in that case.
    """
    if model_type not in ('value', 'policy'):
        raise ValueError(
            "Unknown model_type {!r}; expected 'value' or 'policy'".format(model_type))

    model = Sequential()

    # Both model types consume the stacked observation history as a flat vector
    input_dim = STEP_MEM * NUM_INPUT

    if model_type == 'value':
        # Regression function estimate for calculating the advantage
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_V, input_dim=input_dim))
        model.add(Activation(ACTIVATION_V))
        model.add(Dense(1, init=INITIALIZATION_V))
        model.compile(optimizer='rmsprop', loss='mse')
    else:
        # Probability distribution over the available actions
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_P, input_dim=input_dim))
        model.add(Activation(ACTIVATION_P))
        model.add(Dense(NUM_ACTION, init=INITIALIZATION_P))
        model.add(Activation('softmax'))
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [6]:
def get_trainable_params(model):
    """Gather the trainable weights of all layers of `model` in one flat list.

    Layers are visited in `model.layers` order and each layer's weights are
    obtained from `training.collect_trainable_weights`.
    """
    return [
        weight
        for layer in model.layers
        for weight in training.collect_trainable_weights(layer)
    ]

## Advantage Actor-Critic Policy Gradients

In [7]:
def _discounted_returns(rewards, gamma):
    """Return, for every step t, the sum over i >= 0 of gamma**i * rewards[t+i]."""
    returns = np.zeros(len(rewards))
    running = 0.0
    # Accumulate from the last step backwards: G_t = r_t + gamma * G_{t+1}
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def advantage_actor_critic(model_v, model_p, render=False):
    """Play NUM_EPISODE episodes and train both models after each episode.

    During an episode actions are sampled from the policy model's output.
    After the episode, every visited state is used once: the target is the
    total discounted reward from that step on, the baseline is the value
    model's prediction for the state, and their difference (the advantage)
    weights the policy update.

    Reads the module-level names ENV, NUM_EPISODE, STEP_MEM, NUM_INPUT,
    ACTIONS and GAMMA.

    Args:
        model_v: compiled value model; must provide `predict` and `fit`.
        model_p: compiled policy model with softmax output and categorical
            cross-entropy loss; must provide `predict` and `train_on_batch`.
        render: if True, render the environment at every step.

    Returns:
        dict mapping the episode index to a list
        [sum of rewards, # steps with reward >= 100, # steps with reward >= 200].
    """
    # Init epoch stats
    stats = {}
    num_actions = len(ACTIONS)

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode transitions & stats
        transitions = []
        stats[episode] = [0]*3

        # Init observations: the state is the first observation repeated STEP_MEM times
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():

            # Draw the current frame if requested
            if render: ENV.render()

            # Estimate probabilities for each possible action
            probs = model_p.predict(s_t[np.newaxis])[0]

            # Take an action: Sample an action from the probabilities distribution
            a_t = np.random.choice(ACTIONS, p=probs)

            # Observe after action a_t
            x_t, r_t, done, _ = ENV.step(a_t)

            # Create state at t1: Put the new observation in front, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Keep track of the episode transition
            transitions.append((s_t, a_t, r_t))

            # Update statistics: Sum up rewards, count successes, solved
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1

            if done or step > 10000: break

            # Update state
            s_t = s_t1

        # Total discounted reward after each step, computed in one backward pass
        returns = _discounted_returns([reward for _, _, reward in transitions], GAMMA)

        # Update policy after each episode
        for (state, action, _), target in zip(transitions, returns):
            state_batch = state[np.newaxis]

            # Estimate the baseline BEFORE the value model is fitted to this
            # very target, otherwise the advantage is pulled towards zero
            baseline = model_v.predict(state_batch)[0][0]

            # Calculate advantage for action taken
            advantage = target - baseline

            # Update value estimator
            model_v.fit(state_batch, np.asarray([target]), verbose=0)

            # Update policy estimator. With a one-hot target for the action
            # taken, categorical cross-entropy equals -log(pi(a|s)); weighting
            # the sample by the advantage yields the policy gradient loss
            # -log(pi(a|s)) * advantage and lets the compiled optimizer apply it.
            # A zero advantage has no gradient to contribute, so it is skipped
            # (this also avoids any weight normalisation dividing by zero).
            if advantage != 0:
                action_one_hot = np.zeros((1, num_actions))
                action_one_hot[0, action] = 1.0
                model_p.train_on_batch(state_batch, action_one_hot,
                                       sample_weight=np.asarray([advantage]))

    return stats

## Q Actor-Critic Policy Gradients

In [None]:
def q_actor_critic(model_v, model_p, render=False):
    """Play NUM_EPISODE episodes and train both models after every step.

    At each step an action is sampled from the policy model's output. The
    target is the one-step bootstrapped return r_t + GAMMA * V(s_t1), or just
    r_t when the episode ended with this step. The baseline is the value
    model's prediction for s_t, and their difference (the advantage) weights
    the policy update.

    Reads the module-level names ENV, NUM_EPISODE, STEP_MEM, NUM_INPUT,
    ACTIONS and GAMMA.

    Args:
        model_v: compiled value model; must provide `predict` and `fit`.
        model_p: compiled policy model with softmax output and categorical
            cross-entropy loss; must provide `predict` and `train_on_batch`.
        render: if True, render the environment at every step.

    Returns:
        dict mapping the episode index to a list
        [sum of rewards, # steps with reward >= 100, # steps with reward >= 200].
    """
    # Init epoch stats
    stats = {}
    num_actions = len(ACTIONS)

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode stats
        stats[episode] = [0]*3

        # Init observations: the state is the first observation repeated STEP_MEM times
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():

            # Draw the current frame if requested
            if render: ENV.render()

            state_batch = s_t[np.newaxis]

            # Estimate probabilities for each possible action
            probs = model_p.predict(state_batch)[0]

            # Take an action: Sample an action from the returned probabilities distribution
            a_t = np.random.choice(ACTIONS, p=probs)

            # Observe after action a_t
            x_t, r_t, done, _ = ENV.step(a_t)

            # Create state at t1: Put the new observation in front, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Calculate target for action taken. A terminal step has no
            # successor to bootstrap from, so its target is the reward alone.
            if done:
                target = r_t
            else:
                target = r_t + GAMMA * model_v.predict(s_t1[np.newaxis])[0][0]

            # Estimate the baseline BEFORE the value model is fitted to this
            # very target, otherwise the advantage is pulled towards zero
            baseline = model_v.predict(state_batch)[0][0]

            # Calculate advantage for action taken
            advantage = target - baseline

            # Update value estimator
            model_v.fit(state_batch, np.asarray([target]), verbose=0)

            # Update policy estimator. With a one-hot target for the action
            # taken, categorical cross-entropy equals -log(pi(a|s)); weighting
            # the sample by the advantage yields the policy gradient loss
            # -log(pi(a|s)) * advantage and lets the compiled optimizer apply it.
            # A zero advantage has no gradient to contribute, so it is skipped
            # (this also avoids any weight normalisation dividing by zero).
            if advantage != 0:
                action_one_hot = np.zeros((1, num_actions))
                action_one_hot[0, a_t] = 1.0
                model_p.train_on_batch(state_batch, action_one_hot,
                                       sample_weight=np.asarray([advantage]))

            # Update statistics: Sum up rewards, count successes, solved
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1

            if done or step > 10000: break

            # Update state
            s_t = s_t1

    return stats

## Train Models

In [None]:
# Init epoch overviews: model id -> [highest reward, # successful episodes, # solved episodes]
epoch_account_aac = {}
epoch_account_qac = {}


def _summarize_epoch(stats):
    """Condense per-episode stats into [highest reward, # successes, # solved]."""
    episode_stats = list(stats.values())
    return [
        max(v[0] for v in episode_stats),
        sum(v[1] for v in episode_stats),
        sum(v[2] for v in episode_stats),
    ]


# Apply brute force: try every combination of the hyper-parameter ranges.
# The loop variables are module-level names on purpose; _create_network and
# the training functions read them as globals. product() iterates with the
# rightmost range changing fastest, i.e. in the same order as nested loops.
try:
    for (STEP_MEM, NUM_HIDDEN_NEURON, INITIALIZATION_V, INITIALIZATION_P,
         ACTIVATION_V, ACTIVATION_P) in product(
            STEP_MEM_RANGE, NUM_HIDDEN_NEURON_RANGE, INITIALIZATIONS, INITIALIZATIONS,
            ACTIVATIONS, ACTIVATIONS):

        # Initialize models. Each algorithm gets its own fresh pair so that QAC
        # does not start from the weights AAC has just trained.
        try:
            model_v, model_p = _create_network('value'), _create_network('policy')
            model_v_qac, model_p_qac = _create_network('value'), _create_network('policy')
        except Exception:
            # Deliberate best effort: combinations whose models cannot be built are skipped
            continue

        # Create model id
        EPOCH = '_'.join([str(STEP_MEM), str(NUM_HIDDEN_NEURON), INITIALIZATION_V,
                          INITIALIZATION_P, ACTIVATION_V, ACTIVATION_P])

        # Train model / Play epoch
        stats_aac = advantage_actor_critic(model_v, model_p)
        stats_qac = q_actor_critic(model_v_qac, model_p_qac)

        # Calculate epoch stats
        highest_reward_aac, success_episodes_aac, solved_episodes_aac = _summarize_epoch(stats_aac)
        highest_reward_qac, success_episodes_qac, solved_episodes_qac = _summarize_epoch(stats_qac)

        # Visualize
        # NOTE: both messages start with "\r", so the QAC line overwrites the AAC line
        stdout.write("\rAAC - Epoch {}, Maximum Reward {}, Successful Episodes {}, Solved Episodes {}".format(
            EPOCH, highest_reward_aac, success_episodes_aac, solved_episodes_aac))
        stdout.write("\rQAC - Epoch {}, Maximum Reward {}, Successful Episodes {}, Solved Episodes {}".format(
            EPOCH, highest_reward_qac, success_episodes_qac, solved_episodes_qac))

        # Memorize only epochs with at least one successful episode
        if success_episodes_aac > 0:
            epoch_account_aac[EPOCH] = [highest_reward_aac, success_episodes_aac, solved_episodes_aac]
        if success_episodes_qac > 0:
            epoch_account_qac[EPOCH] = [highest_reward_qac, success_episodes_qac, solved_episodes_qac]
finally:
    # Write out, also when the sweep is interrupted or fails part-way, so the
    # results collected so far are not lost
    with open(path.join(SAVE_PATH, "AAC_Stats.json"), "w") as outfile_aac:
        json.dump(epoch_account_aac, outfile_aac)
    with open(path.join(SAVE_PATH, "QAC_Stats.json"), "w") as outfile_qac:
        json.dump(epoch_account_qac, outfile_qac)

## Plot Learning Progress

In [None]:
# The stats files map each epoch id to [highest reward, # successes, # solved]
# (see the "Memorize" step of the training cell), so there is one curve per
# stored statistic and one point per recorded epoch.
titles = ['Maximum Reward', '# Successes', '# Solved']
fontsize = 14

file_count = 0
for file_name in sorted(listdir(SAVE_PATH)):
    if "AAC_Stats" in file_name: # only AAC ###TODO
        # Close the file again as soon as its content is parsed
        with open(path.join(SAVE_PATH, file_name)) as json_data:
            data = json.load(json_data)

        # Nothing to draw if no epoch was recorded
        epoch_stats = list(data.values())
        if not epoch_stats:
            continue

        file_count += 1
        epochs = list(range(1, len(epoch_stats) + 1))

        plt.figure(file_count, figsize=(18,6), dpi=320)
        for i in range(len(titles)):
            plt.subplot(2,2,i+1)
            plt.plot(epochs, [ v[i] for v in epoch_stats ], color='blue')
            plt.xlabel("Epochs", fontsize=fontsize)
            plt.ylabel("AAC: "+titles[i], fontsize=fontsize)