# Reinforcement Learning Part 2: Policy Gradients

## Actor-Critic Policy Gradients

In [1]:
%matplotlib inline

import numpy as np
import gym
from keras.engine import training
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation
import tensorflow as tf
from sys import stdout
from os import path, makedirs
import json
from itertools import count
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Seed NumPy's global random generator so that the action sampling done with
# np.random.choice in q_actor_critic is repeatable between runs.
# NOTE(review): this presumably does not seed the TensorFlow backend or the
# gym environment -- confirm if fully reproducible runs are required.
np.random.seed(0)

In [3]:
# One shared LunarLander environment, reused by every training run below.
ENV = gym.make("LunarLander-v2")

# Length of a single observation vector, read off the first observation
# returned by a reset.
NUM_INPUT = ENV.reset().shape[0]

# Size of the discrete action space and the matching array of action indices
# (0 .. NUM_ACTION - 1) that actions are sampled from.
NUM_ACTION = ENV.action_space.n
ACTIONS = np.arange(NUM_ACTION)

[2016-11-10 19:59:46,702] Making new env: LunarLander-v2


In [4]:
# Number of episodes played per epoch (one epoch = one hyper-parameter
# combination of the search loop further down).
NUM_EPISODE = 50
 
# Search space. Each combination of the values below stays fixed for one epoch.
# STEP_MEM: how many consecutive observations are stacked into one state vector.
STEP_MEM_RANGE = np.arange(5, 6) # Constant over one epoch
# Width of the single hidden layer of both networks.
NUM_HIDDEN_NEURON_RANGE = [256] # Constant over one epoch
# Keras weight initializer names, tried for the value and the policy network.
INITIALIZATIONS = ['lecun_uniform', 'normal', 'identity', 'orthogonal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'] # Constant over one epoch
# 'uniform' is currently left out of the list above.
# Hidden-layer activations for the value network (ACTIVATION_V in the search loop).
# Other candidates: 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'.
ACTIVATIONS1 = ['softplus']#['softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'] # Constant over one epoch
# Hidden-layer activations for the policy network (ACTIVATION_P in the search loop).
ACTIVATIONS2 = ['relu']
# Discount factor applied to the value estimate of the next state.
GAMMA = 0.99
 
# Directory (inside the user's home) where the search statistics are written.
SAVE_PATH = path.join(path.expanduser("~"), "RL_Saves")
# Create the directory; an OSError is only tolerated when the directory is
# already there, any other failure is re-raised.
try:
    makedirs(SAVE_PATH)
except OSError:
    if not path.isdir(SAVE_PATH):
        raise

In [5]:
def _create_network(model_type): 
    model = Sequential()
    
    if model_type == 'value':
        # Regression function estimate for calculating the advantage
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_V, input_dim=STEP_MEM*NUM_INPUT)) 
        model.add(Activation(ACTIVATION_V))
        model.add(Dense(1, init=INITIALIZATION_V))
        model.compile(optimizer='rmsprop', loss='mse')
    
    if model_type == 'policy':
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_P, input_dim=STEP_MEM*NUM_INPUT)) 
        model.add(Activation(ACTIVATION_P))
        model.add(Dense(NUM_ACTION, init=INITIALIZATION_P))
        model.add(Activation('softmax'))
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [6]:
def get_trainable_params(model):
    """Return one flat list with the trainable weights of every layer of `model`.

    The layers are visited in `model.layers` order and each layer's weights
    are obtained through `training.collect_trainable_weights`.
    """
    return [
        weight
        for layer in model.layers
        for weight in training.collect_trainable_weights(layer)
    ]

In [None]:
def q_actor_critic(model_v, model_p, render=False):
    """Play NUM_EPISODE episodes while training both networks after every step.

    Reads the module-level ENV, STEP_MEM, NUM_INPUT, ACTIONS, GAMMA,
    NUM_EPISODE and EPOCH (the latter only for the progress message).

    Args:
        model_v: compiled value network; called via `predict` and `fit`.
        model_p: compiled policy network (softmax output, categorical
            cross-entropy loss); called via `predict` and `train_on_batch`.
        render: when True, ENV.render() is called at every step.

    Returns:
        dict mapping the episode index to a three-element list:
        [sum of rewards, number of steps with r_t >= 100,
         number of steps with r_t >= 200].
    """
    # Init epoch stats
    stats = {}

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode stats
        stats[episode] = [0]*3

        # Init observations: the first state is the first observation
        # repeated STEP_MEM times
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():

            # Draw the current frame if requested
            if render: ENV.render()

            # Estimate probabilities for each possible action
            probs = model_p.predict(s_t[np.newaxis])[0]

            # Take an action: Sample an action from the returned probabilities distribution
            a_t = np.random.choice(ACTIONS, p=probs)

            # Observe after action a_t
            x_t, r_t, done, info = ENV.step(a_t)

            # Create state at t1: Prepend the new observation, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # TD target for the action taken. A terminal transition has no
            # successor state, so nothing is bootstrapped from the value net.
            if done:
                target = r_t
            else:
                target = r_t + GAMMA * model_v.predict(s_t1[np.newaxis])[0][0]

            # Baseline and advantage are taken BEFORE the value net is fitted
            # to this target; predicting afterwards would already have moved
            # the baseline towards the target and shrunk the advantage.
            baseline = model_v.predict(s_t[np.newaxis])[0][0]
            advantage = target - baseline

            # Update value estimator
            model_v.fit(s_t[np.newaxis], np.asarray([target]), verbose=0)

            # Update policy estimator. With a one-hot target for the chosen
            # action the categorical cross-entropy equals -log(probs[a_t]);
            # weighting that sample by the advantage gives the policy-gradient
            # loss -log(pi(a_t|s_t)) * advantage, and the optimizer the model
            # was compiled with applies the step. A zero advantage carries no
            # gradient, so the update is skipped in that case.
            if advantage != 0:
                a_onehot = np.zeros_like(probs)
                a_onehot[a_t] = 1
                model_p.train_on_batch(
                    s_t[np.newaxis],
                    a_onehot[np.newaxis],
                    sample_weight=np.asarray([advantage]))

            # Update statistics: Sum up rewards, count steps, successes, solved
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1

            # Visualize
            stdout.write("\r{} | Step {} @ Episode {}/{}".format(\
                EPOCH, step, episode, NUM_EPISODE))

            # Exit episode after crash or deadlock
            if done or step > 10000: break

            # Update state
            s_t = s_t1

    return stats

In [None]:
# Init epoch overview: epoch id -> [highest reward, successful episodes, solved episodes]
epoch_account = {}

# Apply brute force: train a fresh value/policy pair for every combination of
# the search space. The loop variables are deliberately module-level names,
# because _create_network and q_actor_critic read them as globals.
for STEP_MEM in STEP_MEM_RANGE:
    for NUM_HIDDEN_NEURON in NUM_HIDDEN_NEURON_RANGE:
        for INITIALIZATION_V in INITIALIZATIONS:
            for INITIALIZATION_P in INITIALIZATIONS:
                for ACTIVATION_V in ACTIVATIONS1:
                    for ACTIVATION_P in ACTIVATIONS2:
                        # Create model id. str() is used for STEP_MEM because the
                        # repr of a NumPy integer differs between NumPy versions.
                        EPOCH = '_'.join([str(STEP_MEM), str(NUM_HIDDEN_NEURON), INITIALIZATION_V, INITIALIZATION_P, ACTIVATION_V, ACTIVATION_P])

                        # Initialize models. A combination that cannot be built is
                        # skipped, but reported so the skip is not silent.
                        try:
                            model_v = _create_network('value')
                            model_p = _create_network('policy')
                        except Exception as error:
                            stdout.write("\rSkipping {}: {}\n".format(EPOCH, error))
                            continue

                        # Train model / Play epoch
                        stats = q_actor_critic(model_v, model_p)

                        # Calculate cumulative epoch statistics. float() keeps the
                        # reward JSON-serializable whatever numeric type the
                        # environment handed back.
                        highest_reward = float(max(v[0] for v in stats.values()))
                        success_episodes = sum(v[1] for v in stats.values())
                        solved_episodes = sum(v[2] for v in stats.values())

                        # Visualize. The trailing newline keeps the summary on screen;
                        # otherwise the next "\r" progress line overwrites it.
                        stdout.write("\rEpoch {}, Maximum Reward {}, Successful Episodes {}, Solved Episodes {}\n".format(\
                            EPOCH, highest_reward, success_episodes, solved_episodes))

                        # Memorize
                        if success_episodes > 0:
                            epoch_account[EPOCH] = [highest_reward, success_episodes, solved_episodes]
                            # Rewrite the file with the complete dict each time. Appending
                            # the whole dict again and again produced several concatenated
                            # JSON documents that json.load cannot read back.
                            with open(path.join(SAVE_PATH, "QAC_Stats.json"), "w") as outfile:
                                json.dump(epoch_account, outfile)

5_256_lecun_uniform_lecun_uniform_softplus_relu | Step 72 @ Episode 9/50