In [7]:
import numpy as np
import gym

np.random.seed(3293734)

ENV = gym.make("LunarLander-v2")
INPUT_DIM = ENV.reset().shape[0]
N_ACTIONS = ENV.action_space.n
ACTIONS = np.arange(0, N_ACTIONS)

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.engine import training ###TODO extension
import tensorflow as tf ###TODO extension

from sys import stdout

def _create_network_pol():
    model = Sequential()
    model.add(Dense(HIDDEN_DIM, init='glorot_normal', input_dim=D*INPUT_DIM))  # input_shape=(D*INPUT_DIM,)
    model.add(Activation('relu'))
    model.add(Dense(N_ACTIONS, init='glorot_normal'))
    model.add(Activation('softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) # categorical_crossentropy
    return model

# Regression function estimate for calculating the baseline/advantage	
def _create_network_val():
    model = Sequential()
    model.add(Dense(64, init='glorot_uniform', input_dim=D*INPUT_DIM)) # input_shape=(D*INPUT_DIM,)
    model.add(Activation('tanh'))
    model.add(Dense(1, init='glorot_uniform')) # input_shape=(D*INPUT_DIM,)
    model.compile(loss='mse', optimizer='rmsprop') # Adam(lr=1e-6)
    return model

# https://github.com/fchollet/keras/issues/3062
def get_trainable_params(model):
    params = []
    for layer in model.layers:
        params += training.collect_trainable_weights(layer)
    return params

D = 1
HIDDEN_DIM = 200
model_pol = _create_network_pol()
model_val = _create_network_val()

print model_val.inputs
print model_val.outputs
print model_val.metrics_names

print model_pol.inputs
print model_pol.outputs
print model_pol.metrics_names


for episode in range(50):

    # Instantiate interactions list
    interactions = list()
    discount_factor=0.99 # TODO parametrize brute force

    s_t = ENV.reset()  # TODO stack
    done = False

    # Start an episode
    while not done:
        ###ENV.render()
        probs = model_pol.predict(s_t[np.newaxis])[0]
        # Take an action: Sample an action from the returned probabilities distribution
        a_t = np.random.choice(ACTIONS, p=probs)
        ###stdout.write('\r'+str(a_t))
        ###stdout.flush

        # step the environment and get new measurements
        s_t1, r_t, done, info = ENV.step(a_t)

        # Keep track of the transition and the probabilities
        interactions.append((s_t, a_t, r_t, probs))

        # Update state
        s_t = s_t1

    # Ex post: Go through the episode and make policy updates
    for t, transition in enumerate(interactions):
        
        # The return after this timestep (Discount reward as of frame n of game)
        total_return = sum(discount_factor**i * j[2] for i, j in enumerate(interactions[t:])) 
        
        # Get state at t, ex post
        ep_s_t = transition[0][np.newaxis]
        
        # Update value estimator
        model_val.fit(ep_s_t, np.asarray([total_return]), nb_epoch=10, verbose=0) # TODO nb_epoch=10
        
        # Calculate baseline FOR THE PICKED ACTION ONLY (Regression task)
        baseline_value = model_val.predict(ep_s_t)[0][0]
        
        # Calculate advantage FOR THE PICKED ACTION ONLY  (target / y_true)
        advantage = total_return - baseline_value
        
        # Update policy estimator
        # Get the targets
        targets = transition[3]
        # Define the loss 
        #self.loss = -tf.log(self.picked_action_prob) * self.target
        loss = -np.log(targets[a_t]) * advantage # -tf.log
        # Compute the gradients
        network_params = get_trainable_params(model_pol)
        param_grad = tf.gradients(loss, network_params)
        # (cannot use standard model.fit keras)
        # https://github.com/fchollet/keras/issues/3062 in references on top
        ###estimator_policy.update(transition.state, advantage, transition.action)
        
    print targets, np.sum(np.array(targets))
        


[2016-10-28 23:06:07,220] Making new env: LunarLander-v2


[<tf.Tensor 'dense_input_12:0' shape=(?, 8) dtype=float32>]
[<tf.Tensor 'add_63:0' shape=(?, 1) dtype=float32>]
['loss']
[<tf.Tensor 'dense_input_11:0' shape=(?, 8) dtype=float32>]
[<tf.Tensor 'Softmax_5:0' shape=(?, 4) dtype=float32>]
['loss', 'acc']
[ 0.18308531  0.17087589  0.27821431  0.36782447] 1.0
[ 0.27199739  0.19958311  0.24294177  0.28547773] 1.0
[ 0.2735799   0.1953778   0.23712538  0.29391688] 1.0
[ 0.26688197  0.06791279  0.35919517  0.3060101 ] 1.0
[ 0.24937166  0.24207726  0.22923347  0.27931756] 1.0
[ 0.27044809  0.20189556  0.24371576  0.28394064] 1.0
[ 0.23314312  0.218219    0.22661613  0.32202175] 1.0
[ 0.27347621  0.21600142  0.2416352   0.26888713] 1.0
[ 0.26879242  0.17499436  0.22281675  0.33339649] 1.0
[ 0.27970156  0.20455715  0.25126079  0.26448053] 1.0
[ 0.18919045  0.19687845  0.26634994  0.34758124] 1.0
[ 0.21931063  0.22331965  0.26479807  0.29257163] 1.0
[ 0.28085199  0.21148805  0.23273998  0.27491999] 1.0
[ 0.1907233   0.18381749  0.29057893  0.334880