Imports

In [1]:
import gym
import tensorflow as tf
import numpy as np
np.set_printoptions(precision=3)
import random
import datetime
import itertools

import matplotlib
#matplotlib.use("Pdf")
import matplotlib.pyplot as plt
%matplotlib inline

Create the session. An interactive session is a session that is automatically your default session.

In [2]:
sess = tf.InteractiveSession()

My own little library of helper functions!

In [3]:
def fc(input_tensor, in_size, out_size, collection=None, non_linearity=tf.nn.relu):
    
    collections = [tf.GraphKeys.VARIABLES, tf.GraphKeys.TRAINABLE_VARIABLES]
    if collection is not None:
        collections.append(collection)
        
    W = tf.get_variable(
        name="W",
        initializer=tf.truncated_normal(
            shape=[in_size, out_size],
            mean=0.0,
            stddev=0.1
        ),
        collections=collections
    )
    
    b = tf.get_variable(
        name="b",
        initializer=tf.constant(
            value=0.1,
            shape=[out_size]
        ),
        collections=collections
    )
    
    return non_linearity( tf.matmul(input_tensor, W) + b )

# fc test
fc_test_template = tf.make_template("fc_test_template", fc, in_size=2, out_size=2)
op = fc_test_template([[0., 1.]])
tf.initialize_all_variables().run()
print op.eval()

def fc_stack(input_tensor, list_of_sizes, collection=None):
    result = input_tensor
    
    for layer_index in xrange(len(list_of_sizes)-1):
        
        #I think this is the wrong way to do it:
#         current_layer = tf.make_template(
#             "fc"+str(layer_index),
#             fc,
#             in_size=list_of_sizes[layer_index],
#             out_size=list_of_sizes[layer_index+1],
#             collection=collection
#         )
        
#         result = current_layer(result)

        with tf.variable_scope("layer"+str(layer_index)):
        
            if layer_index == len(list_of_sizes)-2:
                non_linearity = tf.identity
            else:
                non_linearity = tf.nn.relu
        
            result = fc(
                result,        
                in_size=list_of_sizes[layer_index],
                out_size=list_of_sizes[layer_index+1],
                collection=collection,
                non_linearity=non_linearity
            )
        
    return result

# fc_stack test
fc_stack_test_template = tf.make_template("fc_stack_test_template", fc_stack, list_of_sizes=[2, 3, 2])
op = fc_stack_test_template([[0., 1.]])
tf.initialize_all_variables().run()
print op.eval()

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def makePlaceholder(dtype=tf.float32, shape=None, name=None):
    newPlaceholder = tf.placeholder(dtype, shape, name)
    print name + ":", newPlaceholder
    return newPlaceholder

def one_hot(index, name="one_hot"):
    return tf.one_hot(indices=index, depth=env.observation_space.n, on_value=1, off_value=0, axis=None, name=name).eval()

def getFeedDict(observation):
    if type(env.observation_space) == gym.spaces.discrete.Discrete:
        return {x:[one_hot(observation)]}
    elif type(env.observation_space) == gym.spaces.box.Box:
        return {x:[observation]}
    else:
        print "ERR"

[[ 0.074  0.164]]
[[ 0.143  0.108]]


Choose an environment

In [4]:
# env = gym.make('ConfirmationBiasEasy-v0')
# env = gym.make('FrozenLake-v0')
# env = gym.make('TwoRoundNondeterministicReward-v0')
# env = gym.make('CartPole-v0')
env = gym.make("Pendulum-v0")

print(env.action_space)
print(env.observation_space)

[2016-06-07 18:38:29,574] Making new env: Pendulum-v0


Box(1,)
Box(3,)


Generate some random data with which to build our env model

In [31]:
oldObservations = []
actions = []
observations = []
rewards = []
dones = []
initialObservations = []

In [36]:
for i_episode in range(50):
    observation = env.reset()
    observation = observation.astype(np.float32)
    initialObservations.append(observation)
    
    for timestep in xrange(50):
        env.render()
        
        action = env.action_space.sample() #choose a random action
#         action = agent(tf.expand_dims(observation, 0)).eval()[0]
        
        oldObservation = observation
        
        observation, reward, done, info = env.step(action)
        observation = observation.astype(np.float32)
        
        oldObservations.append(oldObservation)
        actions.append(action)
        observations.append(observation)
        rewards.append(reward)
        dones.append(done)
        
        if done:
            print("Episode finished after {} timesteps".format(timestep+1))
            break

In [6]:
with tf.name_scope('envModel'):
    envModelInsData = np.concatenate((np.asarray(oldObservations), np.asarray(actions)), 1)
    print "envModelInsData.shape:", envModelInsData.shape
    
    envModelOutsData = np.concatenate((np.asarray(observations), np.asarray([rewards]).T, np.asarray([dones]).T), 1)
    print "envModelOutsData.shape:", envModelOutsData.shape
    

envModel = tf.make_template(
    "envModel",
    fc_stack,
    list_of_sizes=[envModelInsData.shape[1], 10, 10, envModelOutsData.shape[1]],
    collection="envModelVars"
)

envModelPredictedObservation = tf.make_template(
    'envModelPredictedObservation',
    lambda envModelPredictedOuts: tf.slice(
        envModelPredictedOuts,
        begin=[0, 0],
        size=[-1, envModelOutsData.shape[1]-2],
        name="envModelPredictedDone"
    )
)

envModelPredictedReward = tf.make_template(
    'envModelPredictedReward',
    lambda envModelPredictedOuts: tf.slice(
        envModelPredictedOuts,
        begin=[0, envModelOutsData.shape[1]-2],
        size=[-1, 1],
        name="envModelPredictedDone"
    )    
)

envModelPredictedDone = tf.make_template(
    'envModelPredictedDone',
    lambda envModelPredictedOuts: tf.slice(
        envModelPredictedOuts,
        begin=[0, envModelOutsData.shape[1]-1],
        size=[-1, 1],
        name="envModelPredictedDone"
    )
)

with tf.name_scope('envModel'):
    envModelIns = makePlaceholder(shape=[None, envModelInsData.shape[1]], name='ins')
    envModelOuts = makePlaceholder(shape=[None, envModelOutsData.shape[1]], name='outs')
    envModelPredictedOuts = envModel(envModelIns)
    print "envModelPredictedOuts:", envModelPredictedOuts

    envModelLoss = tf.reduce_mean(tf.square(tf.sub(envModelPredictedOuts, envModelOuts)))
    print "Loss:", envModelLoss

envModelInsData.shape: (2500, 4)
envModelOutsData.shape: (2500, 5)
ins: Tensor("envModel_1/ins:0", shape=(?, 4), dtype=float32)
outs: Tensor("envModel_1/outs:0", shape=(?, 5), dtype=float32)
envModelPredictedOuts: Tensor("envModel_1/envModel/layer2/Identity:0", shape=(?, 5), dtype=float32)
Loss: Tensor("envModel_1/Mean:0", shape=(), dtype=float32)


In [7]:
agent = tf.make_template(
    "agent",
    fc_stack,
    list_of_sizes=[3, 10, 10, 1],
    collection="agentVars"
)

In [8]:
probNotDone = tf.Variable(tf.constant(1.), trainable=False, name="probNotDone")
totalPredictedReward = tf.Variable(tf.constant(0.), trainable=False, name="totalPredictedReward")

envModelPredictedObservation = np.asarray(initialObservations)

for round in xrange(50): # 200
    action = agent(envModelPredictedObservation)
    
    envModelPredictedOuts = envModel(tf.concat(1, [envModelPredictedObservation, action]))
    
    rewardFromThisRound = tf.mul(envModelPredictedReward(envModelPredictedOuts), probNotDone)
    totalPredictedReward = tf.add(totalPredictedReward, rewardFromThisRound)
    probNotDone *= 1 - envModelPredictedDone(envModelPredictedOuts)
    
agentLoss = -tf.reduce_mean(totalPredictedReward)

Now we actually are going to run and train things

In [18]:
envModelOptimizer = {}
# envModelOptimizer['AdagradOptimizer'] = tf.train.AdagradOptimizer(.01).minimize(envModelLoss, var_list=tf.get_collection("envModelVars"))
# envModelOptimizer['AdadeltaOptimizer'] = tf.train.AdadeltaOptimizer(.01).minimize(envModelLoss, var_list=tf.get_collection("envModelVars"))
# envModelOptimizer['GradientDescentOptimizer'] = tf.train.GradientDescentOptimizer(.01).minimize(envModelLoss, var_list=tf.get_collection("envModelVars"))
# envModelOptimizer['MomentumOptimizer'] = tf.train.MomentumOptimizer(.01, momentum=0.1).minimize(envModelLoss, var_list=tf.get_collection("envModelVars"))
envModelOptimizer['AdamOptimizer'] = tf.train.AdamOptimizer(.01).minimize(envModelLoss, var_list=tf.get_collection("envModelVars")) #BEST
# envModelOptimizer['FtrlOptimizer'] = tf.train.FtrlOptimizer(.01).minimize(envModelLoss, var_list=tf.get_collection("envModelVars"))
# envModelOptimizer['RMSPropOptimizer'] = tf.train.RMSPropOptimizer(.01).minimize(envModelLoss, var_list=tf.get_collection("envModelVars")) # GOOD

agentOptimizer = {}
agentOptimizer['AdagradOptimizer'] = tf.train.AdagradOptimizer(.01).minimize(agentLoss, var_list=tf.get_collection("agentVars"))
agentOptimizer['AdadeltaOptimizer'] = tf.train.AdadeltaOptimizer(.01).minimize(agentLoss, var_list=tf.get_collection("agentVars"))
agentOptimizer['GradientDescentOptimizer'] = tf.train.GradientDescentOptimizer(.01).minimize(agentLoss, var_list=tf.get_collection("agentVars"))
agentOptimizer['MomentumOptimizer'] = tf.train.MomentumOptimizer(.01, momentum=0.1).minimize(agentLoss, var_list=tf.get_collection("agentVars"))
agentOptimizer['AdamOptimizer'] = tf.train.AdamOptimizer(.01).minimize(agentLoss, var_list=tf.get_collection("agentVars")) #BEST
agentOptimizer['FtrlOptimizer'] = tf.train.FtrlOptimizer(.01).minimize(agentLoss, var_list=tf.get_collection("agentVars"))
agentOptimizer['RMSPropOptimizer'] = tf.train.RMSPropOptimizer(.01).minimize(agentLoss, var_list=tf.get_collection("agentVars")) # GOOD

In [19]:
tf.initialize_all_variables().run()

In [41]:
print "Training the environment model on " + str(len(observations)) + " observations"

for i in xrange(3000):
    miniBatchIndices = np.random.randint(envModelInsData.shape[0], size=100)
    
    envModelOptimizer['AdamOptimizer'].run(feed_dict={
        envModelIns: envModelInsData[miniBatchIndices, :],
        envModelOuts: envModelOutsData[miniBatchIndices, :]
    })
    
    if i % 100 is 0:
        print "envModelLoss:", envModelLoss.eval(feed_dict={
            envModelIns: envModelInsData,
            envModelOuts: envModelOutsData
        })

Training the environment on 10100 observations
envModelLoss: 0.0104267
envModelLoss: 0.00661189
envModelLoss: 0.00665769
envModelLoss: 0.00580353
envModelLoss: 0.00672594
envModelLoss: 0.00753446
envModelLoss: 0.00878808
envModelLoss: 0.00742256
envModelLoss: 0.00629922
envModelLoss: 0.00627531
envModelLoss: 0.00615674
envModelLoss: 0.00612343
envModelLoss: 0.00618922
envModelLoss: 0.00591131
envModelLoss: 0.00693076
envModelLoss: 0.00839876
envModelLoss: 0.00694833
envModelLoss: 0.0101185
envModelLoss: 0.00665616
envModelLoss: 0.00583771
envModelLoss: 0.00628071
envModelLoss: 0.00652909
envModelLoss: 0.00566679
envModelLoss: 0.00566707
envModelLoss: 0.00661901
envModelLoss: 0.00576583
envModelLoss: 0.00700021
envModelLoss: 0.00661019
envModelLoss: 0.00606961
envModelLoss: 0.007385


In [23]:
for iteration in xrange(300):
    agentOptimizer['AdamOptimizer'].run()
    
    if iteration % 10 is 0:
        print agentLoss.eval()

127.394
126.924
125.793
123.738
123.387
123.016
122.99
122.941
122.917
122.901
122.885
122.861
122.824
122.787
122.739
122.665
122.573
122.481
122.386
122.293
122.211
122.141
122.078
122.025
121.986
121.961
121.943
121.92
121.902
121.882


In [25]:
print totalPredictedReward.eval()

[[-111.216]
 [ -68.236]
 [-183.337]
 [ -55.46 ]
 [-244.474]
 [-282.559]
 [  10.617]
 [ -46.66 ]
 [   3.585]
 [-354.991]
 [  -9.498]
 [-139.527]
 [   8.546]
 [-473.339]
 [ -37.082]
 [  -3.563]
 [-258.51 ]
 [-208.396]
 [-423.167]
 [ -42.212]
 [-100.861]
 [   9.822]
 [ -41.376]
 [-222.225]
 [   9.889]
 [  -3.838]
 [-416.09 ]
 [-154.02 ]
 [-301.721]
 [ -27.022]
 [-183.664]
 [ -52.556]
 [ -62.272]
 [ -65.923]
 [   6.224]
 [  -3.022]
 [-198.112]
 [ -43.505]
 [ -99.217]
 [-119.066]
 [ -28.63 ]
 [  -0.719]
 [-103.654]
 [-262.513]
 [ -55.13 ]
 [  -9.855]
 [-100.202]
 [ -58.303]
 [-446.443]
 [ -39.749]]


In [24]:
saver = tf.train.Saver(envModelWs + envModelbs)
saver.save(sess, "envModelVars")

NameError: name 'envModelWs' is not defined

In [None]:
saver.restore(sess, "envModelVars")

In [30]:
oldObservations

[array([ 0.968, -0.252, -0.478], dtype=float32),
 array([ 0.954, -0.298, -0.967], dtype=float32),
 array([ 0.93 , -0.369, -1.491], dtype=float32),
 array([ 0.887, -0.461, -2.034], dtype=float32),
 array([ 0.821, -0.571, -2.579], dtype=float32),
 array([ 0.718, -0.696, -3.221], dtype=float32),
 array([ 0.564, -0.826, -4.042], dtype=float32),
 array([ 0.344, -0.939, -4.962], dtype=float32),
 array([ 0.053, -0.999, -5.966], dtype=float32),
 array([-0.293, -0.956, -7.015], dtype=float32)]