Imports

In [1]:
import gym
import tensorflow as tf
import numpy as np
np.set_printoptions(precision=3)
import random
import datetime
import itertools

import matplotlib
#matplotlib.use("Pdf")
import matplotlib.pyplot as plt
%matplotlib inline

My own little library of helper functions!

In [56]:
def weight_variable(shape, collections=None, name=None):
    """Create a weight variable initialised from a truncated normal.

    Args:
        shape: Shape of the variable, passed to ``tf.truncated_normal``.
        collections: Optional list of graph collection keys, forwarded
            unchanged to ``tf.Variable``.
        name: Optional variable name; also used as the label of the debug
            print. May be left as ``None``.

    Returns:
        The newly created ``tf.Variable``.
    """
    initial = tf.truncated_normal(shape, mean=0.0, stddev=0.1)
    newVar = tf.Variable(initial, collections=collections, name=name)
    # %-formatting tolerates name=None (string concatenation with None raises
    # TypeError) and still prints the usual "name: variable" line.
    print("%s: %s" % (name, newVar))
    return newVar

def bias_variable(shape, collections=None, name=None):
    """Create a bias variable initialised to zeros.

    Args:
        shape: Shape of the variable, passed to ``tf.constant``.
        collections: Optional list of graph collection keys, forwarded
            unchanged to ``tf.Variable``.
        name: Optional variable name; also used as the label of the debug
            print. May be left as ``None``.

    Returns:
        The newly created ``tf.Variable``.
    """
    initial = tf.constant(0.0, shape=shape)
    newVar = tf.Variable(initial, collections=collections, name=name)
    # %-formatting tolerates name=None (string concatenation with None raises
    # TypeError) and still prints the usual "name: variable" line.
    print("%s: %s" % (name, newVar))
    return newVar

def conv2d(x, W):
    """Convolve ``x`` with filter ``W`` using unit strides and 'SAME' padding."""
    unitStrides = [1] * 4
    return tf.nn.conv2d(x, W, strides=unitStrides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool ``x`` with a 2x2 window, stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    # The stride equals the window, so pooling regions do not overlap.
    return tf.nn.max_pool(x, ksize=window, strides=list(window), padding='SAME')

def makePlaceholder(dtype=tf.float32, shape=None, name=None):
    """Create a ``tf.placeholder`` and print it for inspection.

    Args:
        dtype: Element type of the placeholder (defaults to ``tf.float32``).
        shape: Optional shape, forwarded to ``tf.placeholder``.
        name: Optional name; also used as the label of the debug print.
            May be left as ``None``.

    Returns:
        The newly created placeholder.
    """
    newPlaceholder = tf.placeholder(dtype, shape, name)
    # %-formatting tolerates name=None (string concatenation with None raises
    # TypeError) and still prints the usual "name: placeholder" line.
    print("%s: %s" % (name, newPlaceholder))
    return newPlaceholder

# Example usage (observation has 3 features, the layer has 1 output):
# with tf.name_scope("agent"):
#     with tf.name_scope("layer1"):
#         action = fc(observation, 3, 1)
def fc(inputTensor, inNeurons, outNeurons, extraCollection=None):
    """Build one fully connected ReLU layer: ``relu(inputTensor . W + b)``.

    Args:
        inputTensor: Input to the layer; it is right-multiplied by ``W``.
        inNeurons: Number of rows of the weight matrix ``W``.
        outNeurons: Number of columns of ``W`` and length of the bias ``b``.
        extraCollection: Optional extra graph collection key under which
            ``W`` and ``b`` are registered, in addition to
            ``tf.GraphKeys.VARIABLES``.

    Returns:
        The ReLU-activated output of the layer.
    """
    extraKeys = [] if extraCollection is None else [extraCollection]
    collections = [tf.GraphKeys.VARIABLES] + extraKeys

    weights = weight_variable([inNeurons, outNeurons], collections=collections, name="W")
    biases = bias_variable([outNeurons], collections=collections, name="b")

    preActivation = tf.matmul(inputTensor, weights) + biases
    return tf.nn.relu(preActivation)

def fc_stack(inputTensor, listOfLayerSizes, extraCollection=None):
    """Chain fully connected layers, one per consecutive pair of sizes.

    Args:
        inputTensor: Input to the first layer.
        listOfLayerSizes: Layer widths, input width first; entries ``k`` and
            ``k + 1`` are the in/out sizes of layer ``k``.
        extraCollection: Optional extra collection key forwarded to ``fc``.

    Returns:
        The output of the last layer, or ``inputTensor`` itself when fewer
        than two sizes are given.
    """
    sizePairs = zip(listOfLayerSizes[:-1], listOfLayerSizes[1:])

    output = inputTensor
    for depth, (fanIn, fanOut) in enumerate(sizePairs):
        # Each layer gets its own name scope: "layer0", "layer1", ...
        with tf.name_scope("layer" + str(depth)):
            output = fc(output, fanIn, fanOut, extraCollection)

    return output

In [57]:
# Placeholder for a batch of 3-feature observations.
observation = makePlaceholder(shape=[None, 3], name="observation")

with tf.name_scope("agent"):
    # Register the agent's weights under "agentVariables": that is the
    # collection key the agent optimizers read with tf.get_collection(...).
    # Under any other key their var_list would come back empty.
    action = fc_stack(observation, [3, 10, 1], extraCollection="agentVariables")

observation: Tensor("observation_10:0", shape=(?, 3), dtype=float32)
W: <tensorflow.python.ops.variables.Variable object at 0x11fb48d50>
b: <tensorflow.python.ops.variables.Variable object at 0x11fb09090>
W: <tensorflow.python.ops.variables.Variable object at 0x11fb69c90>
b: <tensorflow.python.ops.variables.Variable object at 0x11a66ca90>


Create the session. An interactive session installs itself as the default session, so `.eval()` and `.run()` can be called without passing a session explicitly.

In [49]:
# Kept in `sess` so that later cells (e.g. the Saver) can pass it explicitly.
sess = tf.InteractiveSession()

Exception AssertionError: AssertionError() in <bound method InteractiveSession.__del__ of <tensorflow.python.client.session.InteractiveSession object at 0x11a1e41d0>> ignored


Choose an environment

In [4]:
# Alternative environments; exactly one gym.make line should be active.
# env = gym.make('ConfirmationBiasEasy-v0')
# env = gym.make('FrozenLake-v0')
# env = gym.make('TwoRoundNondeterministicReward-v0')
# env = gym.make('CartPole-v0')
env = gym.make("Pendulum-v0")

# Show the action and observation space descriptors of the chosen environment.
print(env.action_space)
print(env.observation_space)

[2016-06-07 09:35:33,299] Making new env: Pendulum-v0


Box(1,)
Box(3,)


Helpers for one-hot encoding observations and building feed dicts (currently unused)

In [5]:
def one_hot(index, name="one_hot"):
    """Return the evaluated one-hot encoding of ``index``.

    The encoding depth is ``env.observation_space.n``. The op is built with
    ``tf.one_hot`` and evaluated immediately with ``.eval()``.

    Args:
        index: Index (or indices) to encode.
        name: Name given to the ``tf.one_hot`` op.
    """
    numStates = env.observation_space.n
    encoded = tf.one_hot(
        indices=index,
        depth=numStates,
        on_value=1,
        off_value=0,
        axis=None,
        name=name
    )
    return encoded.eval()

In [6]:
def getFeedDict(observation):
    """Build a feed dict that feeds a single observation to placeholder ``x``.

    Discrete observation spaces are one-hot encoded first; Box observations
    are fed as they are. In both cases the observation is wrapped in a list,
    giving a batch of one.

    NOTE(review): ``x`` is not defined in the visible part of the notebook;
    it is presumably the network's input placeholder. Confirm before use.

    Args:
        observation: Observation as returned by the environment.

    Returns:
        A dict mapping ``x`` to a one-element batch.

    Raises:
        TypeError: If the observation space is neither Discrete nor Box.
    """
    space = env.observation_space
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if isinstance(space, gym.spaces.discrete.Discrete):
        return {x: [one_hot(observation)]}
    if isinstance(space, gym.spaces.box.Box):
        return {x: [observation]}
    # Fail loudly: returning None here would only surface later as a confusing
    # error wherever the feed dict is consumed.
    raise TypeError("Unsupported observation space: %r" % (space,))

Generate some random data with which to build our env model

In [7]:
# Transition buffers: entry k of each list belongs to the same environment step.
oldObservations, actions, observations = [], [], []
rewards, dones = [], []
# First observation of every episode.
initialObservations = []

for i_episode in range(50):
    observation = env.reset().astype(np.float32)
    initialObservations.append(observation)

    for timestep in xrange(50):
        env.render()

        # Purely random exploration (an epsilon-greedy switch is not wired in yet).
        action = env.action_space.sample()

        # Remember the state the action was taken in before it is overwritten.
        oldObservation = observation
        observation, reward, done, info = env.step(action)
        observation = observation.astype(np.float32)

        oldObservations.append(oldObservation)
        actions.append(action)
        observations.append(observation)
        rewards.append(reward)
        dones.append(done)

        if done:
            print("Episode finished after {} timesteps".format(timestep+1))
            break

In [None]:
print "\n=== data ==="
envModelInsData = np.concatenate((np.asarray(oldObservations), np.asarray(actions)), 1)
print "envModelInsData.shape:", envModelInsData.shape
envModelOutsData = np.concatenate((np.asarray(observations), np.asarray([rewards]).T, np.asarray([dones]).T), 1)
print "envModelOutsData.shape:", envModelOutsData.shape
envModelInitialObservationData = np.asarray(initialObservations)
print "envModelInitialObservationData.shape:", envModelInitialObservationData.shape

print "\n=== placeholders ==="
envModelIns = makePlaceholder(shape=[None, envModelInsData.shape[1]], name="envModelIns")
envModelOuts = makePlaceholder(shape=[None, envModelOutsData.shape[1]], name="envModelOuts")
envModelInitialObservationOuts = makePlaceholder(shape=[None, 3], name="envModelInitialObservationOuts")

print "\n=== env variables ==="
envVariablesCollections = [tf.GraphKeys.VARIABLES, "envVariables"]

layerSizes = [envModelInsData.shape[1], 10, 10, envModelOutsData.shape[1]]

envModelWs = [weight_variable(
    [layerSizes[i], layerSizes[i+1]],
    collections=envVariablesCollections,
    name="envModelW" + str(i))
              for i in xrange(3)]

envModelbs = [bias_variable(
    [layerSizes[i+1]],
    collections=envVariablesCollections,
    name="envModelb" + str(i))
              for i in xrange(3)]

envModelInitialObservationb = bias_variable(
    [envModelOutsData.shape[1] - 2],
    collections=envVariablesCollections,
    name="envModelInitialObservationb")

print "\n === env model internals ==="
def envModel(inputs=None):
    """Apply the learned environment model, or sample a recorded start state.

    Args:
        inputs: Tensor fed through the layers defined by ``envModelWs`` and
            ``envModelbs``. When ``None``, no graph is built.

    Returns:
        With ``inputs``: a 4-tuple ``(allOutputs, observation, reward, done)``
        where the last three are column slices of ``allOutputs``: every
        column except the last two, the second-to-last column, and the last
        column respectively.
        Without ``inputs``: a random element of ``initialObservations``.
    """
    if inputs is None:
        return random.choice(initialObservations)

    intermediate = inputs
    lastLayer = len(envModelWs) - 1
    for i, (W, b) in enumerate(zip(envModelWs, envModelbs)):
        intermediate = tf.matmul(intermediate, W) + b
        # ReLU on hidden layers only; the output layer stays linear.
        # Compared with != because `is` tests identity, not equality.
        if i != lastLayer:
            intermediate = tf.nn.relu(intermediate)

    envModelPredictedOuts = intermediate
    print("envModelPredictedOuts: %s" % (envModelPredictedOuts,))

    # Output column layout: [observation..., reward, done].
    numColumns = envModelOutsData.shape[1]
    # Each slice carries its own op name.
    predictedObservation = tf.slice(envModelPredictedOuts,
                                    begin=[0, 0],
                                    size=[-1, numColumns - 2],
                                    name="envModelPredictedObservation")
    predictedReward = tf.slice(envModelPredictedOuts,
                               begin=[0, numColumns - 2],
                               size=[-1, 1],
                               name="envModelPredictedReward")
    predictedDone = tf.slice(envModelPredictedOuts,
                             begin=[0, numColumns - 1],
                             size=[-1, 1],
                             name="envModelPredictedDone")

    return envModelPredictedOuts, predictedObservation, predictedReward, predictedDone
            
# Build the model on the training placeholder and unpack its four outputs.
envModelPredictedOuts, \
    envModelPredictedObservation, \
    envModelPredictedReward, \
    envModelPredictedDone = envModel(envModelIns)

# Mean squared error between the full predicted output and the target rows.
envModelLoss = tf.reduce_mean(tf.square(tf.sub(envModelPredictedOuts, envModelOuts)))
print "envModelLoss:", envModelLoss

# Called without inputs, envModel returns a randomly chosen recorded initial
# observation rather than something computed from the model variables.
envModelPredictedInitialObservation = envModel()
print "envModelPredictedInitialObservation:", envModelPredictedInitialObservation

# Mean squared error between that sampled observation and the placeholder.
envModelInitialObservationLoss = tf.reduce_mean(tf.square(tf.sub(envModelPredictedInitialObservation, envModelInitialObservationOuts)))
print "envModelInitialObservationLoss:", envModelInitialObservationLoss

In [None]:
# Running factor built from the model's "done" predictions: it starts at 1 and
# is multiplied by (1 - predicted done) after every simulated step.
probNotDone = tf.Variable(tf.constant(1.), trainable=False, name="probNotDone")
# Accumulated predicted reward, each step weighted by probNotDone.
totalPredictedReward = tf.Variable(tf.constant(0.), trainable=False, name="totalPredictedReward")

# Start the simulated rollout from all recorded initial observations at once.
envModelPredictedObservation = np.asarray(initialObservations)

# NOTE(review): getAction is not defined in the visible part of the notebook.
# The loop variable is deliberately not called `round`, which would shadow
# the builtin round() in every later cell.
for rolloutStep in xrange(50): # 200
    action = getAction(envModelPredictedObservation)

    # Feed [observation | action] through the env model to get the next step.
    envModelPredictedOuts, envModelPredictedObservation, envModelPredictedReward, envModelPredictedDone = envModel(tf.concat(1, [envModelPredictedObservation, action]))

    rewardFromThisRound = tf.mul(envModelPredictedReward, probNotDone)
    totalPredictedReward = tf.add(totalPredictedReward, rewardFromThisRound)
    probNotDone *= 1 - envModelPredictedDone

# Minimising the negated mean predicted reward maximises the predicted reward.
agentLoss = -tf.reduce_mean(totalPredictedReward)

Now we actually run and train things.

In [None]:
# Training op for the environment model, keyed by optimizer class name.
# Alternatives tried at learning rate .01 on envModelLoss with the
# "envVariables" collection: Adagrad, Adadelta, GradientDescent,
# Momentum (momentum=0.1), Ftrl, RMSProp (GOOD). Adam was BEST.
envModelOptimizer = {}
envModelOptimizer['AdamOptimizer'] = tf.train.AdamOptimizer(.01).minimize(
    envModelLoss, var_list=tf.get_collection("envVariables"))

# (name, factory) pairs in creation order. Each factory is only called inside
# the loop, so every optimizer is built immediately before its minimize op.
agentOptimizerFactories = [
    ('AdagradOptimizer', lambda: tf.train.AdagradOptimizer(.01)),
    ('AdadeltaOptimizer', lambda: tf.train.AdadeltaOptimizer(.01)),
    ('GradientDescentOptimizer', lambda: tf.train.GradientDescentOptimizer(.01)),
    ('MomentumOptimizer', lambda: tf.train.MomentumOptimizer(.01, momentum=0.1)),
    ('AdamOptimizer', lambda: tf.train.AdamOptimizer(.01)),  # BEST
    ('FtrlOptimizer', lambda: tf.train.FtrlOptimizer(.01)),
    ('RMSPropOptimizer', lambda: tf.train.RMSPropOptimizer(.01)),  # GOOD
]

# Training ops for the agent, keyed by optimizer class name.
agentOptimizer = {}
for optimizerName, makeOptimizer in agentOptimizerFactories:
    agentOptimizer[optimizerName] = makeOptimizer().minimize(
        agentLoss, var_list=tf.get_collection("agentVariables"))

In [None]:
# Run the initializer op for all graph variables; .run() without arguments
# relies on a default session being installed.
tf.initialize_all_variables().run()

In [None]:
for i in xrange(3000):
    # Draw 100 random row indices (with replacement) for this mini-batch.
    miniBatchIndices = np.random.randint(envModelInsData.shape[0], size=100)

    envModelOptimizer['AdamOptimizer'].run(feed_dict={
        envModelIns: envModelInsData[miniBatchIndices, :],
        envModelOuts: envModelOutsData[miniBatchIndices, :]
    })

    # Report the loss on the full data set every 100 steps. Uses == because
    # `is` compares object identity and only works for small ints by accident.
    if i % 100 == 0:
        print(envModelLoss.eval(feed_dict={
            envModelIns: envModelInsData,
            envModelOuts: envModelOutsData
        }))

# for i in xrange(20):
#     tf.train.GradientDescentOptimizer(0.5).minimize(envModelInitialObservationLoss, var_list=tf.get_collection("envVariables")).run(feed_dict={
#         envModelInitialObservationOuts: envModelInitialObservationData 
#     })

In [None]:
# Compare the env model's one-step prediction with the real environment,
# starting from a fresh reset and taking action 0.
startingState = env.reset().astype(np.float32)
print "starting state:", startingState
print

# Model input is the row [startingState | 0.], expanded to a batch of one.
# This rebinds the envModelPredicted* names to the newly built tensors.
envModelPredictedOuts, envModelPredictedObservation, envModelPredictedReward, envModelPredictedDone = envModel(tf.expand_dims(tf.concat(0, [startingState, [0.]]), 0))

print
print "predicted:", envModelPredictedOuts.eval()
# Take the same action (0) in the real environment.
actual = env.step(np.asarray([0]))
observation, reward, done, info = actual
print "actual:", actual
# print envModelLoss.eval(feed_dict={
#         envModelPredictedOuts: envModelPredictedOuts.eval(),
#         envModelOuts: [list(observation) + [reward] + [done]]})

# Mean squared error between the prediction and [observation, reward, done];
# being the last expression of the cell, it is shown as the cell's output.
tf.reduce_mean(tf.square(tf.sub(envModelPredictedOuts.eval(), [list(observation) + [reward] + [done]]))).eval()

In [None]:
for iteration in xrange(300):
    # One Adam step on agentLoss; no feed dict is passed.
    agentOptimizer['AdamOptimizer'].run()

    # Report the loss every 10 steps. Uses == because `is` compares object
    # identity and only works for small ints by accident.
    if iteration % 10 == 0:
        print(agentLoss.eval())

In [None]:
# Saver restricted to the env-model weight and bias lists
# (envModelInitialObservationb is not part of either list).
saver = tf.train.Saver(envModelWs + envModelbs)
# Writes the checkpoint under the name "envModelVars".
saver.save(sess, "envModelVars")

In [None]:
# Load the env-model variables back from the "envModelVars" checkpoint.
saver.restore(sess, "envModelVars")

In [None]:
# Run the agent in the real environment for 10 episodes of at most 50 steps.
for i_episode in range(10):
    observation = env.reset()
    observation = observation.astype(np.float32)

    for timestep in xrange(50):
        env.render()

#         action = env.action_space.sample() #choose a random action
        print observation
        # NOTE(review): getAction is not defined in the visible part of the
        # notebook. It is called on a freshly built expand_dims tensor every
        # step, so this presumably adds new ops to the graph each time.
        # Confirm, and consider feeding a placeholder instead.
        action = getAction(tf.expand_dims(observation, 0)).eval()[0]
        print action

        observation, reward, done, info = env.step(action)
        observation = observation.astype(np.float32)

        if done:
            print("Episode finished after {} timesteps".format(timestep+1))
            break

In [None]:
# Second data set: same layout as the random-action buffers, but every step
# takes the constant action 0.
oldObservations2 = []
actions2 = []
observations2 = []
rewards2 = []
dones2 = []
initialObservations2 = []

for i_episode in range(50):
    observation = env.reset()
    observation = observation.astype(np.float32)
    initialObservations2.append(observation)

    for timestep in xrange(50):
        env.render()

        # Record exactly the action that is passed to env.step. It is a
        # one-element array, the same form the environment is stepped with,
        # so actions2 stacks into one row per step.
        action = np.asarray([0])

        oldObservation = observation

        observation, reward, done, info = env.step(action)
        observation = observation.astype(np.float32)

        oldObservations2.append(oldObservation)
        actions2.append(action)
        observations2.append(observation)
        rewards2.append(reward)
        dones2.append(done)

        if done:
            print("Episode finished after {} timesteps".format(timestep+1))
            break