In [1]:
import gym
import tensorflow as tf
import numpy as np

In [2]:
env = gym.make("CartPole-v0")

In [3]:
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
print("obs = {}, n_acts = {}".format(obs_dim, n_acts))

obs = 4, n_acts = 2


In [4]:
obs = env.reset()
obs

array([ 0.02855869, -0.03398716, -0.01294668, -0.04741072])

In [5]:
def mpl(x, sizes, activation = tf.tanh, output_activation=None):

    for size in sizes:
        x = tf.layers.dense(x,units = size, activation = activation)
    
    return tf.layers.dense(x, units = sizes[-1], activation = output_activation)


In [6]:
#first layer of NN
obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
#creating NN
logit = mpl(obs_ph, sizes = [32,64]+[n_acts])
#array([[0.01685937, 0.00622761]], dtype=float32)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [7]:
max_action = tf.multinomial(logits=logit,num_samples=1)
#array([[0]])
action = tf.squeeze(max_action, axis=1)
#array([0])

Instructions for updating:
Use `tf.random.categorical` instead.


In [8]:
#loss
weights_ph = tf.placeholder(shape=(None,), dtype = tf.float32)
act_ph = tf.placeholder(shape=(None,), dtype = tf.int32)
#one_hot taken actions
action_masks = tf.one_hot(act_ph, n_acts)
#array([[0., 1.]...], dtype=float32)

#logprob(a|s)
log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logit), axis=1)
#array([-0.6952652,...], dtype=float32)
loss = -tf.reduce_mean(weights_ph* log_probs)
#9.812662

In [9]:
#optimizer
train_op = tf.train.AdamOptimizer(learning_rate = 0.01).minimize(loss)
#loss minimizes :D

In [10]:
#variables for initializing and using
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [11]:
max_action = sess.run(action, {obs_ph: obs.reshape(1,-1)})[0]
#get element from array 0 or 1

In [12]:
def reward_to_go(rews):
    n = len(rews)
    rtgs = np.zeros_like(rews)
    for i in reversed(range(n)):
        rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
    #returns [200. 199. ... 1.]
    return rtgs

In [13]:
def train_one_epoch(batch_size = 5000):
    
    batch_obs = []
    batch_acts = []
    batch_rets = []
    batch_lens = []
    batch_weights = [] #[[200.0, 199.0, ... 1.0, 200.0, ... 1.0]
    obs = env.reset()
    done = False
    ep_rews = []
    rendering_epoch = True
    
    while True:
        
        if rendering_epoch == True:
            pass
            #env.render() 
        batch_obs.append(obs.copy())
        
        act = sess.run(action, {obs_ph: obs.reshape(1,-1)})[0]
        
        obs, reward, done, _ = env.step(act)
        
        batch_acts.append(act)
        ep_rews.append(reward)
        
        if done:
            rendering_epoch = False
            #recording everything!
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)
            
            # the weight for each logprob(a_t|s_t) is reward-to-go from t
            batch_weights += list(reward_to_go(ep_rews))
            
            #reset vatiables
            obs, done, ep_rews = env.reset(), False, []
            
            if len(batch_obs)>batch_size:
                break
    
    if np.mean(batch_lens)<200:
        batch_loss, _ = sess.run([loss, train_op], {obs_ph: batch_obs,
                                             act_ph: batch_acts,
                                             weights_ph: batch_weights})
    else:
        batch_loss = 0
    
    return batch_loss, batch_rets, batch_lens

In [14]:
for i in range(500):
    batch_loss, batch_rets, batch_lens = train_one_epoch()
    print("#%i, batch_loss: %.3f, batch_rets: %.3f, batch_lens: %.3f" \
          %(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
    if batch_loss == 0:
        break


#0, batch_loss: 6.331, batch_rets: 15.075, batch_lens: 15.075
#1, batch_loss: 11.537, batch_rets: 26.781, batch_lens: 26.781
#2, batch_loss: 18.510, batch_rets: 46.229, batch_lens: 46.229
#3, batch_loss: 21.252, batch_rets: 57.563, batch_lens: 57.563
#4, batch_loss: 25.043, batch_rets: 68.320, batch_lens: 68.320
#5, batch_loss: 35.136, batch_rets: 106.562, batch_lens: 106.562
#6, batch_loss: 38.062, batch_rets: 132.737, batch_lens: 132.737
#7, batch_loss: 37.735, batch_rets: 129.923, batch_lens: 129.923
#8, batch_loss: 41.063, batch_rets: 153.121, batch_lens: 153.121
#9, batch_loss: 41.560, batch_rets: 157.344, batch_lens: 157.344
#10, batch_loss: 44.940, batch_rets: 169.233, batch_lens: 169.233
#11, batch_loss: 46.130, batch_rets: 178.483, batch_lens: 178.483
#12, batch_loss: 44.515, batch_rets: 189.037, batch_lens: 189.037
#13, batch_loss: 46.496, batch_rets: 197.462, batch_lens: 197.462
#14, batch_loss: 46.114, batch_rets: 197.538, batch_lens: 197.538
#15, batch_loss: 45.387, batch_

In [15]:
def play_episode():
    
    rewards = 0
    obs = env.reset()
    while True:
        env.render()
        act = sess.run(action, {obs_ph: obs.reshape(1,-1)})[0]
        obs, reward, is_done, _ = env.step(act)
        rewards += reward
        if is_done or rewards >200:
            break
    return rewards    

In [16]:
print(play_episode())

200.0


In [17]:
env.close()