In [32]:
import gym
import tensorflow as tf
import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [2]:
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("Step: %d %s" % (step, info))
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(plt.gcf())

### Open AI Gym

In [3]:
env = gym.make('CartPole-v0')

# Run a demo of the environment
obs = env.reset()
img = env.render(mode="rgb_array")
img.shape

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


(400, 600, 3)

In [4]:
env.action_space

Discrete(2)

In [5]:
action = 1
obs, reward, done, info = env.step(action)
print(obs)
print(reward)
print(done)
print(info)

[ 0.03150874  0.20358672 -0.00907854 -0.31793274]
1.0
False
{}


### Simple Basic Policy

In [6]:
def basic_policy(obs):
    angle = obs[2]
    # accelerate left if the pole is leaning left, and vice versa
    return 0 if angle < 0 else 1

In [7]:
totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(1000): # 1000 steps max
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
#         show_state(env, step, info)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

In [8]:
np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

(42.122, 8.542547395244583, 24.0, 72.0)

### Neural Network Policy

In [9]:
n_inputs = 4 # == env.observation_space.shape[0]
n_hidden = 4
n_outputs = 1 # probability of acclerating left

learning_rate = 0.01
initializer = tf.contrib.layers.variance_scaling_initializer()

In [10]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu,
                         kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs,
                         kernel_initializer=initializer)
outputs = tf.nn.sigmoid(logits)

# select action out of probabilities [left, right]
p_left_and_right = tf.concat(axis=1, values=[outputs, 1-outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

### REINFORCE Algorithm

In [11]:
y = 1. - tf.to_float(action)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]

gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))

training_op = optimizer.apply_gradients(grads_and_vars_feed)

In [12]:
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean)/reward_std
            for discounted_rewards in all_discounted_rewards]

In [13]:
n_iterations = 250
n_max_steps = 1000
n_games_per_update = 10 # update policy every n episodes
save_iterations = 10
discount_rate = 0.95

In [47]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        all_rewards = [] # all sequences of raw rewards for each episode
        all_gradients = [] # gradients saved at each step of each episode
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run([action, gradients],
                                                    feed_dict = {X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)
        
        # After playing n episodes, we are ready to update our policy
        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
        feed_dict = {}
        for var_index, grad_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                    for game_index, rewards in enumerate(all_rewards)
                    for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "rl_model/my_policy_net_pg.ckpt")

In [15]:
def pg_policy(sess, obs):
    return sess.run(action, feed_dict={X: obs.reshape(1, n_inputs)})[0][0]

In [17]:
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, "rl_model/my_policy_net_pg.ckpt")
    
    totals = []
    for episode in range(500):
        episode_rewards = 0
        obs = env.reset()
        for step in range(1000): # 1000 steps max
            fitted_action = pg_policy(sess, obs)
            obs, reward, done, info = env.step(fitted_action)
    #         show_state(env, step, info)
            episode_rewards += reward
            if done:
                break
        totals.append(episode_rewards)
    
print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))

INFO:tensorflow:Restoring parameters from rl_model/my_policy_net_pg.ckpt
184.298 23.88809737086652 33.0 200.0


### Markov Decision Process

In [18]:
nan = np.nan

In [21]:
T = np.array([ # shape=[s, a, s']
    [[.7, .3, .0], [1., .0, .0], [.8, .2, .0]],
    [[.0, 1., .0], [nan, nan, nan], [.0, .0, 1.]],
    [[nan, nan, nan], [.8, .1, .1], [nan, nan, nan]],
])
R = np.array([ # shape = [s, a, s']
    [[10., .0, .0], [.0, .0, .0], [.0, .0, .0]],
    [[10., .0, .0], [nan, nan, nan], [.0, .0, -50.]],
    [[nan, nan, nan], [40., .0, .0], [nan, nan, nan]],
])

possible_actions = [[0, 1, 2], [0, 2], [1]]

### Q-Value Iteration Algorithm

In [30]:
Q = np.full((3, 3), -np.inf)
for state, actions in enumerate(possible_actions):
    # zero for possible actions, -inf otherwise
    Q[state, actions] = 0.0

learning_rate = .01
discount_rate = .95
n_iterations = 100

for iteration in range(n_iterations):
    Q_prev = Q.copy()
    for s in range(3):
        for a in possible_actions[s]:
            Q[s, a] = np.sum([
                T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp]))
                for sp in range(3) # sp is next state transitioned by action a
            ])

In [31]:
print(Q)

# Optimal actions for each state
# It varies when you change discount_rate
print(np.argmax(Q, axis=1))

[[21.88646117 20.79149867 16.854807  ]
 [ 1.10804034        -inf  1.16703135]
 [       -inf 53.8607061         -inf]]
[0 2 1]


### Q-Learning Algorithm (Off-Policy)

In [33]:
learning_rate0 = 0.05
learning_rate_decay = 0.1
n_iterations = 20000

In [34]:
s = 0 # start in state 0

Q = np.full((3, 3), -np.inf)
for state, actions in enumerate(possible_actions):
    Q[state, actions] = .0

for iteration in range(n_iterations):
    a = rnd.choice(possible_actions[s]) # choose action randomly
    sp = rnd.choice(range(3), p=T[s, a]) # pick next state using T[s, a]
    reward = R[s, a, sp]
    learning_rate = learning_rate0 / (1 + iteration * learning_rate_decay)
    Q[s, a] = learning_rate * Q[s, a] + (1 - learning_rate) * (
        reward + discount_rate * np.max(Q[sp])
    )
    s = sp # move to next state

In [35]:
print(Q)
print(np.argmax(Q, axis=1))

[[132.13378789 125.52709849 125.5270137 ]
 [ 77.93009839         -inf  85.70369039]
 [        -inf 137.70415792         -inf]]
[0 2 1]
