# Reinforcement Learning

In [3]:
# Common imports
import numpy as np
import os
import sys

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# To plot pretty figures and animations
%matplotlib nbagg
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt

In [15]:
from PIL import Image, ImageDraw

try:
    from pyglet.gl import gl_info
    openai_cart_pole_rendering = True   # no problem, let's use OpenAI gym's rendering function
except Exception:
    openai_cart_pole_rendering = False  # probably no X server available, let's use our own rendering function

def render_cart_pole(env, obs):
    if openai_cart_pole_rendering:
        # use OpenAI gym's rendering function
        return env.render(mode="rgb_array")
    else:
        # rendering for the cart pole environment (in case OpenAI gym can't do it)
        img_w = 600
        img_h = 400
        cart_w = img_w // 12
        cart_h = img_h // 15
        pole_len = img_h // 3.5
        pole_w = img_w // 80 + 1
        x_width = 2
        max_ang = 0.2
        bg_col = (255, 255, 255)
        cart_col = 0x000000 # Blue Green Red
        pole_col = 0x669acc # Blue Green Red

        pos, vel, ang, ang_vel = obs
        img = Image.new('RGB', (img_w, img_h), bg_col)
        draw = ImageDraw.Draw(img)
        cart_x = pos * img_w // x_width + img_w // x_width
        cart_y = img_h * 95 // 100
        top_pole_x = cart_x + pole_len * np.sin(ang)
        top_pole_y = cart_y - cart_h // 2 - pole_len * np.cos(ang)
        draw.line((0, cart_y, img_w, cart_y), fill=0)
        draw.rectangle((cart_x - cart_w // 2, cart_y - cart_h // 2, cart_x + cart_w // 2, cart_y + cart_h // 2), fill=cart_col) # draw cart
        draw.line((cart_x, cart_y - cart_h // 2, top_pole_x, top_pole_y), fill=pole_col, width=pole_w) # draw pole
        return np.array(img)

def plot_cart_pole(env, obs):
    plt.close()  # or else nbagg sometimes plots in the previous cell
    img = render_cart_pole(env, obs)
    plt.imshow(img)
    plt.axis("off")
    plt.show()

In [24]:
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    plt.close()  # or else nbagg sometimes plots in the previous cell
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)

In [16]:
import gym

In [17]:
env = gym.make("CartPole-v0")

In [18]:
obs = env.reset()
obs

array([-0.03365718,  0.03153535,  0.0411582 ,  0.00254373])

In [19]:
env.action_space

Discrete(2)

In [54]:
obs = env.reset()

frames = []
for _ in range(1000):
    img = env.render(mode="rgb_array")
    frames.append(img)
    #env.render()
    env.step(env.action_space.sample())

env.close()

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m


In [12]:
action = 1  # acceleration to the right

In [15]:
obs, reward, done, info = env.step(action)
obs

array([-0.02946829,  0.19590867,  0.0243849 , -0.30499442])

In [16]:
reward

1.0

In [17]:
done  # returns True when the episode is finished

False

In [18]:
info

{}

In [19]:
def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1  # move to the left if angle < 0 - otherwise - move to the right

totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(1000):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

In [35]:
import numpy as np

In [23]:
for stat in (np.mean, np.std, np.min, np.max):
    print(stat(totals))

42.2
8.869949267047698
24.0
72.0


## Policy Gradient

In [2]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [4]:
import gym

In [8]:
env = gym.make("CartPole-v0")

In [5]:
env.observation_space.shape

(4,)

In [10]:
n_inputs = 4  # env.observation_space.shape[0]
n_hidden = 4
n_outputs = 1  # probability of the left acceleration
initializer = tf.contrib.layers.variance_scaling_initializer()

# Build a neural network
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs, kernel_initializer=initializer)
outputs = tf.nn.sigmoid(logits)  # to return probabilities from 0 to 1

# Select an action
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)  # [np.log(0.2), np.log(0.8), n_samples=1] 
                                                                  # return 0 with p=20% and 1 with p=80%

init = tf.global_variables_initializer()

In [7]:
frames = []
n_max_steps = 1000

with tf.Session() as sess:
    init.run()
    obs = env.reset()
    for step in range(n_max_steps):
        # img = render_cart_pole(env, obs)
        # frames.append(img)
        env.render()
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        if done:
            break
env.close()

Let's plot a better grapgh for our policy and train our model

In [5]:
reset_graph()

n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01


initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")

hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer, name="hidden")
logits = tf.layers.dense(hidden, n_outputs, kernel_initializer=initializer, name="logits")
outputs = tf.nn.sigmoid(logits, name="outputs")
p_left_and_right = tf.concat([outputs, 1 - outputs], axis=1)
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
grad_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grad_and_vars]
gradient_placeholders = []
grads_and_vars_feed = []
for i, (grad, variable) in enumerate(grad_and_vars):
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape(), name="grad%d" % i)
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

file_writer = tf.summary.FileWriter("../tf_logs/RL_graph", tf.get_default_graph())

In [6]:
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + discount_rate * cumulative_rewards
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards]

In [12]:
discount_rewards([10, 0, -50], discount_rate=0.8)

array([-22., -40., -50.])

In [13]:
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [14]:
a = np.array([[2], [2], [2]])
b = np.array([[2], [6], [4]])
np.mean([a, b], axis=0)

array([[2.],
       [4.],
       [3.]])

Let's train the model:

In [15]:
env = gym.make("CartPole-v0")

n_games_per_update = 10
n_max_steps = 1000
n_iterations = 500
save_iterations = 10
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print("\rIteration: {}".format(iteration), end="")
        all_rewards = []
        all_gradients = []
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)
        all_rewards = discount_and_normalize_rewards(all_rewards=all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index] 
                                      for game_index, rewards in enumerate(all_rewards) 
                                          for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "../models/RL_model/my_policy_net_pg.ckpt")

Iteration: 499

In [11]:
n_max_steps = 1000

with tf.Session() as sess:
    saver.restore(sess, "../models/RL_model/my_policy_net_pg.ckpt")
    obs = env.reset()
    for step in range(n_max_steps):
        env.render()
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        if done:
            break
env.close()

INFO:tensorflow:Restoring parameters from ../models/RL_model/my_policy_net_pg.ckpt


## Markov Decision Process

**Bellman optimality equation:**

$V^*(s)=\text{max}_a\sum_{s'}T(s, a, s')[R(s, a, s') + \gamma\cdot V^*(s')]$ for all $s$

- T(s, a, s') - probability of transition from state $s$ to state $s'$ if agent selected action $a$
- R(s, a, s') - reward that agent will get when it transit from $s$ to state $s'$ by choosing action $a$
- $\gamma$ - discount rate

__Value iteration__ algorithm:

$V_{k+1}(s)\leftarrow\text{max}_a\sum_{s'}T(s, a, s')[R(s, a, s') + \gamma\cdot V_k(s')]$ for all $s$

- $V_k(s)$ - state value of $s$ on iteration $k$

__Q-value iteration__ algorithm:

$Q_{k+1}(s, a)\leftarrow\sum\limits_{s'}T(s, a, s')[R(s, a, s') + \gamma\cdot\max\limits_{a'}Q_k(s', a')]$ for all $(s, a)$

And the optimal policy:

$\pi^*(s)= \underset{a}{\text{argmax}}\phantom{.} Q^*(s, a)$

In [1]:
import numpy as np

In [123]:
nan = np.nan  # for impossible actions
T = np.array([ # shape = [s, a, s']
    [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]], 
    [[0.0, 1.0, 0.0], [nan, nan, nan], [0.0, 0.0, 1.0]], 
    [[nan, nan, nan], [0.8, 0.1, 0.1], [nan, nan, nan]]
])
R = np.array([ # shape = [s, a, s']
    [[10., 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], 
    [[0.0, 0.0, 0.0], [nan, nan, nan], [0.0, 0.0, -50.]], 
    [[nan, nan, nan], [40., 0.0, 0.0], [nan, nan, nan]]
])

possible_actions = [[0, 1, 2], [0, 2], [1]]

Q = np.full((3, 3), -np.inf)  # -inf for impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0  # initialization

In [124]:
discount_rate = 0.95
n_iterations = 100

for iteration in range(n_iterations):
    Q_prev = Q.copy()
    for s in range(3):
        for a in possible_actions[s]:
            Q[s, a] = np.sum([T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp])) for sp in range(3)])
Q

array([[21.88646117, 20.79149867, 16.854807  ],
       [ 1.10804034,        -inf,  1.16703135],
       [       -inf, 53.8607061 ,        -inf]])

Best policy:

In [125]:
np.argmax(Q, axis=1)

array([0, 2, 1])

- $s_0$ - $a_0$
- $s_1$ - $a_2$
- $s_2$ - $a_1$

## Q-Learning

__Q-Learning__ algorithm:

$Q_{k+1}(s, a)\leftarrow(1-\alpha)Q_k(s, a)+\alpha(r+\gamma\cdot\max\limits_{a'}Q_k(s', a'))$

- $\alpha$ - learning rate
- $r$ - moving average of reward

In [143]:
learning_rate0 = 0.05
learning_rate_decay = 0.1
n_iterations = 20000

s = 0  # initial state = 0

Q = np.full((3, 3), -np.inf)
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0
    
for iteration in range(n_iterations):
    a = np.random.choice(possible_actions[s])  # randomly select an action
    sp = np.random.choice(range(3), p=T[s, a]) # randomly select the next state by using T[s, a] to stick to the probab.
    reward = R[s, a, sp]
    learning_rate = learning_rate0 / (1 + iteration * learning_rate_decay)  # decrese learning rate (because this algorithm is stochastic)
    Q[s, a] = (1 - learning_rate) * Q[s, a] + learning_rate * (reward + discount_rate * np.max(Q[sp]))
    s = sp
Q

array([[  5.2562294 ,   1.65426917,   1.29797164],
       [  0.        ,         -inf, -16.59509816],
       [        -inf,  15.28672668,         -inf]])

In [144]:
np.argmax(Q, axis=1)

array([0, 0, 1])