# State Value Functions for NChain

In [1]:
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.table import Table

matplotlib.use('Agg')

In [2]:
# generate grid of environment with state values
def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, .8, .8])

    nrows, ncols = 1,image.shape[0]
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for j, val in np.ndenumerate(image):
        j = j[0]
        # add state labels
        if j == 0:
            val = str(val) + " (S)"
        elif j == (n-1):
            val = str(val) + " (L)"
        else:
            val = str(val) + " (N)"
        
        tb.add_cell(0, j, width, height, text=val,
                    loc='center', facecolor='white')
        

    # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(-1, i, width, height/2, text=i, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)

# model the environment in step function
def my_step(state, action, n, small, large):
    assert action in [0,1]
    assert state in range(n)
    
    slip_action = not action  # agent slipped, reverse action taken
    # action
    if action:  # 'backwards': go back to the beginning, get small reward
        reward = small
        next_state = 0
    elif state < n - 1:  # 'forwards': go up along the chain
        reward = 0
        next_state = state + 1
    else:  # 'forwards': stay at the end of the chain, collect large reward
        reward = large
        next_state = state
    # slip action
    if slip_action:  # 'backwards': go back to the beginning, get small reward
        slip_reward = small
        slip_next_state = 0
    elif state < n - 1:  # 'forwards': go up along the chain
        slip_reward = 0
        slip_next_state = state + 1
    else:  # 'forwards': stay at the end of the chain, collect large reward
        slip_reward = large
        slip_next_state = state

    return next_state, reward, slip_next_state, slip_reward

In [3]:
n = 5
nA = 2
slip = 0.2
small = 2
large = 10

DISCOUNT = 0.9
ACTION_PROB = 1 - slip

desc = np.append(np.array(["S"]), np.append(np.array(['N'] * (n - 2)),np.array(["L"])))

# State Value Function V(s)

In [4]:
value = np.zeros(n)
itera = 0
while True:
    # keep iteration until convergence
    itera += 1
    new_value = np.zeros_like(value)
    for i in range(n):
        for action in range(nA):
            next_state, reward, slip_next_state, slip_reward = my_step(i, action, n, small, large)
            new_value[i] += ACTION_PROB * (reward + DISCOUNT * value[next_state])
            new_value[i] += slip * (slip_reward + DISCOUNT * value[slip_next_state])            
            new_value[i] = new_value[i] / nA
    if np.sum(np.abs(value - new_value)) < 1e-4:
        draw_image(np.round(new_value, decimals=2))
        plt.savefig('../images/NC_figure_3_2.png')
        plt.close()
        break
    value = new_value
print(value)

[2.81823692 2.87728592 3.09598592 3.90598592 6.90598592]


# Optimal Value Function

In [5]:
value = np.zeros(n)
itera = 0
while True:
    # keep iteration until convergence
    itera += 1
    new_value = np.zeros_like(value)
    for i in range(n):
        values = []
        for action in range(nA):
            next_state, reward, slip_next_state, slip_reward = my_step(i, action, n, small, large)
            val = new_value[i] + ACTION_PROB * (reward + DISCOUNT * value[next_state])
            val += new_value[i] + slip * (slip_reward + DISCOUNT * value[slip_next_state])            
            val = val / nA
            values.append(val)
        new_value[i] = np.max(values)
    if np.sum(np.abs(value - new_value)) < 1e-4:
        draw_image(np.round(new_value, decimals=2))
        plt.savefig('../images/NC_figure_3_5.png')
        plt.close()
        break
    value = new_value
print(value)

[1.45610593 1.46577726 1.57323741 2.76724304 6.76724304]
