In [1]:
import numpy as np

In [2]:
# Number of states and number of actions.
nS, nA = 16, 4

# Per-action offsets, indexed by action number: dx[k] is added to the row
# index and dy[k] to the column index, so the four actions move
# up, right, down and left respectively.
dx, dy = [-1, 0, 1, 0], [0, 1, 0, -1]

def is_inside(i, j):
    """Return True when both indices i and j lie in the range 0..3."""
    return 0 <= i < 4 and 0 <= j < 4

In [3]:
_GRID_SIDE = 4  # side length of the square grid; must agree with is_inside()
_STEP_REWARD = -1  # reward collected on every transition


def _action_values(s, v, discount_factor):
    """Return the one-step backed-up value of every action taken from state s.

    Entry k is ``_STEP_REWARD + discount_factor * v[ns]`` where ``ns`` is the
    state reached by applying offset (dx[k], dy[k]) to s. States are numbered
    row-major, so s maps to (s // _GRID_SIDE, s % _GRID_SIDE).
    """
    row, col = divmod(s, _GRID_SIDE)
    values = np.empty(nA)
    for k in range(nA):
        next_row, next_col = row + dx[k], col + dy[k]
        # A move that would leave the grid keeps the agent in place.
        if is_inside(next_row, next_col):
            ns = next_row * _GRID_SIDE + next_col
        else:
            ns = s
        values[k] = _STEP_REWARD + discount_factor * v[ns]
    return values


def value_iter(theta=0.01, discount_factor=1.0):
    """Run in-place value iteration on the grid and derive a greedy policy.

    States 0 and nS - 1 are never updated (they are treated as terminal):
    their value stays 0 and their policy row stays all zeros.

    Args:
        theta: Convergence threshold. Sweeps stop once the largest change of
            any state value within a sweep is strictly below theta. Must be
            positive, otherwise the stopping test could never succeed.
        discount_factor: Multiplier applied to the successor state's value.

    Returns:
        A tuple ``(policy, v)``. ``policy`` has shape (nS, nA) and holds a
        one-hot row per updated state marking its greedy action (ties go to
        the lowest action index). ``v`` has shape (nS,) and holds the
        estimated state values.

    Raises:
        ValueError: If theta is not strictly positive (including NaN).
    """
    # `delta < theta` can never hold for theta <= 0 or NaN, which would make
    # the sweep loop below spin forever.
    if not theta > 0:
        raise ValueError("theta must be a positive number, got %r" % (theta,))

    updated_states = range(1, nS - 1)
    v = np.zeros(nS)
    while True:
        delta = 0
        for s in updated_states:
            # In-place update: later states in the same sweep already see
            # the refreshed values of earlier ones.
            best = _action_values(s, v, discount_factor).max()
            delta = max(delta, abs(best - v[s]))
            v[s] = best
        if delta < theta:
            break

    # Generate the greedy policy for the obtained approximate value function.
    policy = np.zeros([nS, nA])
    for s in updated_states:
        # np.argmax returns the first maximum, i.e. the lowest action index
        # among equally good actions.
        policy[s, np.argmax(_action_values(s, v, discount_factor))] = 1
    return policy, v

In [5]:
# Run value iteration with its default arguments (theta=0.01,
# discount_factor=1.0) and print the (nS, nA) one-hot policy matrix followed
# by the length-nS vector of state values.
policy, v = value_iter()
print(policy, v)

[[0. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]] [ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1.  0.]


In [6]:
# Check the computed state values against the known answer, written here in
# the 4x4 grid layout (row-major) and flattened to match v.
expected_v = np.array([
    [ 0, -1, -2, -3],
    [-1, -2, -3, -2],
    [-2, -3, -2, -1],
    [-3, -2, -1,  0],
]).ravel()
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)