In [19]:
import numpy as np
import matplotlib.pyplot as plt

In [20]:
def value_eval(s, v_pi, grid_size):
    """Apply one in-place value-iteration backup to state ``s``.

    States are numbered row-major on a ``grid_size`` x ``grid_size`` grid.
    Each move (left, right, up, down) that stays on the grid costs ``-1``
    and leads to the neighbouring cell; the state's new value is the best
    ``v_pi[neighbour] - 1`` over the available moves.

    Parameters
    ----------
    s : int
        Flat (row-major) index of the state to update.
    v_pi : indexable of float
        Current value estimates, one entry per state. Modified in place.
    grid_size : int
        Side length of the square grid.

    Returns
    -------
    The same ``v_pi`` object, with ``v_pi[s]`` overwritten.
    """
    ## derive the grid coordinates from grid_size instead of assuming a 4x4 grid
    row, col = divmod(s, grid_size)

    ## sentinel kept from the original: a state with no legal move gets -1e9
    maxi = -1e9

    ## move left
    if col > 0: maxi = max(maxi, v_pi[s - 1] - 1)
    ## move right
    if col < grid_size - 1: maxi = max(maxi, v_pi[s + 1] - 1)
    ## move up
    if row > 0: maxi = max(maxi, v_pi[s - grid_size] - 1)
    ## move down
    if row < grid_size - 1: maxi = max(maxi, v_pi[s + grid_size] - 1)

    v_pi[s] = maxi

    return v_pi

def neighbor_values(s, v_pi, grid_size=4):
    """Return the values of the four grid neighbours of state ``s``.

    States are numbered row-major on a ``grid_size`` x ``grid_size`` grid.

    Parameters
    ----------
    s : int
        Flat (row-major) index of the state.
    v_pi : indexable of float
        Value estimates, one entry per state.
    grid_size : int, optional
        Side length of the square grid. Defaults to 4, the layout the
        original implementation was hard-wired to.

    Returns
    -------
    numpy.ndarray
        float32 array ``[left, right, up, down]``; a direction that would
        leave the grid holds ``-inf`` so it can never win an ``argmax``.
    """
    row, col = divmod(s, grid_size)

    ## start with every move blocked; np.inf replaces the np.Infinity alias,
    ## which was removed in NumPy 2.0
    vals = np.full([4], -np.inf, dtype='f')

    ## left
    if col > 0: vals[0] = v_pi[s - 1]
    ## right
    if col < grid_size - 1: vals[1] = v_pi[s + 1]
    ## up
    if row > 0: vals[2] = v_pi[s - grid_size]
    ## down
    if row < grid_size - 1: vals[3] = v_pi[s + grid_size]

    return vals

def get_policy(pi):
    """Render a per-state action-preference table as a grid of action letters.

    Parameters
    ----------
    pi : numpy.ndarray, shape (n_states, 4)
        One row per state; columns are ordered [left, right, up, down].
        ``n_states`` must be a perfect square.

    Returns
    -------
    numpy.ndarray of str
        Square array holding "L", "R", "U" or "D" for each state, chosen as
        the ``argmax`` of the state's row. A row with no preferred action
        (e.g. all zeros) resolves to index 0 and is therefore shown as "L".

    Raises
    ------
    ValueError
        If the number of states is not a perfect square.
    """
    actions = {
        0: "L",
        1: "R",
        2: "U",
        3: "D"
    }

    ## infer the grid side from the number of states instead of assuming 4x4
    n_states = pi.shape[0]
    side = int(round(n_states ** 0.5))
    if side * side != n_states:
        raise ValueError(
            "cannot arrange %d states on a square grid" % n_states
        )

    policy = np.array(
        [actions[int(np.argmax(row))] for row in pi],
        dtype='<U1',
    )

    return policy.reshape([side, side])


In [21]:
## parameters
gamma = 1.0  ## undiscounted task; the backup in value_eval uses a fixed -1 step cost
grid_size = 4
states = int(grid_size ** 2)

## random initial values; the two corner states are terminal and pinned to 0.
## A seeded local generator keeps Restart & Run All reproducible without
## touching the global NumPy random state.
rng = np.random.RandomState(0)
v_pi = rng.normal(-10, 1, size=[states])
v_pi[0] = 0.0
v_pi[states - 1] = 0.0

value_iterations = 0
epsilon = 1e-12

## value iteration: sweep every non-terminal state in place until no value
## moves by more than epsilon between two consecutive sweeps
value_converged = False
while not value_converged:
    value_iterations += 1

    ## snapshot taken before the sweep, so no pre-loop initialisation is needed
    old_v_pi = np.copy(v_pi)
    for s in range(1, states - 1):
        v_pi = value_eval(s, v_pi, grid_size)

    value_converged = bool(np.all(abs(old_v_pi - v_pi) <= epsilon))

## greedy policy
## NOTE: neighbor_values(s, v_pi) is called without a grid size, so this step
## assumes the 4x4 layout even if grid_size above is changed.
pi_s = np.zeros([states, 4]) ## [left, right, up, down]
for s in range(1, states - 1):
    greedy_action = np.argmax(neighbor_values(s, v_pi))
    pi_s[s][greedy_action] = 1.0

print("\n\nFinal Iteration")
print("Values:")
print(v_pi.reshape([grid_size, grid_size]))
print("Greedy policy")
print(get_policy(pi_s))
print("\n\nTotal value iterations taken for convergence:", value_iterations)



Final Iteration
Values:
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Greedy policy
[['L' 'L' 'L' 'L']
 ['U' 'L' 'L' 'D']
 ['U' 'L' 'R' 'D']
 ['R' 'R' 'R' 'L']]


Total value iterations taken for convergence: 3
