In [None]:
import numpy as np

In [2]:
def policy_eval(pi_s, gamma, n):
    """Evaluate a policy exactly by solving the Bellman expectation equations.

    The states are laid out row-major on a grid that is 5 cells wide (state
    ``s`` sits in column ``s % 5``; states 0-4 form the top row and states
    above 19 are treated as the bottom row), so the layout matches a 5x5 grid
    when ``n == 25``. Dynamics encoded below:

    * state 1: every action yields reward +10 and moves to state 21;
    * state 3: every action yields reward +5 and moves to state 13;
    * any other state: a move that would leave the grid yields reward -1 and
      leaves the state unchanged; every other move yields reward 0.

    The equations ``v(s) - gamma * sum_a pi(a|s) v(s') = sum_a pi(a|s) r`` are
    assembled as ``A v = B`` and solved directly.

    Parameters
    ----------
    pi_s : indexable of shape (n, 4)
        Action probabilities per state, ordered [left, right, up, down].
    gamma : float
        Discount factor.
    n : int
        Number of states (the hard-coded indices 13 and 21 require n >= 22;
        the notebook uses 25).

    Returns
    -------
    numpy.ndarray
        Flat float64 array of length ``n`` holding the state values.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the assembled system is singular.
    """
    # Double precision: single precision ('f') keeps only ~7 significant
    # digits, which is needlessly coarse for an exact linear solve.
    A = np.zeros([n, n], dtype=np.float64)
    B = np.zeros([n, 1], dtype=np.float64)

    for s in range(n):
        ## We add coefficients to the corresponding equation, referring to each of the four actions
        A[s][s] = 1.0
        if s == 1:
            ## special case 1: reward 10, jump to state 21 whatever the action
            B[s][0] += 10.0
            A[s][21] += -gamma

        elif s == 3:
            ## special case 2: reward 5, jump to state 13 whatever the action
            B[s][0] += 5.0
            A[s][13] += -gamma

        else:
            ## left
            if (s % 5) == 0:
                ## can't go left: reward -1, stay in s
                B[s][0] += -pi_s[s][0]
                A[s][s] += -gamma * pi_s[s][0]
            else:
                ## go left
                A[s][s - 1] += -gamma * pi_s[s][0]

            ## right
            if ((s + 1) % 5) == 0:
                ## can't go right: reward -1, stay in s
                B[s][0] += -pi_s[s][1]
                A[s][s] += -gamma * pi_s[s][1]
            else:
                ## go right
                A[s][s + 1] += -gamma * pi_s[s][1]

            ## up
            if s < 5:
                ## can't go up: reward -1, stay in s
                B[s][0] += -pi_s[s][2]
                A[s][s] += -gamma * pi_s[s][2]
            else:
                ## go up
                A[s][s - 5] += -gamma * pi_s[s][2]

            ## down
            if s > 19:
                ## can't go down: reward -1, stay in s
                B[s][0] += -pi_s[s][3]
                A[s][s] += -gamma * pi_s[s][3]
            else:
                ## go down
                A[s][s + 5] += -gamma * pi_s[s][3]

    # Solve A v = B directly instead of forming inv(A) and multiplying:
    # it is the numerically preferred (and cheaper) way to solve the system.
    v = np.linalg.solve(A, B)
    return v.flatten()

def neighbor_values(s, v_pi):
    """Return the values of the four grid neighbours of state ``s``.

    States are indexed row-major on a 5x5 grid. The result is ordered
    [left, right, up, down]; a direction that would leave the grid gets
    ``-inf`` so that it can never win an ``argmax`` over the result.

    Parameters
    ----------
    s : int
        State index (0..24).
    v_pi : indexable of float
        State values, indexed by state.

    Returns
    -------
    numpy.ndarray
        Array of shape (4,) and dtype float32.
    """
    # np.inf replaces the np.Infinity alias, which was removed in NumPy 2.0.
    # The float32 dtype of the original is kept, so the values compared by
    # the caller are rounded to single precision exactly as before.
    vals = np.full([4], -np.inf, dtype='f')

    ## left: available unless s is in the first column
    if (s % 5) > 0:
        vals[0] = v_pi[s - 1]
    ## right: available unless s is in the last column
    if ((s + 1) % 5) > 0:
        vals[1] = v_pi[s + 1]
    ## up: available unless s is in the top row
    if s > 4:
        vals[2] = v_pi[s - 5]
    ## down: available unless s is in the bottom row
    if s < 20:
        vals[3] = v_pi[s + 5]

    return vals

def get_policy(pi):
    """Render a policy table as a 5x5 grid of action letters.

    For every row of ``pi`` the column with the highest probability is
    picked (first one on ties) and mapped to a letter using the column
    order L(eft), R(ight), U(p), D(own).

    Parameters
    ----------
    pi : numpy.ndarray
        Policy table with one row per state and one column per action.

    Returns
    -------
    numpy.ndarray
        5x5 array of single-character strings.
    """
    # column index -> action letter
    letter_of = dict(enumerate("LRUD"))

    letters = np.array([letter_of[np.argmax(row)] for row in pi])
    return letters.reshape([5, 5])


In [3]:
## parameters
gamma = 0.9
states = 25

## initial policy: every action equally likely in every state
pi_s = np.full([states, 4], 0.25) ## [left, right, up, down]

policy_iterations = 0
epsilon = 1e-12
policy_converged = False

## Policy iteration: alternate exact evaluation and greedy improvement until
## the policy stops changing. (The former random initial values and the
## `old_v_pi` comparison were removed: the values were overwritten before
## being read and `old_v_pi` was never updated, so that check could not fire.)
while not policy_converged:
    policy_iterations += 1
    old_pi = np.copy(pi_s)

    ## policy evaluation: exact values of the current policy
    v_pi = policy_eval(pi_s, gamma, states)

    ## policy improvement: act greedily w.r.t. the neighbouring state values
    for s in range(states):
        greedy_action = np.argmax(neighbor_values(s, v_pi))
        pi_s[s] = np.zeros([4])
        pi_s[s][greedy_action] = 1.0

    ## convergence: stop once the improved policy equals the previous one
    if np.all(np.abs(old_pi - pi_s) <= epsilon):
        policy_converged = True

    ## print value and policy (uncomment to trace every iteration)
    # print("\n\npolicy iteration:", policy_iterations)
    # print("Values:")
    # print(np.round(v_pi.reshape([5, 5]), 1))
    # print("Greedy policy")
    # print(get_policy(pi_s))

print("\n\nFinal Iteration")
print("Values:")
print(np.round(v_pi.reshape([5, 5]), 1))
print("Greedy policy")
print(get_policy(pi_s))
print("\n\nTotal policy iterations taken for convergence:", policy_iterations)



Final Iteration
Values:
[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]
Greedy policy
[['R' 'L' 'L' 'L' 'L']
 ['R' 'U' 'L' 'L' 'L']
 ['R' 'U' 'L' 'L' 'L']
 ['R' 'U' 'L' 'L' 'L']
 ['R' 'U' 'L' 'L' 'L']]


Total policy iterations taken for convergence: 3
