In [58]:
import numpy as np
from enum import Enum

In [59]:
class Action(Enum):
    """The four grid moves, each valued as a ``(row_delta, col_delta)`` offset.

    ``reward`` adds the offset component-wise to a ``(row, col)`` state, so
    NORTH decreases the row index and EAST increases the column index.
    """
    # Declaration order is the order `for a in Action` iterates in. update_v
    # keeps the first action that reaches the maximum (strict `>`), so
    # reordering these members changes which action wins a tie.
    NORTH = (-1, 0)  # row - 1
    SOUTH = (1, 0)  # row + 1
    EAST = (0, 1)  # col + 1
    WEST = (0, -1)  # col - 1


def reward(s, a):
    """Return ``(reward, next_state)`` for taking offset ``a`` in state ``s``.

    Despite its name, this function gives the full one-step dynamics, not
    only the reward.

    Args:
        s: Current ``(row, col)`` position.
        a: ``(row_delta, col_delta)`` offset, e.g. ``Action.NORTH.value``.

    Returns:
        A ``(reward, next_state)`` pair:
        * from ``(0, 1)`` any move yields ``10`` and lands on ``(4, 1)``;
        * from ``(0, 3)`` any move yields ``5`` and lands on ``(2, 3)``;
        * a move that would leave rows/columns ``0..4`` yields ``-1`` and
          the state is returned unchanged;
        * every other move yields ``0`` and the shifted position.
    """
    # The two special cells ignore the chosen action entirely.
    if s == (0, 1):
        return 10, (4, 1)
    if s == (0, 3):
        return 5, (2, 3)

    # Check the row first, and only then touch the column, mirroring the
    # left-to-right short-circuit order of a single chained `or` test.
    target_row = s[0] + a[0]
    if target_row < 0 or target_row > 4:
        return -1, s

    target_col = s[1] + a[1]
    if target_col < 0 or target_col > 4:
        return -1, s

    return 0, (target_row, target_col)

In [60]:
# Discount factor applied to the successor state's value in update_v.
gamma = 0.9

# NOTE(review): `s` is not read by any visible cell (update_v takes its own
# `s` parameter) -- presumably a leftover start state; confirm before removing.
s = (2, 2)

# State-value table for the 5x5 grid, initialised to zero.
v = np.zeros(shape=(5, 5))

# Greedy action per cell; None until update_v has visited the cell.
# Each row is a distinct list, so writing one row never affects another.
policy = [[None] * 5 for _ in range(5)]

def update_v(s, policy):
    """Back up one state: store the best one-step value and its action.

    For every ``Action`` the candidate value is
    ``reward + gamma * v[next_state]``; the largest candidate is written to
    the module-level table ``v`` at ``s`` and the corresponding action is
    written to ``policy`` at ``s``. Both are modified in place. ``v`` and
    ``gamma`` are read from module scope rather than passed in.

    Args:
        s: ``(row, col)`` of the state to update.
        policy: Nested list of per-cell actions, mutated in place.

    Returns:
        The tuple ``(v, policy)`` -- the same objects that were mutated.
    """
    row, col = s[0], s[1]
    greedy_action, greedy_value = None, float('-inf')

    for action in Action:
        immediate, successor = reward(s, action.value)
        backed_up = immediate + gamma * v[successor[0], successor[1]]
        # Strict comparison: on a tie the earliest action in Action's
        # declaration order is kept.
        if backed_up > greedy_value:
            greedy_action, greedy_value = action, backed_up

    v[row][col] = greedy_value
    policy[row][col] = greedy_action
    return v, policy

In [61]:
# Run a fixed 100 sweeps over the grid. update_v writes into `v` in place,
# so cells visited later in a sweep already see values updated earlier in
# that same sweep.
for _ in range(100):
    # np.ndindex walks the 5x5 index space in row-major order:
    # (0, 0), (0, 1), ..., (4, 4).
    for i, j in np.ndindex(5, 5):
        v, policy = update_v((i, j), policy)

![](./p2_sol.png)

In [64]:
# Show the value table rounded to one decimal place.
print(v.round(decimals=1))

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]


In [65]:
# Convert each cell's action to its name for display; cells that hold no
# action are shown as the string 'None'.
policy_directions = []
for row in policy:
    policy_directions.append(
        [action.name if action else 'None' for action in row]
    )

# One grid row per output line.
for row in policy_directions:
    print(row)

['EAST', 'NORTH', 'WEST', 'NORTH', 'WEST']
['NORTH', 'NORTH', 'NORTH', 'WEST', 'WEST']
['NORTH', 'NORTH', 'NORTH', 'NORTH', 'NORTH']
['NORTH', 'NORTH', 'NORTH', 'NORTH', 'NORTH']
['NORTH', 'NORTH', 'NORTH', 'NORTH', 'NORTH']
