In [68]:
import numpy as np
from collections import defaultdict

In [209]:
# Grid dimensions: h rows by w columns.
h, w = 3, 4
# Reward assigned to every non-terminal state.
r = -0.04
# Discount factor used in the value update.
gamma = 1.0

# Cells that are excluded from the state space (a one-element tuple of cells).
walls = ((1, 1),)
# Every grid cell in row-major order, minus the walls.
states = [
    (row, col)
    for row in range(h)
    for col in range(w)
    if (row, col) not in walls
]
# Terminal cells; no actions are generated for them.
end_states = ((0, 3), (1, 3))

print("States:")
for s in states:
    print(s)

States:
(0, 0)
(0, 1)
(0, 2)
(0, 3)
(1, 0)
(1, 2)
(1, 3)
(2, 0)
(2, 1)
(2, 2)
(2, 3)


In [210]:
# Start with the default reward r on every state, then overwrite the two
# terminal states: the first pays +1 and the second pays -1.
rewards = dict.fromkeys(states, r)
rewards.update({end_states[0]: 1, end_states[1]: -1})

print("Rewards:")
rewards

Rewards:


{(0, 0): -0.04,
 (0, 1): -0.04,
 (0, 2): -0.04,
 (0, 3): 1,
 (1, 0): -0.04,
 (1, 2): -0.04,
 (1, 3): -1,
 (2, 0): -0.04,
 (2, 1): -0.04,
 (2, 2): -0.04,
 (2, 3): -0.04}

In [211]:
# Map each non-terminal state to the list of moves that land on another state.
# Moves are tested in the fixed order U, D, L, R, so each list keeps that order.
actions = defaultdict(list)
for s in states:
    if s in end_states:
        # Terminal states get no actions (and no entry in the mapping).
        continue
    i, j = s
    # (label, target is inside the grid bounds, target cell)
    moves = (
        ('U', i > 0, (i - 1, j)),
        ('D', i < h - 1, (i + 1, j)),
        ('L', j > 0, (i, j - 1)),
        ('R', j < w - 1, (i, j + 1)),
    )
    for label, inside, cell in moves:
        if inside and cell in states:
            actions[s].append(label)

print("Actions:")
for s, a in actions.items():
    print(s, a)

Actions:
(0, 0) ['D', 'R']
(0, 1) ['L', 'R']
(0, 2) ['D', 'L', 'R']
(1, 0) ['U', 'D']
(1, 2) ['U', 'D', 'R']
(2, 0) ['U', 'R']
(2, 1) ['L', 'R']
(2, 2) ['U', 'L', 'R']
(2, 3) ['U', 'L']


In [212]:
def get_next_state(s, a):
    """Return the cell reached by moving one step from `s` in direction `a`.

    `s` is a (row, col) pair and `a` is one of 'U', 'D', 'L', 'R'.
    If the neighbouring cell is not in the module-level `states`, or `a`
    is not one of the four directions, the current cell is returned.
    """
    row, col = s
    if a == 'U':
        target = (row - 1, col)
    elif a == 'D':
        target = (row + 1, col)
    elif a == 'L':
        target = (row, col - 1)
    elif a == 'R':
        target = (row, col + 1)
    else:
        # Unrecognised action: no movement.
        return row, col
    # Only move when the neighbour is an actual state; otherwise stay put.
    return target if target in states else (row, col)

In [213]:
def get_probabilistic_next_states(s, a):
    """Return the two cells reached by moving perpendicular to action `a`.

    For a vertical action ('U'/'D') the result is the (left, right)
    neighbours of `s`; for a horizontal action ('L'/'R') it is the
    (up, down) neighbours. Each neighbour is resolved with
    `get_next_state`, so a blocked move yields `s` itself. Any other
    action yields `(s, s)`.
    """
    if a in ('U', 'D'):
        first_side, second_side = 'L', 'R'
    elif a in ('L', 'R'):
        first_side, second_side = 'U', 'D'
    else:
        # Unrecognised action: both outcomes are the current cell.
        return s, s
    return get_next_state(s, first_side), get_next_state(s, second_side)

In [214]:
# Convergence threshold on the summed absolute change of V over one sweep.
e = 1e-3
# Value estimate per grid cell; cells that are not states stay at 0.
V = np.zeros((h, w))
# Best action per grid cell as a one-character label; '' where none is set.
A = np.full((h, w), '', dtype='<U1')

for i in range(1, 100+1):
    V_old = V.copy()
    for s in states:
        if s in end_states:
            # Terminal states keep their reward as value and are marked 'X'.
            V[s] = rewards[s]
            A[s] = 'X'
            continue

        # Expected value of each available action: weight 0.8 on the intended
        # move and 0.1 on each of the two perpendicular moves.
        action_values = []
        for a in actions[s]:
            intended = get_next_state(s, a)
            side_a, side_b = get_probabilistic_next_states(s, a)
            action_values.append(0.8 * V[intended] + 0.1 * V[side_a] + 0.1 * V[side_b])

        # V is updated in place, so states later in this sweep read the new value.
        V[s] = rewards[s] + gamma * max(action_values)
        A[s] = actions[s][np.argmax(action_values)]

    # Stop once a full sweep barely changes the values.
    change = np.abs(V - V_old).sum()
    if change < e:
        break

print(f"After {i} iterations:")
print(np.around(V,3))
print()
print(A)

After 14 iterations:
[[ 0.812  0.868  0.918  1.   ]
 [ 0.762  0.     0.66  -1.   ]
 [ 0.705  0.655  0.611  0.388]]

[['R' 'R' 'R' 'X']
 ['U' '' 'U' 'X']
 ['U' 'L' 'L' 'L']]
