In [42]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [26]:
# Grid dimensions: h rows by w columns.
h = 3
w = 4
# Default per-state reward and the multiplier applied to the best expected
# successor value in the value update.
r = -0.04
gamma = 1.0

# Cells that are excluded from the state space, and the terminal cells.
walls = ((1, 1),)
end_states = ((0, 3), (1, 3))

# Every (row, col) cell that is not a wall, listed in row-major order.
states = [
    (row, col)
    for row in range(h)
    for col in range(w)
    if (row, col) not in walls
]
print("States:", states)

States: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3)]


In [27]:
# Action label paired with its (row, column) displacement. The order U, D, L, R
# fixes the order in which labels are appended for each state.
_MOVE_OFFSETS = (("U", -1, 0), ("D", 1, 0), ("L", 0, -1), ("R", 0, 1))

# Map each non-terminal state to the labels of the moves whose target cell is
# itself a member of `states`.
actions = defaultdict(list)
for state in states:
    if state not in end_states:
        row, col = state
        for label, d_row, d_col in _MOVE_OFFSETS:
            if (row + d_row, col + d_col) in states:
                actions[state].append(label)

print("Actions:")
for state, available in actions.items():
    print(state, available)

Actions:
(0, 0) ['D', 'R']
(0, 1) ['L', 'R']
(0, 2) ['D', 'L', 'R']
(1, 0) ['U', 'D']
(1, 2) ['U', 'D', 'R']
(2, 0) ['U', 'R']
(2, 1) ['L', 'R']
(2, 2) ['U', 'L', 'R']
(2, 3) ['U', 'L']


In [29]:
# Every state starts with the default reward r; the two terminal states are
# then overridden with +1 and -1 respectively.
rewards = {s: r for s in states}
rewards[end_states[0]] = 1
rewards[end_states[1]] = -1

print("Rewards:")
# The loop targets are deliberately not named `r`: a for-loop target stays
# bound after the loop ends, so reusing `r` would overwrite the default reward
# with whichever value was printed last and corrupt a re-run of this cell.
for state, reward in rewards.items():
    print(state, reward)

Rewards:
(0, 0) -0.04
(0, 1) -0.04
(0, 2) -0.04
(0, 3) 1
(1, 0) -0.04
(1, 2) -0.04
(1, 3) -1
(2, 0) -0.04
(2, 1) -0.04
(2, 2) -0.04
(2, 3) -0.04


In [50]:
def get_next_state(s, a):
    """Return the cell reached from `s` by taking action `a`.

    Parameters
    ----------
    s : pair (row, col)
        Current cell.
    a : str
        One of "U", "D", "L", "R". Any other value leaves the position
        unchanged.

    Returns
    -------
    tuple
        The neighbouring cell in the requested direction when that cell is a
        member of the module-level `states`; otherwise the current cell.
    """
    row, col = s
    # Work out which cell the action points at.
    if a == "U":
        target = (row - 1, col)
    elif a == "D":
        target = (row + 1, col)
    elif a == "L":
        target = (row, col - 1)
    elif a == "R":
        target = (row, col + 1)
    else:
        return (row, col)
    # Only move when the target is a known state; otherwise stay put.
    if target in states:
        return target
    return (row, col)

In [51]:
def get_probabilistic_action(s, a):
    """Return the two cells reached by moving perpendicular to action `a`.

    For a vertical action ("U" or "D") the result is the pair of cells reached
    by "L" and "R", in that order. For any other action it is the pair reached
    by "U" and "D". Each cell is resolved through `get_next_state`.

    Parameters
    ----------
    s : pair (row, col)
        Current cell.
    a : str
        The intended action.

    Returns
    -------
    tuple
        `(sp1, sp2)`, the two perpendicular outcome cells.
    """
    if a in ("U", "D"):
        first, second = "L", "R"
    else:
        first, second = "U", "D"
    return get_next_state(s, first), get_next_state(s, second)

In [53]:
# Value iteration over the grid.
# V holds the current value estimate for every cell (wall cells stay at 0).
V = np.zeros((h, w))
# A holds the greedy action label per cell: '' for cells never visited
# (walls) and 'X' for terminal states. All labels are single characters.
A = np.full((h, w), '', dtype='<U1')

# Convergence tolerance on the largest per-cell change in one sweep.
e = 1e-3

# Transition weights used in the update: the intended move gets p_intended,
# each of the two perpendicular moves gets p_side.
p_intended = 0.8
p_side = 0.1

for i in range(1, 101):
    # Snapshot used only for the convergence check; the sweep itself updates V
    # in place, so later states in a sweep already see earlier updates.
    V_old = V.copy()
    for s in states:
        if s in end_states:
            # Terminal states keep their fixed reward as their value.
            V[s] = rewards[s]
            A[s] = 'X'
            continue
        list_values = []
        for a in actions[s]:
            sp = get_next_state(s, a)
            sp1, sp2 = get_probabilistic_action(s, a)
            list_values.append(p_intended*V[sp] + p_side*V[sp1] + p_side*V[sp2])
        best = int(np.argmax(list_values))
        V[s] = rewards[s] + gamma * list_values[best]
        A[s] = actions[s][best]
    # Stop when the largest absolute change is below the tolerance. Summing the
    # signed differences instead would let positive and negative changes
    # cancel and end the loop before the values have settled.
    if np.max(np.abs(V - V_old)) < e:
        break

print("Iteration:", i)
Vdf = pd.DataFrame(np.around(V,3))
print(Vdf)
Adf = pd.DataFrame(A)
Adf

Iteration: 14
       0      1      2      3
0  0.812  0.868  0.918  1.000
1  0.762  0.000  0.660 -1.000
2  0.705  0.655  0.611  0.388


Unnamed: 0,0,1,2,3
0,R,R,R,X
1,U,,U,X
2,U,L,L,L
