In [1]:
import numpy as np

In [2]:
# State space: four discrete states, identified by the integers 0 through 3
states = list(range(4))
states

[0, 1, 2, 3]

In [3]:
# Action space: two discrete actions, identified by the integers 0 and 1
actions = list(range(2))
actions

[0, 1]

In [4]:
# Transition model, indexed as P[state, action, next_state].
# Filled one state at a time: each assignment supplies that state's rows,
# one row per action, each row a distribution over next states.
P = np.zeros((len(states), len(actions), len(states)))
P[0] = [[0.5, 0.5, 0, 0], [0, 1, 0, 0]]
P[1] = [[0, 0, 0.5, 0.5], [0, 0, 0, 1]]
P[2] = [[0, 0, 1, 0], [0, 0, 0.5, 0.5]]
P[3] = [[0, 0, 0, 1], [0, 0, 0, 1]]
P

array([[[0.5, 0.5, 0. , 0. ],
        [0. , 1. , 0. , 0. ]],

       [[0. , 0. , 0.5, 0.5],
        [0. , 0. , 0. , 1. ]],

       [[0. , 0. , 1. , 0. ],
        [0. , 0. , 0.5, 0.5]],

       [[0. , 0. , 0. , 1. ],
        [0. , 0. , 0. , 1. ]]])

In [5]:
# Reward table, indexed as R[state, action].
# Filled one state at a time: each row lists the reward for action 0, then action 1.
R = np.zeros((len(states), len(actions)))
R[0] = [5, 10]
R[1] = [-1, 2]
R[2] = [2, 4]
R[3] = [-10, 1]
R

array([[  5.,  10.],
       [ -1.,   2.],
       [  2.,   4.],
       [-10.,   1.]])

In [6]:
# Define the discount factor: the weight applied to next-state values
# in the value-iteration update.
gamma = 0.9

In [7]:
# Implement value iteration
# Synchronous Bellman optimality backups, repeated until the estimates settle:
#     V_new[s] = max_a  sum_s' P[s, a, s'] * (R[s, a] + gamma * V[s'])
# The max ranges over actions only. Letting the previous estimate compete in
# the max would stop a state's value from ever decreasing, which gives wrong
# answers whenever the optimal value lies below the starting guess
# (e.g. all-negative rewards with a zero initialisation).
values = np.zeros(len(states))
while True:
    # q_values[s, a]: expected return of taking action a in state s and then
    # receiving the current estimate `values` at the next state.
    # `values` broadcasts along the last (next-state) axis.
    q_values = np.sum(P * (R[:, :, np.newaxis] + gamma * values), axis=2)
    new_values = q_values.max(axis=1)
    # Stop once no state's estimate moved by 1e-4 or more in this sweep;
    # `values` then holds the iterate the final sweep started from.
    if np.max(np.abs(values - new_values)) < 1e-4:
        break
    values = new_values
values

array([21.2492526 , 12.4992526 , 19.99906054,  9.99944467])

In [8]:
# Find the optimal policy
# Greedy one-step lookahead on the converged values: for every state choose
# the action that maximises
#     Q[s, a] = sum_s' P[s, a, s'] * (R[s, a] + gamma * values[s'])
# `values` broadcasts along the last (next-state) axis of P.
q_values = np.sum(P * (R[:, :, np.newaxis] + gamma * values), axis=2)
# argmax returns the first maximiser, so ties go to the lowest action index.
policy = np.argmax(q_values, axis=1)
policy

array([1, 0, 0, 1])