In [2]:
import numpy as np

In [4]:
# Markov Decision Process: a small grid world with one wall and two terminal states.

# Initial Conditions
h, w = 3, 4    # grid height (rows) and width (columns)
gamma = 1      # discount factor
r = -0.04      # reward assigned to every state that is not overridden below

# Environment, actions and rewards
walls = [(1,1)]
states = [(i,j) for i in range(h) for j in range(w) if (i,j) not in walls]
rewards = {s:r for s in states}

# The two terminal states carry their own rewards.
rewards[(0,3)] = 1
rewards[(1,3)] = -1

# States listed here get no entry in `actions`.
end_actions = [(0,3),(1,3)]

# For every other state, keep only the moves that stay on the grid and do not
# step into a wall. The order U, R, D, L is the order in which moves are listed.
actions = {}
for s in states:
  if s not in end_actions:
    i,j = s
    possibles = []
    if i>0 and (i-1,j) not in walls: possibles.append('U')
    if j<w-1 and (i,j+1) not in walls: possibles.append('R')
    if i<h-1 and (i+1,j) not in walls: possibles.append('D')
    if j>0 and (i,j-1) not in walls: possibles.append('L')
    actions[(i,j)]=possibles

print("States:", states)
print('\nRewards:')
# Use `reward` (not `r`) as the loop variable so the step-reward constant `r`
# defined above is not overwritten by the last item printed.
for s, reward in rewards.items():
  print(f'{s}: {reward}')
print()
print('Actions:')
for s, a in actions.items():
  print(f'{s}: {a}')

States: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3)]

Rewards:
(0, 0): -0.04
(0, 1): -0.04
(0, 2): -0.04
(0, 3): 1
(1, 0): -0.04
(1, 2): -0.04
(1, 3): -1
(2, 0): -0.04
(2, 1): -0.04
(2, 2): -0.04
(2, 3): -0.04

Actions:
(0, 0): ['R', 'D']
(0, 1): ['R', 'L']
(0, 2): ['R', 'D', 'L']
(1, 0): ['U', 'D']
(1, 2): ['U', 'R', 'D']
(2, 0): ['U', 'R']
(2, 1): ['R', 'L']
(2, 2): ['U', 'R', 'L']
(2, 3): ['U', 'L']


In [5]:
def next_state(s, action):
  """Return the (row, col) cell reached by applying `action` to cell `s`.

  'U' decreases the row, 'D' increases it, 'L' decreases the column and
  'R' increases it. Any other action leaves the coordinates unchanged.
  No bounds or wall checking is done here.
  """
  row, col = s
  # Row/column offset for each recognised move; unknown moves map to (0, 0).
  offsets = {'U': (-1, 0), 'R': (0, 1), 'D': (1, 0), 'L': (0, -1)}
  d_row, d_col = offsets.get(action, (0, 0))
  return row + d_row, col + d_col

In [6]:
def next_stochastic_state(s, action, actions):
  """Return the two cells reached by slipping perpendicular to `action`.

  For a vertical move ('U' or 'D') the slips are left and right; for any
  other action they are up and down. A slip is only applied when that
  direction is listed in `actions[s]`; otherwise that outcome is `s` itself.

  Returns a pair of (row, col) tuples: (left-or-up slip, right-or-down slip).
  """
  row, col = s
  allowed = actions[s]
  if action in ('U', 'D'):
    first = (row, col - 1) if 'L' in allowed else (row, col)
    second = (row, col + 1) if 'R' in allowed else (row, col)
  else:
    first = (row - 1, col) if 'U' in allowed else (row, col)
    second = (row + 1, col) if 'D' in allowed else (row, col)
  return first, second

In [21]:
# Value Iteration Algorithm
e = 1e-3
min_change = np.inf  # not read in this cell; kept in case another cell uses it
episodes = 0         # counts full sweeps over the state set

# Initial Conditions
V = np.array([[0. for j in range(w)] for i in range(h)])  # value per grid cell
A = np.array([['' for j in range(w)] for i in range(h)])  # best action per grid cell
p = 0.8  # weight of the intended move; each of the two slips gets (1-p)/2

# Stopping threshold e*(1-gamma)/gamma, computed once because it is loop-invariant.
# With gamma == 1 it evaluates to 0, so the loop stops only when the largest change,
# rounded to 3 decimals below, is exactly 0.
# gamma == 0 would raise ZeroDivisionError, so fall back to e in that case.
threshold = e*(1-gamma)/gamma if gamma != 0 else e

while True:
  episodes += 1
  old_V = V.copy()
  d = 0  # largest (rounded) value change seen in this sweep
  for s in rewards:
    if s not in end_actions:
      list_of_values, list_of_actions = [], []
      for action in actions[s]:
        # Intended move, weighted by p. V is updated in place, so states already
        # visited in this sweep contribute their new values.
        i,j = next_state(s, action)
        value = p * (rewards[s] + gamma * V[i,j])

        # The two perpendicular slips share the remaining weight equally.
        (i1, j1), (i2, j2) = next_stochastic_state(s, action, actions)
        value += (1-p)/2 * (rewards[s] + gamma * V[i1,j1])
        value += (1-p)/2 * (rewards[s] + gamma * V[i2,j2])

        list_of_values.append(value)
        list_of_actions.append(action)
      V[s] = np.max(list_of_values)
      A[s] = list_of_actions[np.argmax(list_of_values)]
    else:
      # States in end_actions keep their reward as value and have no action.
      V[s] = rewards[s]
      A[s] = ''
    change = np.abs(V[s]-old_V[s])
    if change > d: d = np.around(change,3)
  if d<=threshold: break

print('\n', np.around(V, 3))
print('\n', A)
print('episodes: ', episodes)


 [[ 0.812  0.868  0.918  1.   ]
 [ 0.762  0.     0.66  -1.   ]
 [ 0.705  0.655  0.611  0.388]]

 [['R' 'R' 'R' '']
 ['U' '' 'U' '']
 ['U' 'L' 'L' 'L']]
episodes:  14
