# Policy Evaluation 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- environment ---
gridSize = 5  # the value map is a gridSize x gridSize array
# each action is a [row delta, column delta] move of one cell
actions = [
    [-1, 0],  # row - 1
    [1, 0],   # row + 1
    [0, 1],   # column + 1
    [0, -1],  # column - 1
]

# --- policy evaluation ---
max_iters = 100  # number of evaluation sweeps over all states
# NOTE(review): a discount factor of 1 means future values are not discounted;
# the transition function in this notebook has no terminal state, so confirm
# that non-discounted evaluation is what is intended here.
discount_factor = 1

In [3]:
# value estimate for every grid cell, all starting at zero
valueMap = np.zeros(shape=(gridSize, gridSize))
# every grid coordinate as a [row, column] list, enumerated row by row
states = [list(cell) for cell in np.ndindex(gridSize, gridSize)]

In [4]:
# display the freshly initialised value map (all zeros at this point)
valueMap

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [5]:
# display the list of [row, column] states that the evaluation loop iterates over
states

[[0, 0],
 [0, 1],
 [0, 2],
 [0, 3],
 [0, 4],
 [1, 0],
 [1, 1],
 [1, 2],
 [1, 3],
 [1, 4],
 [2, 0],
 [2, 1],
 [2, 2],
 [2, 3],
 [2, 4],
 [3, 0],
 [3, 1],
 [3, 2],
 [3, 3],
 [3, 4],
 [4, 0],
 [4, 1],
 [4, 2],
 [4, 3],
 [4, 4]]

In [6]:
def transition(current_pos, action, grid_size=None):
    """Apply ``action`` to ``current_pos`` and return ``(new_pos, reward)``.

    Rules, in order of precedence:

    * From state A ``[0, 1]`` every action leads to A' ``[4, 1]`` with reward 10.
    * From state B ``[0, 3]`` every action leads to B' ``[2, 3]`` with reward 5.
    * A move that would leave the grid keeps the agent in place, reward -1.
    * Any other move lands on the neighbouring cell, reward 0.

    Parameters
    ----------
    current_pos : sequence of two ints
        Current ``[row, column]``. Lists, tuples and numpy arrays are all
        accepted (the previous ``current_pos == [0, 1]`` test only worked
        for lists).
    action : sequence of two ints
        ``[row delta, column delta]`` to add to ``current_pos``.
    grid_size : int, optional
        Side length of the square grid. Defaults to the notebook-level
        ``gridSize`` when omitted.

    Returns
    -------
    new_pos : list of int
        Always a fresh ``[row, column]`` list, never an alias of the
        ``current_pos`` argument.
    reward : int
    """
    if grid_size is None:
        grid_size = gridSize

    # normalise to plain ints so comparisons behave the same for any input type
    row, col = (int(v) for v in current_pos)

    # special states: the action taken is irrelevant
    if (row, col) == (0, 1):  # state A -> A'
        return [4, 1], 10
    if (row, col) == (0, 3):  # state B -> B'
        return [2, 3], 5

    new_row = row + int(action[0])
    new_col = col + int(action[1])

    # crossing the border: the agent stays where it is and is penalised
    if not (0 <= new_row < grid_size and 0 <= new_col < grid_size):
        return [row, col], -1

    return [new_row, new_col], 0

In [7]:
# policy evaluation
# Start from an all-zero value map so that re-running this cell reproduces the
# same sweeps instead of continuing from the result of a previous run.
valueMap = np.zeros((gridSize, gridSize))
# every action is weighted equally
action_prob = 1 / len(actions)
# 1-based sweep numbers after which the value map is printed
report_iters = {1, 2, 3, 10, 100, 1000, max_iters}

for it in range(max_iters):
    # write updates into a copy so that every state in this sweep is computed
    # from the values of the previous sweep
    valueMap_copy = np.copy(valueMap)
    for state in states:
        weightedRewards = 0
        # try each action from this state and accumulate its weighted return
        for action in actions:
            # get next position and reward
            new_position, reward = transition(state, action)
            next_row, next_col = new_position
            # weighted return: 1/4[r + gamma * value(s')]
            weightedRewards += action_prob * (reward + discount_factor * valueMap[next_row, next_col])
        # store the new estimate for this state
        valueMap_copy[state[0], state[1]] = weightedRewards
    # the updated copy becomes the value map for the next sweep
    valueMap = valueMap_copy

    if it + 1 in report_iters:
        print("Iteration {}".format(it+1))
        print(valueMap)
        print("")

Iteration 1
[[-0.5  10.   -0.25  5.   -0.5 ]
 [-0.25  0.    0.    0.   -0.25]
 [-0.25  0.    0.    0.   -0.25]
 [-0.25  0.    0.    0.   -0.25]
 [-0.5  -0.25 -0.25 -0.25 -0.5 ]]

Iteration 2
[[ 1.6875  9.75    3.4375  5.      0.4375]
 [-0.5     2.4375 -0.0625  1.1875 -0.5   ]
 [-0.4375 -0.0625  0.     -0.0625 -0.4375]
 [-0.5    -0.125  -0.0625 -0.125  -0.5   ]
 [-0.875  -0.5    -0.4375 -0.5    -0.875 ]]

Iteration 3
[[ 2.65625   9.5       4.28125   4.9375    0.84375 ]
 [ 0.546875  2.28125   1.765625  1.09375  -0.078125]
 [-0.625     0.46875  -0.0625    0.15625  -0.625   ]
 [-0.734375 -0.28125  -0.171875 -0.28125  -0.734375]
 [-1.1875   -0.734375 -0.625    -0.734375 -1.1875  ]]

Iteration 10
[[ 4.35138035  8.35102081  5.30830669  5.55983734  2.0662508 ]
 [ 2.41210365  3.73263168  3.08720779  2.48477173  0.95953941]
 [ 0.41044044  1.10420322  1.08833504  0.55556202 -0.35133934]
 [-1.15541267 -0.51126671 -0.45696068 -0.74501228 -1.49310112]
 [-2.32684231 -1.74187183 -1.56627369 -1.8443346