# Gridworld with Policy Evaluation

In [1]:
import numpy as np
import random

# display output
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
actions = [[-1, 0], [0, 1], [1, 0], [0, -1]] #up, right, down, left = (clockwise from up) 
action_count = len(actions) # total number of actions
gridSize = 5 # create a square grid of gridSize by gridSize
state_count = gridSize*gridSize # total number of states

In [3]:
# initialize a policy: create an array of dimension (number of states by number of actions)
# for equal probability amongst all actions, divide everything by the number of actions
policy = np.ones([state_count, action_count]) / action_count

# policy at state 0 = [0, 0]
# returns a probability for each action given state
policy[0]

array([0.25, 0.25, 0.25, 0.25])

In [4]:
class Gridworld():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.pos_check = [0, 0] # a copy of new position
        self.transition_prob = 1 # deterministic
    
    def initial_state(self):
        # randomly generate an initial state
        i = random.randint(0, len(self.states)-1)
        rand_state = self.states[i]
        return rand_state
    
    def possible_states(self):
        # return the possible states
        return self.states
    
    def reward(self, current_pos, action):
        # return the reward        
        
        # take action in current pos
        self.new_pos = np.array(current_pos) + np.array(action)

        # normally, reward = 0
        reward = 0

        # if new pos results in off the grid, return reward -1
        if -1 in self.new_pos or self.size in self.new_pos:
            reward = -1
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            reward = 10
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            reward = 5
        return reward
    
    # def transition_probability(self, current_pos, new_pos):
        # a function that returns the entries of the transition probability matrix?
        # eg. input current state, new state, output = 0.25...0.5...1 ... etc. ?
    
    def p_transition(self, current_pos, action):
        # return the transition probability
        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_pos = np.array(current_pos) + np.array(action)
        self.pos_check = self.new_pos # make a copy of new pos before being overwritten below

        # if taking an action crosses the border = agent stays in same position
        if -1 in self.new_pos or self.size in self.new_pos: 
            self.new_pos = current_pos
            
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            self.new_pos = [4, 1]
            
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            self.new_pos = [2, 3]
        return self.new_pos

In [5]:
# create a grid object
grid = Gridworld(5)
grid.valueMap

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [6]:
# # return a random initial state
# grid.initial_state()

In [7]:
# # return all possible states
# grid.possible_states()

## Policy Evaluation

In [8]:
# policy evaluation
    # iterate through all 25 states. At each state, iterate through all 4 actions
    # to calculate the value of each action.
    # Replace the value map with the calculated value.

theta = 0.000001
discount_factor = 0.99
iterations = 0

while True:
    
    delta = 0
    iterations+=1
    valueMap_copy = np.copy(grid.valueMap)
    
    # start with the first state in the state list
    for state_number, state in enumerate(grid.states):
        value = 0
        
        # perform 4 actions per state and add the rewards (value)
        for action_number, action in enumerate(actions):
            
            # get next position and reward
            new_position = grid.p_transition(state, action)
            reward = grid.reward(state, action)
            
            # calculate value: policy*transition_prob*[r + gamma * value(s')]
            value += policy[state_number][action_number]*grid.transition_prob*(reward+(discount_factor*grid.valueMap[new_position[0], new_position[1]]))          
            
        # replace the value in valueMap with the value
        valueMap_copy[state[0], state[1]] = value
        
        # calculate delta
        delta = max(delta, np.abs(value - grid.valueMap[state[0], state[1]]))       
        clear_output(wait=True)
        display('delta: ' + str(delta) + ' iterations: ' + str(iterations))
        
    # overwrite the original value map (after a full iteration of every state)
    grid.valueMap = valueMap_copy

    # stop when change in value function falls below a given threshold
    if delta < theta:
        break

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
# print value map to 4 decimal places
np.set_printoptions(precision=4)
grid.valueMap

array([[ 3.2175,  7.1243,  4.303 ,  4.7525,  1.5244],
       [ 1.4608,  2.7247,  2.2163,  1.7565,  0.3781],
       [-0.4905,  0.2073,  0.1704, -0.25  , -1.1211],
       [-2.1493, -1.5671, -1.4849, -1.8158, -2.5268],
       [-3.4671, -2.9047, -2.7873, -3.0748, -3.7354]])