# Gridworld

In [1]:
import numpy as np
import random

In [2]:
discount_factor = 1
actions = [[-1, 0], [1, 0], [0, 1], [0, -1]] #row +/- 1 or column +/- 1
max_iters = 1000
gridSize = 5

In [3]:
class Gridworld():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.pos_check = [0, 0] # a copy of new position
    
    def initial_state(self):
        # randomly generate an initial state
        i = random.randint(0, len(self.states)-1)
        rand_state = self.states[i]
        return rand_state
    
    def possible_states(self):
        # return the possible states
        return self.states
    
    def reward(self, current_pos, action):
        # return the reward        
        # normally, reward = 0
        reward = 0
        # if taking an action crosses the border = agent stays in same position
        if -1 in self.pos_check or self.size in self.pos_check: 
            reward = -1
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            reward = 10
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            reward = 5
        return reward
    
    # def transition_probability(self, current_pos, new_pos):
        # a function that returns the entries of the transition probability matrix?
        # eg. input current state, new state, output = 0.25...0.5...1 ... etc. ?
    
    def p_transition(self, current_pos, action):
        # return the transition probability
        # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_pos = np.array(current_pos) + np.array(action)
        self.pos_check = self.new_pos # make a copy of new pos before being overwritten below
        # if taking an action crosses the border = agent stays in same position
        if -1 in self.new_pos or self.size in self.new_pos: 
            self.new_pos = current_pos
        # if in state A, transition to state A'
        if current_pos == [0, 1]:
            self.new_pos = [4, 1]
        # if in state B, transition to state B'
        if current_pos == [0, 3]:
            self.new_pos = [2, 3]
        return self.new_pos

In [4]:
# create a grid object
grid = Gridworld(5)
grid.valueMap

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [5]:
# return a random initial state
grid.initial_state()

[2, 0]

In [6]:
# return all possible states
grid.possible_states()

[[0, 0],
 [0, 1],
 [0, 2],
 [0, 3],
 [0, 4],
 [1, 0],
 [1, 1],
 [1, 2],
 [1, 3],
 [1, 4],
 [2, 0],
 [2, 1],
 [2, 2],
 [2, 3],
 [2, 4],
 [3, 0],
 [3, 1],
 [3, 2],
 [3, 3],
 [3, 4],
 [4, 0],
 [4, 1],
 [4, 2],
 [4, 3],
 [4, 4]]

## Policy Evaluation

In [7]:
# policy evaluation
    # iterate through all 25 states. At each state, iterate through all 4 actions
    # to calculate the value of each action.
    # Replace the value map with the calculated value.

for it in range(max_iters):
    valueMap_copy = np.copy(grid.valueMap)
    # start with the first state in the state list
    for state in grid.states:
        value = 0
        # perform 4 actions per state and add the rewards (value)
        for action in actions:
            # get next position and reward
            new_position = grid.p_transition(state, action)
            reward = grid.reward(state, action)
            # calculate value: 1/4[r + gamma * value(s')]
            value += (1/len(actions))*(reward+(discount_factor*grid.valueMap[new_position[0], new_position[1]]))
        # replace the value in valueMap with the value
        valueMap_copy[state[0], state[1]] = round(value,4)
    # overwrite the original value map
    grid.valueMap = valueMap_copy
    
    # print value map
    if it in [0,1,2,9, 99, 999, max_iters-1]:
        print("Iteration {}".format(it+1))
        print(grid.valueMap)
        print("")

Iteration 1
[[-0.5  10.   -0.25  5.   -0.5 ]
 [-0.25  0.    0.    0.   -0.25]
 [-0.25  0.    0.    0.   -0.25]
 [-0.25  0.    0.    0.   -0.25]
 [-0.5  -0.25 -0.25 -0.25 -0.5 ]]

Iteration 2
[[ 1.6875  9.75    3.4375  5.      0.4375]
 [-0.5     2.4375 -0.0625  1.1875 -0.5   ]
 [-0.4375 -0.0625  0.     -0.0625 -0.4375]
 [-0.5    -0.125  -0.0625 -0.125  -0.5   ]
 [-0.875  -0.5    -0.4375 -0.5    -0.875 ]]

Iteration 3
[[ 2.6562  9.5     4.2812  4.9375  0.8438]
 [ 0.5469  2.2812  1.7656  1.0938 -0.0781]
 [-0.625   0.4688 -0.0625  0.1562 -0.625 ]
 [-0.7344 -0.2812 -0.1719 -0.2812 -0.7344]
 [-1.1875 -0.7344 -0.625  -0.7344 -1.1875]]

Iteration 10
[[ 4.3514  8.351   5.3083  5.5598  2.0662]
 [ 2.4122  3.7326  3.0872  2.4848  0.9595]
 [ 0.4105  1.1042  1.0883  0.5556 -0.3513]
 [-1.1554 -0.5113 -0.457  -0.745  -1.4931]
 [-2.3268 -1.7418 -1.5663 -1.8444 -2.483 ]]

Iteration 100
[[ 3.3264  7.0172  4.4173  4.8326  1.703 ]
 [ 1.5906  2.8216  2.3572  1.8962  0.5286]
 [-0.4212  0.2766  0.2489 -0.1786

## Value Iteration 

## Policy Iteration 

## Testing 

In [8]:
# get transition: starting in state [0, 0] with action [0, 1], the new state is [0, 1]
state = [0, 0]
action = [0, 1] # move right
grid = Gridworld(5)
new_position = grid.p_transition(state, action)
new_position

array([0, 1])

In [9]:
# how to solve a system of equations
a = np.array([[0.5, 0, 0], [-0.5, 1, 0], [0, -0.5, 1]])
b = np.array([0,0,1])
x = np.linalg.solve(a,b)
x

array([0., 0., 1.])