In [1]:
import numpy as np

class Gridworld:
    def __init__(self):
        self.rows = 4
        self.cols = 4
        self.states = [(i, j) for i in range(self.rows) for j in range(self.cols)]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 0.9

    def transition_probability(self, state, action, next_state):
        return 0.00625

    def reward(self, state, action, next_state):
        return -1 if next_state != (3, 3) else 0

def policy_evaluation(gridworld, policy, theta):
    V = np.zeros((gridworld.rows, gridworld.cols))

    while True:
        delta = 0
        for i in range(gridworld.rows):
            for j in range(gridworld.cols):
                v = V[i, j]
                new_v = 0

                for a in gridworld.actions:
                    for next_i, next_j in [(i-1, j), (i+1, j), (i, j-1), (i, j+1)]:
                        next_i = max(0, min(gridworld.rows-1, next_i))
                        next_j = max(0, min(gridworld.cols-1, next_j))

                        transition_prob = gridworld.transition_probability((i, j), a, (next_i, next_j))
                        reward = gridworld.reward((i, j), a, (next_i, next_j))

                        new_v += policy[i, j, gridworld.actions.index(a)] * transition_prob * (reward + gridworld.gamma * V[next_i, next_j])

                V[i, j] = new_v
                delta = max(delta, np.abs(v - V[i, j]))

        if delta < theta:
            break

    return V

if __name__ == "__main__":
    gridworld = Gridworld()
    policy = np.ones((gridworld.rows, gridworld.cols, len(gridworld.actions))) / len(gridworld.actions)
    theta = 0.0001

    result = policy_evaluation(gridworld, policy, theta)

    print("Estimated State Values:")
    print(result)



Estimated State Values:
[[-0.02557525 -0.02557535 -0.02557535 -0.02557516]
 [-0.02557535 -0.02557541 -0.02557482 -0.02553945]
 [-0.02557535 -0.02557482 -0.02550389 -0.01921754]
 [-0.02557516 -0.02553945 -0.01921754 -0.01286086]]
