In [7]:
class GridWorld:
    """Deterministic rectangular grid MDP with a single absorbing goal cell.

    States are ``(row, col)`` tuples, enumerated row by row in ``self.states``.
    The four moves in ``self.actions`` shift the agent by one cell; a move that
    would leave the grid is clamped at the border, so the agent stays put.

    NOTE(review): ``get_reward`` only inspects ``next_state``. Because the goal
    maps back to itself in ``get_next_state``, every step taken while already
    in the goal also pays ``goal_reward``. Confirm this is the intended
    reward model (the goal then has a non-zero discounted value).
    """

    def __init__(self, rows, cols, goal_state, step_reward=0, goal_reward=1):
        """Store the grid dimensions, goal cell and reward constants.

        Args:
            rows: Number of grid rows.
            cols: Number of grid columns.
            goal_state: ``(row, col)`` of the absorbing goal cell.
            step_reward: Reward for a transition that does not end in the goal.
            goal_reward: Reward for a transition that ends in the goal.
        """
        self.rows = rows
        self.cols = cols
        self.goal_state = goal_state
        self.step_reward = step_reward
        self.goal_reward = goal_reward
        self.actions = ['up', 'down', 'left', 'right']
        # Row-major enumeration of every cell.
        self.states = []
        for r in range(rows):
            for c in range(cols):
                self.states.append((r, c))

    def get_next_state(self, state, action):
        """Return the cell reached by taking ``action`` in ``state``.

        The goal is absorbing: any action taken there returns ``state`` itself.
        An action name outside the four known moves leaves the position
        unchanged.
        """
        if state == self.goal_state:
            return state
        row, col = state
        # Each move clamps only against the border it can actually cross.
        if action == 'up':
            return (max(row - 1, 0), col)
        if action == 'down':
            return (min(row + 1, self.rows - 1), col)
        if action == 'left':
            return (row, max(col - 1, 0))
        if action == 'right':
            return (row, min(col + 1, self.cols - 1))
        return (row, col)

    def get_reward(self, state, action, next_state):
        """Return ``goal_reward`` if ``next_state`` is the goal, else ``step_reward``.

        ``state`` and ``action`` are accepted for interface compatibility but
        are not used.
        """
        if next_state == self.goal_state:
            return self.goal_reward
        return self.step_reward

    def transition_model(self, state, action):
        """Return the outcome distribution as a list of ``(probability, next_state)``.

        The environment is deterministic, so the list always holds exactly one
        entry with probability ``1.0``.
        """
        return [(1.0, self.get_next_state(state, action))]

In [9]:
def value_iteration(states, actions, transition_model, rewards, gamma=0.9, theta=1e-6):
    """Compute state values and a greedy policy by value iteration.

    Each sweep visits the states in the given order and overwrites ``V[s]``
    immediately, so later states in the same sweep already see the updated
    values. Sweeping stops once the largest change in a sweep is below
    ``theta``.

    Args:
        states: Iterable of hashable states.
        actions: Iterable of actions, tried in order for every state.
        transition_model: ``f(state, action)`` returning an iterable of
            ``(probability, next_state)`` pairs.
        rewards: ``f(state, action, next_state)`` returning the reward.
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold on the per-sweep maximum value change.

    Returns:
        ``(V, policy)``: dicts keyed by state holding the final value and the
        action that achieved it. When several actions share the best value,
        the one listed first in ``actions`` wins.
    """
    V = {state: 0 for state in states}
    policy = {state: None for state in states}

    def backup(state, action):
        # Expected one-step return of `action` in `state` under the current V.
        return sum(
            prob * (rewards(state, action, nxt) + gamma * V[nxt])
            for prob, nxt in transition_model(state, action)
        )

    converged = False
    while not converged:
        largest_change = 0
        for state in states:
            top_value, top_action = float('-inf'), None
            for action in actions:
                candidate = backup(state, action)
                # Strict comparison keeps the earliest action on ties.
                if candidate > top_value:
                    top_value, top_action = candidate, action
            change = abs(V[state] - top_value)
            if change > largest_change:
                largest_change = change
            V[state] = top_value
            policy[state] = top_action
        converged = largest_change < theta
    return V, policy

In [11]:
def policy_iteration(states, actions, transition_model, rewards, gamma=0.9, theta=1e-6):
    """Compute state values and a policy by policy iteration.

    Alternates two phases until the policy stops changing:

    1. Policy evaluation: in-place sweeps over ``states`` that update ``V``
       for the current policy until the largest change in a sweep is below
       ``theta``. ``V`` is carried over between rounds as a warm start.
    2. Policy improvement: for every state, look for an action whose
       one-step lookahead value beats the current action's.

    The current action is replaced only when another action is better by more
    than ``theta``. Differences at or below the evaluation tolerance are
    treated as ties, so the loop cannot keep switching between equally good
    actions, and an exact tie never costs an extra evaluation round.

    Args:
        states: Iterable of hashable states (iterated several times).
        actions: Sequence of actions; ``actions[0]`` seeds the initial policy.
        transition_model: ``f(state, action)`` returning an iterable of
            ``(probability, next_state)`` pairs.
        rewards: ``f(state, action, next_state)`` returning the reward.
        gamma: Discount factor applied to the successor state's value.
        theta: Tolerance used both for evaluation convergence and as the
            minimum Q-value gain required to change an action.

    Returns:
        ``(V, policy)``: dicts keyed by state with the evaluated values and
        the chosen action for each state.
    """
    def q_value(s, a):
        # Expected one-step return of taking `a` in `s` under the current V.
        return sum(p * (rewards(s, a, s_) + gamma * V[s_]) for p, s_ in transition_model(s, a))

    policy = {s: actions[0] for s in states}
    V = {s: 0 for s in states}
    while True:
        # Policy Evaluation
        while True:
            delta = 0
            for s in states:
                v = V[s]
                V[s] = q_value(s, policy[s])
                delta = max(delta, abs(v - V[s]))
            if delta < theta:
                break
        # Policy Improvement
        policy_stable = True
        for s in states:
            old_action = policy[s]
            old_q = q_value(s, old_action)
            scored = [(q_value(s, a), a) for a in actions]
            # max() keeps the first maximal entry, i.e. the earliest action.
            best_q, best_action = max(scored, key=lambda pair: pair[0])
            # Switch only on a real improvement; ties keep the current action.
            if best_action != old_action and best_q - old_q > theta:
                policy[s] = best_action
                policy_stable = False
        if policy_stable:
            break
    return V, policy

In [15]:

# Demo: solve a 2x2 grid with both algorithms and print policy and values.
env = GridWorld(rows=2, cols=2, goal_state=(1, 1), step_reward=1, goal_reward=10)
states, actions = env.states, env.actions
transition_model, rewards = env.transition_model, env.get_reward
goal = env.goal_state

# Same solve-and-report sequence for each algorithm; the banner strings carry
# the exact spacing of the printed report (blank line before the second one).
for banner, solver in (
    ("Value Iteration\nOptimal Policy:", value_iteration),
    ("\nPolicy Iteration\nOptimal Policy:", policy_iteration),
):
    V, policy = solver(states, actions, transition_model, rewards)
    policy[goal] = None  # no action is reported for the goal cell
    print(banner)
    for s in sorted(policy):
        print(f"{s}: {policy[s]}")
    print("\nState Values:")
    for s in sorted(V):
        print(f"{s}: {V[s]:.2f}")

Value Iteration
Optimal Policy:
(0, 0): down
(0, 1): down
(1, 0): right
(1, 1): None

State Values:
(0, 0): 91.00
(0, 1): 100.00
(1, 0): 100.00
(1, 1): 100.00

Policy Iteration
Optimal Policy:
(0, 0): down
(0, 1): down
(1, 0): right
(1, 1): None

State Values:
(0, 0): 91.00
(0, 1): 100.00
(1, 0): 100.00
(1, 1): 100.00
