In [1]:
import numpy as np

def gridworld(threshold=0.1, gamma=0.9):
    """Sweep a 5x5 grid of state values until they stop changing.

    Each sweep recomputes every cell from a snapshot of the previous
    sweep's values:

    * Cell ``A`` (0, 1) is set to ``10 + gamma * value[A_prime]`` and cell
      ``B`` (0, 3) to ``5 + gamma * value[B_prime]``, regardless of action.
    * Every other cell becomes the plain average, over the four moves
      (up, down, right, left), of ``reward + gamma * value[next]``. A move
      that would leave the grid has reward -1 and stays in place; any
      other move has reward 0.

    The grid is printed before the first sweep, after every sweep, and once
    more at the end via ``print_grid``.

    Args:
        threshold: Sweeping stops once the largest absolute per-cell change
            in a sweep is no longer greater than this value.
        gamma: Multiplier applied to the successor cell's value.

    Returns:
        The number of sweeps performed.
    """
    height, width = 5, 5
    values = np.zeros((height, width))

    # Special cells and the cells they jump to.
    A, A_prime = (0, 1), (4, 1)
    B, B_prime = (0, 3), (2, 3)

    actions = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    iteration = 0
    max_diff = float('inf')

    print("Initial values (Iteration 0):")
    print_grid(values)
    print()

    while max_diff > threshold:
        # Snapshot so that every cell in this sweep reads last sweep's values.
        old_values = values.copy()

        for cell in np.ndindex(height, width):
            if cell == A:
                values[cell] = 10 + gamma * old_values[A_prime]
                continue
            if cell == B:
                values[cell] = 5 + gamma * old_values[B_prime]
                continue

            row, col = cell
            backups = []
            for d_row, d_col in actions:
                next_row, next_col = row + d_row, col + d_col
                on_grid = 0 <= next_row < height and 0 <= next_col < width
                if on_grid:
                    backups.append(0 + gamma * old_values[next_row, next_col])
                else:
                    # Bumping into the wall: penalty, position unchanged.
                    backups.append(-1 + gamma * old_values[row, col])

            values[cell] = sum(backups) / len(backups)

        max_diff = np.abs(values - old_values).max()
        iteration += 1

        print(f"Iteration {iteration} (Max diff: {max_diff:.4f}):")
        print_grid(values)
        print()

    print(f"Final values after {iteration} iterations (max difference: {max_diff:.4f}):")
    print_grid(values)
    return iteration

def print_grid(values):
    """Print a 2-D array one row per line.

    Every entry is formatted as ``6.2f`` and followed by a single space
    (so each line ends with a trailing space before the newline).

    Args:
        values: Object exposing ``shape`` and ``values[row, col]`` indexing,
            e.g. a 2-D numpy array.
    """
    for row in range(values.shape[0]):
        line = "".join(
            f"{values[row, col]:6.2f} " for col in range(values.shape[1])
        )
        print(line)

# Run the sweep with the default settings spelled out and report how many
# sweeps it took before the largest per-cell change fell to the threshold.
iterations = gridworld(gamma=0.9, threshold=0.1)
print(f"Converged after {iterations} iterations")

Initial values (Iteration 0):
  0.00   0.00   0.00   0.00   0.00 
  0.00   0.00   0.00   0.00   0.00 
  0.00   0.00   0.00   0.00   0.00 
  0.00   0.00   0.00   0.00   0.00 
  0.00   0.00   0.00   0.00   0.00 

Iteration 1 (Max diff: 10.0000):
 -0.50  10.00  -0.25   5.00  -0.50 
 -0.25   0.00   0.00   0.00  -0.25 
 -0.25   0.00   0.00   0.00  -0.25 
 -0.25   0.00   0.00   0.00  -0.25 
 -0.50  -0.25  -0.25  -0.25  -0.50 

Iteration 2 (Max diff: 3.3188):
  1.47   9.78   3.07   5.00   0.34 
 -0.48   2.19  -0.06   1.07  -0.48 
 -0.42  -0.06   0.00  -0.06  -0.42 
 -0.48  -0.11  -0.06  -0.11  -0.48 
 -0.84  -0.48  -0.42  -0.48  -0.84 

Iteration 3 (Max diff: 1.4808):
  2.25   9.57   3.75   4.95   0.67 
  0.37   2.07   1.42   0.99  -0.13 
 -0.57   0.37  -0.05   0.12  -0.57 
 -0.66  -0.24  -0.14  -0.24  -0.66 
 -1.09  -0.66  -0.57  -0.66  -1.09 

Iteration 4 (Max diff: 0.5752):
  2.75   9.40   4.18   5.11   0.89 
  0.68   2.64   1.52   1.43  -0.03 
 -0.36   0.27   0.40   0.03  -0.53 
 -0.83  -