In [None]:
import numpy as np

In [None]:
class GridWorld:
    """A 3x4 grid world with two reward cells and one wall, solved by iterated sweeps.

    Cells are addressed as ``(row, col)``. With the defaults set in ``__init__``:

    * ``(0, 3)`` carries reward ``+1`` and ``(1, 3)`` carries reward ``-1``; the
      value of these cells is pinned to their reward on every sweep and no policy
      is extracted for them.
    * ``(1, 1)`` is a wall: it can never be entered.

    Args:
        num_states: Stored on the instance; no method in this class reads it.
        num_actions: Stored on the instance; no method in this class reads it.
        gamma: Discount applied to the value of the successor cell.
        noise: Every action value is multiplied by ``1 - noise``.
        epsilon: Stored on the instance; no method in this class reads it.

    Attributes:
        V: Value table from the most recent ``value_iteration`` call, or ``None``.
        policy: ``{(row, col): action}`` from the most recent ``value_iteration``
            call, or ``None``.

    NOTE(review): ``noise`` only scales the value of the intended move; no
    probability mass is given to any other outcome. Confirm this is the intended
    transition model.
    """

    def __init__(self, num_states=12, num_actions=4, gamma=0.9, noise=0.2, epsilon=0.1):
        self.num_rows = 3
        self.num_cols = 4
        self.num_states = num_states
        self.num_actions = num_actions
        self.gamma = gamma
        self.noise = noise
        self.epsilon = epsilon
        self.grid_rewards = {
            (0, 3): 1,
            (1, 3): -1
        }
        self.grid_walls = [(1, 1)]
        self.actions = [(1, 0), (0, -1), (-1, 0), (0, 1)]  # Down, Left, Up, Right
        # Filled in by value_iteration() so results are available without
        # parsing the printed output.
        self.V = None
        self.policy = None

    def get_next_state(self, state, action):
        """Return the cell reached by applying ``action`` (a ``(d_row, d_col)`` pair) in ``state``.

        The move is rejected, and ``state`` is returned unchanged, when the
        target lies outside the grid or is a wall, or when ``state`` itself is
        a wall.
        """
        if state in self.grid_walls:
            return state
        next_state = (state[0] + action[0], state[1] + action[1])
        in_bounds = (0 <= next_state[0] < self.num_rows) and (0 <= next_state[1] < self.num_cols)
        if in_bounds and next_state not in self.grid_walls:
            return next_state
        return state

    def _action_value(self, V, state, action):
        """Return ``(1 - noise) * (reward + gamma * V[next])`` for taking ``action`` in ``state``."""
        next_state = self.get_next_state(state, action)
        reward = self.grid_rewards.get(next_state, 0)
        return (1 - self.noise) * (reward + self.gamma * V[next_state[0], next_state[1]])

    def value_iteration(self, num_iterations=100):
        """Run ``num_iterations`` sweeps, printing the table after each and the final policy.

        Each sweep sets every ordinary cell to the best action value over
        ``self.actions``, pins reward cells to their reward, leaves wall cells
        at 0, and then rescales the whole table by its largest absolute value.

        The resulting table and policy are stored on ``self.V`` and
        ``self.policy``. Returns ``None``.

        NOTE(review): the per-sweep rescaling is not part of textbook value
        iteration; it is kept as in the original code. Confirm it is intended.
        """
        V = np.zeros((self.num_rows, self.num_cols))
        for iteration in range(num_iterations):
            V_new = np.zeros((self.num_rows, self.num_cols))
            for i in range(self.num_rows):
                for j in range(self.num_cols):
                    state = (i, j)
                    if state in self.grid_rewards:
                        V_new[state] = self.grid_rewards[state]
                        continue
                    if state in self.grid_walls:
                        # Walls are never entered; their value stays 0.
                        continue

                    # Take the maximum over *all* actions. Previously the
                    # comparison sat outside the action loop, so only the last
                    # action (Right) was ever considered.
                    V_new[i, j] = max(
                        self._action_value(V, state, action) for action in self.actions
                    )

            max_abs_V = np.max(np.abs(V_new))
            if max_abs_V != 0:
                V_new /= max_abs_V

            V = V_new
            print(f"Iteration {iteration + 1}:")
            self.print_grid_values(V)

        policy = self.extract_policy(V)
        print("Policy after convergence:")
        self.print_grid_policy(policy)

        self.V = V
        self.policy = policy

    def _print_grid(self, format_cell):
        """Print the grid row by row; ``format_cell((row, col))`` renders ordinary cells."""
        for i in range(self.num_rows):
            for j in range(self.num_cols):
                cell = (i, j)
                if cell in self.grid_rewards:
                    text = f"| {self.grid_rewards[cell]} |"
                elif cell in self.grid_walls:
                    text = "| WALL |"
                else:
                    text = format_cell(cell)
                print(text, end="")
            print()

    def print_grid_values(self, V):
        """Print ``V`` as a grid (two decimals), showing rewards and ``WALL`` for special cells."""
        self._print_grid(lambda cell: f"| {V[cell[0], cell[1]]:.2f} |")
        print()

    def extract_policy(self, V):
        """Return ``{(row, col): action}`` choosing the highest-valued action under ``V``.

        Wall and reward cells are omitted. Ties go to the earliest action in
        ``self.actions``. The same action value as in ``value_iteration`` is
        used; the former ``noise * V[i, j]`` term was identical for every
        action of a cell and therefore had no influence on the choice.
        """
        policy = {}
        for i in range(self.num_rows):
            for j in range(self.num_cols):
                state = (i, j)
                if state in self.grid_walls or state in self.grid_rewards:
                    continue

                max_action = None
                max_value = float('-inf')
                for action in self.actions:
                    value = self._action_value(V, state, action)
                    if value > max_value:
                        max_value = value
                        max_action = action

                policy[state] = max_action
        return policy

    def print_grid_policy(self, policy):
        """Print ``policy`` as a grid of action names, with rewards and ``WALL`` for special cells."""
        action_symbols = {(-1, 0): 'up', (1, 0): 'down', (0, -1): 'left', (0, 1): 'right'}

        def format_cell(cell):
            if cell in policy:
                return f"| {action_symbols.get(policy[cell], ' ')} |"
            return "|   |"

        self._print_grid(format_cell)



In [None]:
# Build the default 3x4 grid world and run value iteration; the per-iteration
# value tables and the final policy are printed by the method itself.
grid_world = GridWorld()
grid_world.value_iteration(num_iterations=100)

Iteration 1:
| 0.00 || 0.00 || 0.80 || 1 |
| 0.00 || WALL || -0.80 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 2:
| 0.00 || 0.38 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 3:
| 0.21 || 0.57 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 4:
| 0.30 || 0.53 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 5:
| 0.29 || 0.54 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 6:
| 0.29 || 0.54 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 7:
| 0.29 || 0.54 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 8:
| 0.29 || 0.54 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 9:
| 0.29 || 0.54 || 1.00 || 1 |
| 0.00 || WALL || -1.00 || -1 |
| 0.00 || 0.00 || 0.00 || 0.00 |

Iteration 10:
| 0.2