###Simple grid-world solution

In [None]:
import numpy as np
import torch

class GridWorld:
    def __init__(self, width, height, start, goal, traps):
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.traps = traps
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        self.num_actions = len(self.actions)
        self.Q_values = torch.zeros((width, height, self.num_actions), dtype=torch.float32).cuda()

    def is_valid_location(self, x, y):
        return 0 <= x < self.width and 0 <= y < self.height

    def get_reward(self, x, y):
        if (x, y) == self.goal:
            return 100
        elif (x, y) in self.traps:
            return -100
        else:
            return -1

    def get_next_state(self, x, y, action):
        dx, dy = self.actions[action]
        new_x, new_y = x + dx, y + dy
        if self.is_valid_location(new_x, new_y):
            return new_x, new_y
        else:
            return x, y

    def q_learning(self, num_episodes, alpha, gamma, epsilon):
        for episode in range(num_episodes):
            x, y = self.start
            while (x, y) != self.goal:
                if np.random.uniform(0, 1) < epsilon:
                    action = np.random.randint(self.num_actions)
                else:
                    action = torch.argmax(self.Q_values[x, y]).item()
                next_x, next_y = self.get_next_state(x, y, action)
                reward = self.get_reward(next_x, next_y)
                next_action_value = torch.max(self.Q_values[next_x, next_y])
                self.Q_values[x, y, action] += alpha * (reward + gamma * next_action_value - self.Q_values[x, y, action])
                x, y = next_x, next_y
            print(f"Iteration {episode + 1}:")
            self.print_board()

    def print_board(self):
      board = np.zeros((self.width, self.height), dtype=str)
      q_values = np.zeros((self.width, self.height, self.num_actions), dtype=float)
      for x in range(self.width):
          for y in range(self.height):
              if (x, y) == self.goal:
                  board[x, y] = 'G'
              elif (x, y) in self.traps:
                  board[x, y] = 'T'
              else:
                  for action in range(self.num_actions):
                      q_values[x, y, action] = self.Q_values[x, y, action]
                  board[x, y] = '.'
      print("Q-values:")
      print(q_values)
      print("Grid:")
      print(board)
    def print_optimal_board(self):
      optimal_board = np.zeros((self.width, self.height), dtype=str)
      q_values_cpu = self.Q_values.cpu().numpy()
      for x in range(self.width):
          for y in range(self.height):
              if (x, y) == self.goal:
                  optimal_board[x, y] = 'G'
              elif (x, y) in self.traps:
                  optimal_board[x, y] = 'T'
              else:
                  optimal_action = np.argmax(q_values_cpu[x, y])
                  if optimal_action == 0:
                      optimal_board[x, y] = '→'
                  elif optimal_action == 1:
                      optimal_board[x, y] = '←'
                  elif optimal_action == 2:
                      optimal_board[x, y] = '↓'
                  elif optimal_action == 3:
                      optimal_board[x, y] = '↑'
      print("Optimal Board:")
      print(optimal_board)

# Example usage:
width, height = 4, 4
start = (0, 0)
goal = (3, 3)
traps = [(0, 3), (1, 1), (3, 1), (3, 2)]
grid_world = GridWorld(width, height, start, goal, traps)
num_episodes = 5
alpha = 0.1
gamma = 0.9
epsilon = 0.1
grid_world.q_learning(num_episodes, alpha, gamma, epsilon)
grid_world.print_optimal_board()

Iteration 1:
Q-values:
[[[ -0.1  -0.1  -0.1   0. ]
  [ -0.1  -0.1   0.    0. ]
  [-10.   -0.1   0.    0. ]
  [  0.    0.    0.    0. ]]

 [[-10.   -0.1  -0.1   0. ]
  [  0.    0.    0.    0. ]
  [ -0.1 -10.    0.    0. ]
  [ -0.1  -0.1   0.    0. ]]

 [[ -0.1  -0.1  -0.1   0. ]
  [ -0.1  -0.1   0.    0. ]
  [ -0.1  -0.1   0.    0. ]
  [ -0.1  -0.1   0.    0. ]]

 [[-10.    0.    0.    0. ]
  [  0.    0.    0.    0. ]
  [  0.    0.    0.    0. ]
  [  0.    0.    0.    0. ]]]
Grid:
[['.' '.' '.' 'T']
 ['.' 'T' '.' '.']
 ['.' '.' '.' '.']
 ['.' 'T' 'T' 'G']]
Iteration 2:
Q-values:
[[[ -0.19        -0.1         -0.1         -0.1       ]
  [ -0.19        -0.1        -10.          -0.1       ]
  [-10.          -0.1         -0.1          0.        ]
  [  0.           0.           0.           0.        ]]

 [[-10.          -0.1         -0.1          0.        ]
  [  0.           0.           0.           0.        ]
  [ -0.1        -10.          -0.1          0.        ]
  [ -0.1         -0.1

In [None]:
!pip install stable_baselines3

Collecting stable_baselines3
  Downloading stable_baselines3-2.2.1-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.7/181.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable_baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable_baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable_baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 stable_baselines3-2.2.1


In [None]:
!pip install shimmy

Collecting shimmy
  Downloading Shimmy-1.3.0-py3-none-any.whl (37 kB)
Installing collected packages: shimmy
Successfully installed shimmy-1.3.0


###Grid world solution using environment

In [None]:
import gym
from stable_baselines3 import PPO

In [None]:
import numpy as np
import gym
from stable_baselines3 import PPO

class GridWorld(gym.Env):
    def __init__(self, width, height, start, goal, traps):
        super(GridWorld, self).__init__()
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.traps = traps
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # right, left, down, up
        self.num_actions = len(self.actions)
        self.observation_space = gym.spaces.Box(low=np.array([0, 0]), high=np.array([width, height]), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(self.num_actions)
        self.state = self.start

    def is_valid_location(self, x, y):
        return 0 <= x < self.width and 0 <= y < self.height

    def get_reward(self, x, y):
        if (x, y) == self.goal:
            return 100
        elif (x, y) in self.traps:
            return -100
        else:
            return -1

    def get_next_state(self, x, y, action):
        dx, dy = self.actions[action]
        new_x, new_y = x + dx, y + dy
        if self.is_valid_location(new_x, new_y):
            return new_x, new_y
        else:
            return x, y

    def reset(self):
        # Reset the environment to the start state
        self.state = self.start
        return np.array(self.state)

    def step(self, action):
        x, y = self.state
        next_x, next_y = self.get_next_state(x, y, action)
        reward = self.get_reward(next_x, next_y)
        done = (next_x, next_y) == self.goal
        self.state = (next_x, next_y)
        return np.array(self.state), reward, done, {}

    def render(self, mode='human'):
        # Render the environment (optional)
        pass

# Create an instance of the GridWorld environment
width, height = 5, 5
start = (0, 0)
goal = (4, 4)
traps = [(3, 4)]
env = GridWorld(width, height, start, goal, traps)

# Create and train a PPO agent
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save the trained model
model.save("ppo_gridworld")


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -167     |
| time/              |          |
|    fps             | 575      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 69.2        |
|    ep_rew_mean          | -77.4       |
| time/                   |             |
|    fps                  | 479         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010341149 |
|    clip_fraction        | 0.0899      |
|    clip_range           | 0.2         |
|    entropy_loss  

###implementation of grid world for the purpose of printing iterated progress

In [None]:
import numpy as np
import gym
from stable_baselines3 import PPO

class GridWorld(gym.Env):
    def __init__(self, width, height, start, goal, traps):
        super(GridWorld, self).__init__()
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.traps = traps
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # right, left, down, up
        self.num_actions = len(self.actions)
        self.observation_space = gym.spaces.Box(low=np.array([0, 0]), high=np.array([width, height]), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(self.num_actions)
        self.state = self.start
        self.iteration = 0

    def is_valid_location(self, x, y):
        return 0 <= x < self.width and 0 <= y < self.height

    def get_reward(self, x, y):
        if (x, y) == self.goal:
            return 100
        elif (x, y) in self.traps:
            return -100
        else:
            return -1

    def get_next_state(self, x, y, action):
        print(self.render())
        dx, dy = self.actions[action]
        new_x, new_y = x + dx, y + dy
        if self.is_valid_location(new_x, new_y):
            return new_x, new_y
        else:
            return x, y

    def reset(self):
        # Reset the environment to the start state
        self.state = self.start
        return np.array(self.state)

    def step(self, action):
        x, y = self.state
        next_x, next_y = self.get_next_state(x, y, action)
        reward = self.get_reward(next_x, next_y)
        done = (next_x, next_y) == self.goal
        self.state = (next_x, next_y)
        return np.array(self.state), reward, done, {}

    def render(self, mode='human'):
        # Render the environment (optional)
        grid = np.zeros((self.height, self.width), dtype=int)
        grid[self.goal[1]][self.goal[0]] = 2  # goal
        for trap in self.traps:
            grid[trap[1]][trap[0]] = -1  # traps
        grid[self.state[1]][self.state[0]] = 1  # agent
        print("Iteration:", self.iteration)
        print(grid)

# Create an instance of the GridWorld environment
width, height = 5, 5
start = (0, 0)
goal = (4, 4)
traps = [(3, 4)]
env = GridWorld(width, height, start, goal, traps)

# Create and train a PPO agent
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save the trained model
model.save("ppo_gridworld")


  and should_run_async(code)
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0 -1  2]]
None
Iteration: 0
[[ 0  0  0  0  0]
 [ 0  0 

###Using Frozen lake environment

In [None]:
import numpy as np
import gym
from stable_baselines3 import PPO

class FrozenLake(gym.Env):
    def __init__(self, map_name='4x4'):
        super(FrozenLake, self).__init__()
        self.env = gym.make("FrozenLake-v1", map_name=map_name)
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.iteration = 0
        self.render_frequency = 5

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def render(self, mode='human'):
        if self.iteration % self.render_frequency == 0:
            self.env.render(mode)

# Create an instance of the FrozenLake environment
env = FrozenLake(map_name='4x4')

# Create and train a PPO agent
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save the trained model
model.save("ppo_frozenlake")


###Using stable baselines3

In [None]:
import numpy as np
import gym
from stable_baselines3 import PPO

class FrozenLake(gym.Env):
    def __init__(self, map_name='4x4'):
        super(FrozenLake, self).__init__()
        self.env = gym.make("FrozenLake-v1", map_name=map_name)
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.iteration = 0
        self.render_frequency = 5

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def render(self, mode='human'):
        if self.iteration % self.render_frequency == 0:
            print("Iteration:", self.iteration)
            self.env.render(mode)

# Create an instance of the FrozenLake environment
env = FrozenLake(map_name='4x4')

# Create and train a PPO agent
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save the trained model
model.save("ppo_frozenlake")


In [2]:
!pip install stable_baselines3

  and should_run_async(code)


Collecting stable_baselines3
  Downloading stable_baselines3-2.2.1-py3-none-any.whl (181 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/181.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.7/181.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable_baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable_baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m59.9 MB/s[0m

###Using policy iterations

In [3]:
import numpy as np
import gym
from stable_baselines3 import PPO

class GridWorld(gym.Env):
    def __init__(self, width, height, start, goal, traps):
        super(GridWorld, self).__init__()
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.traps = traps
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # right, left, down, up
        self.num_actions = len(self.actions)
        self.observation_space = gym.spaces.Box(low=np.array([0, 0]), high=np.array([width, height]), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(self.num_actions)
        self.state = self.start
        self.iteration = 0

    def is_valid_location(self, x, y):
        return 0 <= x < self.width and 0 <= y < self.height

    def get_reward(self, x, y):
        if (x, y) == self.goal:
            return 100
        elif (x, y) in self.traps:
            return -100
        else:
            return -1

    def get_next_state(self, x, y, action):
        dx, dy = self.actions[action]
        new_x, new_y = x + dx, y + dy
        if self.is_valid_location(new_x, new_y):
            return new_x, new_y
        else:
            return x, y

    def reset(self):
        self.state = self.start
        return np.array(self.state)

    def step(self, action):
        x, y = self.state
        next_x, next_y = self.get_next_state(x, y, action)
        reward = self.get_reward(next_x, next_y)
        done = (next_x, next_y) == self.goal
        self.state = (next_x, next_y)
        return np.array(self.state), reward, done, {}

    def render(self, mode='human'):
        grid = np.zeros((self.height, self.width), dtype=int)
        grid[self.goal[1]][self.goal[0]] = 2  # goal
        for trap in self.traps:
            grid[trap[1]][trap[0]] = -1  # traps
        grid[self.state[1]][self.state[0]] = 1  # agent
        print("Iteration:", self.iteration)
        print(grid)

    def policy_evaluation(self, policy, discount_factor=0.9, theta=1e-6):
        V = np.zeros((self.width, self.height))
        while True:
            delta = 0
            for i in range(self.width):
                for j in range(self.height):
                    v = V[i, j]
                    action = policy[i, j]
                    x, y = i, j
                    next_x, next_y = self.get_next_state(x, y, action)
                    reward = self.get_reward(next_x, next_y)
                    V[i, j] = reward + discount_factor * V[next_x, next_y]
                    delta = max(delta, abs(v - V[i, j]))
            if delta < theta:
                break
        return V

    def policy_improvement(self, V, discount_factor=0.9):
        policy = np.zeros((self.width, self.height), dtype=int)
        for i in range(self.width):
            for j in range(self.height):
                max_value = -np.inf
                best_action = None
                for action in range(self.num_actions):
                    x, y = i, j
                    next_x, next_y = self.get_next_state(x, y, action)
                    reward = self.get_reward(next_x, next_y)
                    value = reward + discount_factor * V[next_x, next_y]
                    if value > max_value:
                        max_value = value
                        best_action = action
                policy[i, j] = best_action
        return policy

    def policy_iteration(self, discount_factor=0.9, theta=1e-6):
        policy = np.random.randint(0, self.num_actions, size=(self.width, self.height))
        while True:
            V = self.policy_evaluation(policy, discount_factor, theta)
            new_policy = self.policy_improvement(V, discount_factor)
            if np.array_equal(policy, new_policy):
                break
            policy = new_policy
        return policy

# Create an instance of the GridWorld environment
width, height = 5, 5
start = (0, 0)
goal = (4, 4)
traps = [(3, 4)]
env = GridWorld(width, height, start, goal, traps)

# Policy iteration
optimal_policy = env.policy_iteration()

# Display the optimal policy
print("Optimal Policy:")
print(optimal_policy)


Optimal Policy:
[[0 0 0 2 2]
 [0 0 0 2 2]
 [0 0 0 2 2]
 [0 0 0 2 2]
 [0 0 0 0 0]]


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


###Using multi-agent policy iteration

In [5]:
import numpy as np
import gym
from stable_baselines3 import PPO

class GridWorld(gym.Env):
    def __init__(self, width, height, start, goal, traps):
        super(GridWorld, self).__init__()
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.traps = traps
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        self.num_actions = len(self.actions)
        self.observation_space = gym.spaces.Box(low=np.array([0, 0]), high=np.array([width, height]), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(self.num_actions)
        self.iteration = 0
        self.agents = 3
        self.reset()

    def is_valid_location(self, x, y):
        return 0 <= x < self.width and 0 <= y < self.height and (x, y) not in self.traps

    def get_reward(self, x, y):
        if (x, y) == self.goal:
            return 100
        elif (x, y) in self.traps:
            return -100
        else:
            return -1

    def get_next_state(self, x, y, action):
        dx, dy = self.actions[action]
        new_x, new_y = x + dx, y + dy
        if self.is_valid_location(new_x, new_y):
            return new_x, new_y
        else:
            return x, y

    def reset(self):
        self.width = max(self.width, 4)  # Minimum width
        self.height = max(self.height, 4)  # Minimum height
        self.state = [self.start] * self.agents
        return np.array(self.state)

    def step(self, actions):
        rewards = []
        dones = []
        for i, action in enumerate(actions):
            x, y = self.state[i]
            next_x, next_y = self.get_next_state(x, y, action)
            reward = self.get_reward(next_x, next_y)
            done = (next_x, next_y) == self.goal
            self.state[i] = (next_x, next_y)
            rewards.append(reward)
            dones.append(done)
        return np.array(self.state), np.array(rewards), all(dones), {}

    def render(self, mode='human'):
        grid = np.zeros((self.height, self.width), dtype=int)
        grid[self.goal[1]][self.goal[0]] = 2  # goal
        for trap in self.traps:
            grid[trap[1]][trap[0]] = -1  # traps
        for agent_pos in self.state:
            grid[agent_pos[1]][agent_pos[0]] = 1  # agent
        print("Iteration:", self.iteration)
        print(grid)

    def policy_evaluation(self, policy, discount_factor=0.9, theta=1e-6):
        V = np.zeros((self.width, self.height))
        while True:
            delta = 0
            for i in range(self.width):
                for j in range(self.height):
                    v = V[i, j]
                    action = policy[i, j]
                    x, y = i, j
                    next_x, next_y = self.get_next_state(x, y, action)
                    reward = self.get_reward(next_x, next_y)
                    V[i, j] = reward + discount_factor * V[next_x, next_y]
                    delta = max(delta, abs(v - V[i, j]))
            print(V)
            if delta < theta:
                break
        return V

    def policy_improvement(self, V, discount_factor=0.9):
        policy = np.zeros((self.width, self.height), dtype=int)
        for i in range(self.width):
            for j in range(self.height):
                max_value = -np.inf
                best_action = None
                for action in range(self.num_actions):
                    x, y = i, j
                    next_x, next_y = self.get_next_state(x, y, action)
                    reward = self.get_reward(next_x, next_y)
                    value = reward + discount_factor * V[next_x, next_y]
                    if value > max_value:
                        max_value = value
                        best_action = action
                policy[i, j] = best_action
        return policy

    def policy_iteration(self, discount_factor=0.9, theta=1e-6):
        policy = np.random.randint(0, self.num_actions, size=(self.width, self.height))
        while True:
            V = self.policy_evaluation(policy, discount_factor, theta)
            new_policy = self.policy_improvement(V, discount_factor)
            if np.array_equal(policy, new_policy):
                break
            policy = new_policy
        return policy

# Create an instance of the GridWorld environment
width, height = 4, 4  # Initial size
start = (0, 0)
goal = (width - 1, height - 1)
traps = [(1, 1), (2, 2)]  # Obstructed corners
env = GridWorld(width, height, start, goal, traps)

# Policy iteration
optimal_policy = env.policy_iteration()

# Display the optimal policy
print("Optimal Policy:")
print(optimal_policy)


  and should_run_async(code)
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


[[ -1.    -1.    -1.    -1.9 ]
 [ -1.9   -1.    -1.    -2.71]
 [ -1.    -1.    -1.9   -1.  ]
 [ -1.9   -1.    -1.   100.  ]]
[[ -1.9     -1.9     -2.71    -3.439 ]
 [ -2.71    -1.9     -1.9     -4.0951]
 [ -1.9     -1.9     -2.71    -1.9   ]
 [ -2.71    -1.9     -1.9    190.    ]]
[[ -2.71      -3.439     -4.0951    -4.68559 ]
 [ -3.439     -2.71      -2.71      -5.217031]
 [ -2.71      -2.71      -3.439     -2.71    ]
 [ -3.439     -2.71      -2.71     271.      ]]
[[ -4.0951      -4.68559     -5.217031    -5.6953279 ]
 [ -4.68559     -3.439       -3.439       -6.12579511]
 [ -3.439       -3.439       -4.0951      -3.439     ]
 [ -4.0951      -3.439       -3.439      343.9       ]]
[[ -5.217031    -5.6953279   -6.12579511  -6.5132156 ]
 [ -5.6953279   -4.0951      -4.0951      -6.86189404]
 [ -4.0951      -4.0951      -4.68559     -4.0951    ]
 [ -4.68559     -4.0951      -4.0951     409.51      ]]
[[ -6.12579511  -6.5132156   -6.86189404  -7.17570464]
 [ -6.5132156   -4.68559     -4.