**Objective:** 
Your task is to program an agent to find the optimal policy for navigating a labyrinth from a specified starting point to a goal point using the Value Iteration algorithm.

**Step 1: Familiarize Yourself with the Environment**
- Understand the structure of the `Labyrinth` class and how it represents the labyrinth environment, including walls, the starting point, and the goal.
- Familiarize yourself with how the `Agent` class is structured, and how it interacts with the labyrinth environment.

**Step 2: Implement Value Iteration**
- Create a function or method to implement the Value Iteration algorithm.
- You'll need to initialize a utility table with zeros and iteratively update the utilities of each state (i.e., each cell in the labyrinth) based on the Bellman equation.
- Stop iterating once the maximum change in utility over all states during a sweep falls below a small threshold, e.g. 0.01.
- Once the utilities have converged, use them to compute the optimal policy, which specifies the best action to take in each state.

**Step 3: Modify the Agent Class**
- Modify the `act` method of the `Agent` class to use the optimal policy derived from Value Iteration instead of taking random actions.
- Optionally, you can also modify the `update` method to incorporate any additional learning or updating you wish to implement.

**Step 4: Run the Simulation**
- Run the provided simulation loop, where the agent is placed in the labyrinth and must navigate to the goal.
- Observe how the agent's behavior changes as it learns the optimal policy.
- You might want to add some print statements or other logging to help visualize the agent's path through the labyrinth and how it improves over time.



In [34]:
import numpy as np
import random 
from time import sleep

class Labyrinth:
    """Grid-world maze that tracks a single agent position.

    ``grid`` holds 0 for free cells and -1 for wall cells. Positions are
    ``(row, col)`` tuples that index directly into ``grid``.
    """

    def __init__(self, rows, cols, walls, start, goal):
        """Create a ``rows`` x ``cols`` grid, mark the walls and render once.

        Args:
            rows: Number of grid rows.
            cols: Number of grid columns.
            walls: Iterable of ``(row, col)`` cells that cannot be entered.
            start: ``(row, col)`` cell the agent is put on by ``reset``.
            goal: ``(row, col)`` cell that yields the reward and ends a run.
        """
        self.walls = walls
        self.grid = np.zeros((rows, cols))
        for wall in walls:
            self.grid[wall] = -1  # Assign -1 for walls
        self.start = start
        self.goal = goal
        self.current_position = start

        self.render()

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.current_position = self.start
        return self.current_position

    def render(self, epoch=0, t=0, sleep_time=1):
        """Print the grid with the agent drawn as 2 and the goal as 9.

        Args:
            epoch: Episode number shown in the header line.
            t: Time step shown in the header line.
            sleep_time: Seconds to pause after drawing.
        """
        print(f'epoch: {epoch}, t: {t}')
        # Draw on a copy so the markers never leak into the real grid. The
        # goal is written last, so it wins when the agent stands on it.
        grid_copy = self.grid.copy()
        grid_copy[self.current_position] = 2
        grid_copy[self.goal] = 9
        # `display` only exists when running under IPython/Jupyter; fall back
        # to a plain print so the class also works in a regular interpreter.
        try:
            show = display  # noqa: F821
        except NameError:
            show = print
        show(grid_copy)
        print('-' * 50)
        sleep(sleep_time)

    def step(self, action: tuple[int, int]) -> tuple[tuple[int, int], int]:
        """Apply a ``(delta_row, delta_col)`` move and report the outcome.

        A move into a wall or off the grid leaves the position unchanged.

        Returns:
            ``(position, reward)`` where reward is 1 when the resulting
            position is the goal and 0 otherwise.
        """
        new_position = (
            self.current_position[0] + action[0],
            self.current_position[1] + action[1],
        )
        if self.is_valid_move(new_position):
            self.current_position = new_position
        reward = 1 if self.current_position == self.goal else 0
        return self.current_position, reward

    def is_valid_move(self, position) -> bool:
        """Return True when ``position`` is on the grid and not a wall."""
        rows, cols = self.grid.shape
        # The bounds checks must come first: a negative index would otherwise
        # wrap around and an oversized one would raise IndexError.
        return bool(
            0 <= position[0] < rows
            and 0 <= position[1] < cols
            and self.grid[position] != -1
        )

    def done(self):
        """Return True when the agent is standing on the goal cell."""
        return self.current_position == self.goal

class Action:
    """A named move together with the cell that move would lead to.

    ``action`` is a key of ``Agent.actions`` (for example ``'left'``) and
    ``state`` is the resulting ``(row, col)`` cell.
    """

    def __init__(self, action, state):
        self.action = action
        self.state = state


class Agent:
    """Greedy agent that keeps a per-cell utility table.

    The table has one entry per grid cell; wall cells are initialised to -1
    and every other cell to 0.
    """

    def __init__(self, walls, gamma=0.5, shape=None):
        """Create the utility table and the action lookup.

        Args:
            walls: Iterable of ``(row, col)`` wall cells.
            gamma: Discount factor applied to the best neighbouring utility.
            shape: ``(rows, cols)`` of the grid. When omitted, the shape is
                read from the module-level ``labyrinth`` object, which keeps
                existing ``Agent(walls=...)`` callers working.
        """
        if shape is None:
            shape = labyrinth.grid.shape  # noqa: F821 - legacy global fallback

        self.utility_table = np.zeros(shape)
        for wall in walls:
            self.utility_table[wall] = -1
        print('utility table:')
        self._show(self.utility_table)
        print('-' * 50)

        self.gamma = gamma
        # Action name -> (delta_row, delta_col).
        self.actions = {
            'left': (0, -1),
            'right': (0, 1),
            'up': (-1, 0),
            'down': (1, 0)
        }

    @staticmethod
    def _show(value):
        """Display ``value`` richly under IPython, otherwise print it."""
        try:
            show = display  # noqa: F821 - only defined under IPython/Jupyter
        except NameError:
            show = print
        show(value)

    def reset(self):
        """Hook for per-episode agent state; nothing to reset at present."""

    def act(self, state):
        """Return the ``(delta_row, delta_col)`` of the greedy move."""
        best = self.select_best_action(state)
        return self.actions[best.action]

    def max_a_reward(self, state: tuple[int, int]):
        """Return the highest utility among the in-bounds neighbours."""
        return max(
            self.utility_table[candidate.state]
            for candidate in self.potential_states(state)
        )

    def select_best_action(self, state) -> Action:
        """Return the neighbouring ``Action`` with the highest utility.

        NOTE(review): ties go to the first candidate in ``self.actions``
        order, so on a flat utility table the choice is fully deterministic.
        """
        return max(
            self.potential_states(state),
            key=lambda candidate: self.utility_table[candidate.state],
        )

    def potential_states(self, state):
        """List an ``Action`` for every move that stays on the grid.

        Wall cells are not filtered out here; only the grid bounds are
        checked.
        """
        rows, cols = self.utility_table.shape
        potential = []
        for name, (d_row, d_col) in self.actions.items():
            row, col = state[0] + d_row, state[1] + d_col
            if 0 <= row < rows and 0 <= col < cols:
                potential.append(Action(name, (row, col)))
        return potential

    def update(self, action, state, reward):
        """Refresh the utility of ``state`` from its best neighbour.

        Args:
            action: The move that was taken. Unused, kept so existing
                ``update(action, state, reward)`` calls keep working.
            state: ``(row, col)`` cell the agent is on after the move.
            reward: Reward received for arriving at ``state``.
        """
        # The table is keyed by cell. Indexing it with the action delta would
        # write to an unrelated cell (a negative delta wraps to the far edge).
        self.utility_table[state] = (
            reward + self.gamma * self.max_a_reward(state)
        )

In [40]:
# Build the 4x4 maze with three wall cells, the start in the top-left corner
# and the goal in the bottom-right corner, then create the agent for it.
labyrinth = Labyrinth(
    rows=4,
    cols=4,
    walls={(1, 1), (2, 1), (1, 2)},
    start=(0, 0),
    goal=(3, 3),
)
agent = Agent(walls=labyrinth.walls)

MAX_EPISODES = 1000  # number of episodes to run
T = 100  # step budget per episode

for episode in range(MAX_EPISODES):
    # Every episode starts from the labyrinth's start cell.
    state = labyrinth.reset()
    agent.reset()

    for t in range(T):
        action = agent.act(state)
        state, reward = labyrinth.step(action)
        agent.update(action, state, reward)
        # Call labyrinth.render(epoch=episode, t=t) here to trace each step.
        if not labyrinth.done():
            continue
        print("ez win")
        break

epoch: 0, t: 0


array([[ 2.,  0.,  0.,  0.],
       [ 0., -1., -1.,  0.],
       [ 0., -1.,  0.,  0.],
       [ 0.,  0.,  0.,  9.]])

--------------------------------------------------
utility table:


array([[ 0.,  0.,  0.,  0.],
       [ 0., -1., -1.,  0.],
       [ 0., -1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

--------------------------------------------------


In [39]:
agent.utility_table

array([[ 0.,  0.,  0.,  0.],
       [ 0., -1., -1.,  0.],
       [ 0., -1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [None]:
labyrinth.grid