In [1]:
from collections import namedtuple
from copy import deepcopy

Cell = namedtuple("Cell", ["utility", "direction"])
Grid = list[list[Cell]]

step_cost = -0.04
discount_factor = 0.95
initial_grid: Grid = [
    [Cell(0, None), Cell(-1, None), Cell(1, None)],
    [Cell(0, None), Cell(0, None), Cell(0, None)],
    [Cell(0, None), Cell(None, None), Cell(0, None)],
    [Cell(0, None), Cell(0, None), Cell(0, None)],
]
penalty_pos = (0, 1)
reward_pos = (0, 2)

In [17]:
def iterate(grid: Grid, p: float) -> Grid:
    new_grid = deepcopy(grid)

    def is_valid(x: int, y: int) -> bool:
        if x < 0 or x >= len(grid) or y < 0 or y >= len(grid[0]):
            return False
        return grid[x][y].utility is not None

    def perpendicular(dx: int, dy: int) -> list[tuple[int, int]]:
        if abs(dx) == 1 and dy == 0:
            return [(0, 1), (0, -1)]
        elif dx == 0 and abs(dy) == 1:
            return [(1, 0), (-1, 0)]
        else:
            raise ValueError("Invalid Direction")

    directions: list[tuple[int, int]] = [(0, 1), (1, 0), (0, -1), (-1, 0)]

    for x, row in enumerate(grid):
        for y, cell in enumerate(row):
            if not is_valid(x, y) or penalty_pos == (x, y) or reward_pos == (x, y):
                continue

            def find_utility(nx: int, ny: int) -> float:
                return grid[nx][ny].utility if is_valid(nx, ny) else cell.utility

            max_util = -99999
            direction = None
            for dx, dy in directions:
                expected_utility = p * find_utility(x + dx, y + dy) + sum(
                    (1 - p) / 2 * find_utility(x + ddx, y + ddy)
                    for ddx, ddy in perpendicular(dx, dy)
                )
                if (new_max_util := step_cost + discount_factor * expected_utility) > max_util:
                    max_util = new_max_util
                    match (dx, dy):
                        case (0, 1):
                            direction="right"
                        case (0, -1):
                            direction="left"
                        case (1, 0):
                            direction="down"
                        case (-1, 0):
                            direction="up"
            new_grid[x][y] = Cell(max_util, direction)

    return new_grid

In [18]:
def print_cell(grid: Grid, directions: bool = False) -> None:
    for row in grid:
        for cell in row:
            print(cell.direction if directions else round(cell.utility, 4) if cell.utility is not None else cell.utility, end="\t")
        print()
    print()

def converge(prev: Grid, cur: Grid) -> bool:
    for row1, row2 in zip(prev, cur):
        for cell1, cell2 in zip(row1, row2):
            if cell1.utility is None and cell2.utility is None:
                continue
            if abs(cell1.utility - cell2.utility) > 0.0001:
                return False
    return True

In [19]:
# Task A
prev = deepcopy(initial_grid)
cur = iterate(initial_grid, 0.7)
i = 1
while not converge(prev, cur):
    print(f"Iteration {i}")
    i += 1
    prev = cur
    cur = iterate(prev, 0.7)
    print_cell(prev)

Iteration 1
-0.04	-1	1	
-0.04	-0.04	0.625	
-0.04	None	-0.04	
-0.04	-0.04	-0.04	

Iteration 2
-0.078	-1	1	
-0.078	0.2274	0.7084	
-0.078	None	0.3642	
-0.078	-0.078	-0.078	

Iteration 3
-0.1141	-1	1	
0.089	0.321	0.7583	
-0.1141	None	0.5349	
-0.1141	-0.1141	0.18	

Iteration 4
-0.1195	-1	1	
0.1409	0.3675	0.7788	
-0.0133	None	0.6167	
-0.1484	0.0472	0.3251	

Iteration 5
-0.1058	-1	1	
0.1855	0.3878	0.7884	
0.0499	None	0.6537	
-0.0317	0.1896	0.4232	

Iteration 6
-0.0742	-1	1	
0.2099	0.397	0.7926	
0.0976	None	0.6706	
0.0887	0.2955	0.482	

Iteration 7
-0.0535	-1	1	
0.2273	0.4012	0.7945	
0.1274	None	0.6782	
0.183	0.3647	0.5167	

Iteration 8
-0.0389	-1	1	
0.2373	0.403	0.7954	
0.1475	None	0.6816	
0.2468	0.4076	0.5366	

Iteration 9
-0.0302	-1	1	
0.2435	0.4039	0.7958	
0.1662	None	0.6832	
0.2872	0.433	0.5478	

Iteration 10
-0.0249	-1	1	
0.2479	0.4042	0.7959	
0.1984	None	0.6839	
0.3125	0.4477	0.5541	

Iteration 11
-0.0212	-1	1	
0.2535	0.4044	0.796	
0.2244	None	0.6842	
0.3305	0.4561	0.5576	

Iteration 12

In [20]:
# Task B
p = 0.1
while p < 1:
    prev = deepcopy(initial_grid)
    cur = iterate(prev, p)
    while not converge(prev, cur):
        prev = cur
        cur = iterate(cur, p)
    print_cell(cur, directions=True)
    p += 0.1

left	None	None	
up	down	right	
right	None	right	
up	down	right	

left	None	None	
up	down	right	
right	None	right	
up	down	right	

left	None	None	
down	down	right	
right	None	up	
down	right	right	

left	None	None	
down	down	right	
right	None	up	
down	right	right	

left	None	None	
right	down	up	
down	None	up	
right	right	up	

left	None	None	
right	down	up	
down	None	up	
right	right	up	

down	None	None	
right	right	up	
down	None	up	
right	right	up	

down	None	None	
right	right	up	
down	None	up	
right	right	up	

down	None	None	
right	right	up	
up	None	up	
right	right	up	

down	None	None	
right	right	up	
up	None	up	
right	right	up	



# Explanation
At low values of p, the probability of going perpendicular to the direection of the action is much higher than the probability of going in the direction of the action. Thus, the directions move from the original at p=0.1 to perpendicular to that direction at p=0.9

Till p = 0.6, the best choice is to move down from the top left, as the direct right is a penalty. The action chosen is left because down is perpendicular to left and thus would be chosen with higher probability for lower p.

After p = 0.6, the action chosen is down from the top left as the probability of going in the direction of the action is much higher.

A similar argument can be used to explain the direction at all the cells.# Explanation
At low values of p, the probability of going perpendicular to the direection of the action is much higher than the probability of going in the direction of the action. Thus, the directions move from the original at p=0.1 to perpendicular to that direction at p=0.9

Till p = 0.6, the best choice is to move down from the top left, as the direct right is a penalty. The action chosen is left because down is perpendicular to left and thus would be chosen with higher probability for lower p.

After p = 0.6, the action chosen is down from the top left as the probability of going in the direction of the action is much higher.

A similar argument can be used to explain the direction at all the cells.