In [11]:
import numpy as np

# Grid world laid out as 3 rows by 4 columns.
grid_size = (3, 4)

# Per-cell state values (all zero here) and per-cell action letters
# ('' means the cell has no action assigned).
value_function = np.zeros(grid_size)
policy = np.full(grid_size, '', dtype=str)

# Rewards keyed by (row, column).
rewards = {(0, 3): 1, (1, 3): -1}

# Movement offsets as (row delta, column delta).
actions = [(0, 1), (0, -1), (-1, 0), (1, 0)]

# Starting action for every cell that has one; cells not listed keep ''.
initial_moves = {
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',
    (1, 0): 'U', (1, 2): 'U',
    (2, 0): 'U', (2, 1): 'R', (2, 2): 'U', (2, 3): 'L',
}
for cell, move in initial_moves.items():
    policy[cell] = move

gamma = 0.9

num_iterations = 100

# Box-drawing rows sized for four one-character cells.
box_top = '┌───┬───┬───┬───┐'
box_middle = '├───┼───┼───┼───┤'
box_bottom = '└───┴───┴───┴───┘'

# Arrow shown for each action letter; an empty action renders as a blank.
action_symbols = {
    'R': '→',
    'L': '←',
    'U': '↑',
    'D': '↓',
    '': ' ',
}

# Draw the policy as a boxed table, one arrow per cell.
print('The random policy is:')
for row_number, row_moves in enumerate(policy):
    # The first row opens the box; every later row gets a separator above it.
    print(box_top if row_number == 0 else box_middle)
    rendered = ''.join(f'│ {action_symbols.get(move, " ")} ' for move in row_moves)
    print(rendered + '│')
print(box_bottom)


The random policy is:
┌───┬───┬───┬───┐
│ → │ → │ → │   │
├───┼───┼───┼───┤
│ ↑ │   │ ↑ │   │
├───┼───┼───┼───┤
│ ↑ │ → │ ↑ │ ← │
└───┴───┴───┴───┘


In [12]:
import numpy as np

# --- Grid-world setup --------------------------------------------------------
grid_size = (3, 4)  # (rows, columns)

# State values, one per cell; everything starts at zero.
value_function = np.zeros(grid_size)
# One action letter per cell ('R', 'L', 'U' or 'D'); '' means "no action".
policy = np.full(grid_size, '', dtype=str)

# Reward received for moving into a cell. Cells listed here are skipped by
# the evaluation loop below, so their own value stays at 0.
rewards = {
    (0, 3): 1,
    (1, 3): -1,
}

# (row, column) offsets, in the same order as `action_labels`.
actions = [(0, 1), (0, -1), (-1, 0), (1, 0)]
action_labels = ['R', 'L', 'U', 'D']
action_offsets = dict(zip(action_labels, actions))

# Hand-written starting policy; (0, 3), (1, 1) and (1, 3) are left empty.
policy[0, 0] = 'R'
policy[0, 1] = 'R'
policy[0, 2] = 'R'
policy[1, 0] = 'U'
policy[1, 2] = 'U'
policy[2, 0] = 'U'
policy[2, 1] = 'R'
policy[2, 2] = 'U'
policy[2, 3] = 'L'

gamma = 0.9  # discount applied to the successor cell's value

# Hard cap on evaluation sweeps, so the loop below always terminates.
num_iterations = 100
# Evaluation stops early once no cell changed by at least this much in a sweep.
theta = 1e-6

box_top = '┌──────┬──────┬──────┬──────┐'
box_middle = '├──────┼──────┼──────┼──────┤'
box_bottom = '└──────┴──────┴──────┴──────┘'


# --- Policy evaluation -------------------------------------------------------
# Sweep the grid in place, row by row, until the values stop changing or the
# sweep cap is reached. `evaluation_sweeps` records how many sweeps were run.
evaluation_sweeps = 0
for _ in range(num_iterations):
    evaluation_sweeps += 1
    delta = 0
    for i in range(grid_size[0]):
        for j in range(grid_size[1]):
            if (i, j) in rewards:
                continue  # reward cells are never re-evaluated
            offset = action_offsets.get(policy[i, j])
            if offset is None:
                continue  # no action assigned to this cell

            next_i, next_j = i + offset[0], j + offset[1]
            if 0 <= next_i < grid_size[0] and 0 <= next_j < grid_size[1]:
                new_value = rewards.get((next_i, next_j), 0) + gamma * value_function[next_i, next_j]
            else:
                # Off-grid move: score it exactly as the improvement step
                # below does, instead of letting a -1 index wrap around or
                # an index past the edge raise IndexError.
                new_value = rewards.get((i, j), 0)

            # Measure the change BEFORE overwriting the old value; measuring
            # it afterwards always yields 0 and defeats the convergence test.
            delta = max(delta, abs(new_value - value_function[i, j]))
            value_function[i, j] = new_value
    if delta < theta:
        break

# --- Policy improvement (single pass) ---------------------------------------
# For every cell, pick the action with the highest one-step lookahead value.
# np.argmax returns the first maximum, so ties go to the earliest entry of
# `action_labels`. `policy_stable` ends up False if any cell changed.
policy_stable = True
for i in range(grid_size[0]):
    for j in range(grid_size[1]):
        old_action = policy[i, j]
        action_values = []
        for offset in actions:
            new_i, new_j = i + offset[0], j + offset[1]
            if 0 <= new_i < grid_size[0] and 0 <= new_j < grid_size[1]:
                action_value = rewards.get((new_i, new_j), 0) + gamma * value_function[new_i, new_j]
            else:
                # Off-grid moves are worth the current cell's own reward
                # entry (0 for cells that are not in `rewards`).
                action_value = rewards.get((i, j), 0)
            action_values.append(action_value)

        policy[i, j] = action_labels[int(np.argmax(action_values))]

        if old_action != policy[i, j]:
            policy_stable = False


# Render the state values as a boxed table, two decimals per cell.
print("Final Value Function:")
for row_number, row_values in enumerate(value_function):
    # The first row opens the box; every later row gets a separator above it.
    print(box_top if row_number == 0 else box_middle)
    print(''.join(f'│ {cell_value:.2f} ' for cell_value in row_values) + '│')

print(box_bottom)


Final Value Function:
┌──────┬──────┬──────┬──────┐
│ 0.81 │ 0.90 │ 1.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.73 │ 0.00 │ 0.90 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.66 │ 0.73 │ 0.81 │ 0.73 │
└──────┴──────┴──────┴──────┘


In [13]:
# This cell continues from the state left by the previous cell: it reads
# `num_iterations`, `grid_size`, `rewards`, `gamma` and `actions`, and it
# updates `value_function` and `policy` in place.
box_top = '┌──────┬──────┬──────┬──────┐'
box_middle = '├──────┼──────┼──────┼──────┤'
box_bottom = '└──────┴──────┴──────┴──────┘'

# Action letters in the same order as the offsets in `actions`.
action_labels = ['R', 'L', 'U', 'D']
action_offsets = dict(zip(action_labels, actions))


def _print_grid(title, grid):
    """Print `title`, then `grid` as a boxed table with two-decimal cells."""
    print(title)
    for row_number, row_values in enumerate(grid):
        # The first row opens the box; later rows get a separator above them.
        print(box_top if row_number == 0 else box_middle)
        print(''.join(f'│ {cell_value:.2f} ' for cell_value in row_values) + '│')
    print(box_bottom)


for iteration in range(num_iterations):
    print(f"Iteration - {iteration}")

    # Display the current value function
    _print_grid("Value Function:", value_function)

    # Policy Evaluation: one in-place sweep under the current policy.
    # `delta_grid` records how much each cell's value moved in this sweep.
    delta_grid = np.zeros(grid_size)
    for i in range(grid_size[0]):
        for j in range(grid_size[1]):
            if (i, j) in rewards:
                continue  # reward cells are never re-evaluated
            offset = action_offsets.get(policy[i, j])
            if offset is None:
                continue  # Skip empty action

            next_i, next_j = i + offset[0], j + offset[1]
            if 0 <= next_i < grid_size[0] and 0 <= next_j < grid_size[1]:
                new_value = rewards.get((next_i, next_j), 0) + gamma * value_function[next_i, next_j]
            else:
                # Off-grid move: score it exactly as the improvement step
                # below does, instead of letting a -1 index wrap around or
                # an index past the edge raise IndexError.
                new_value = rewards.get((i, j), 0)

            delta_grid[i, j] = abs(new_value - value_function[i, j])
            value_function[i, j] = new_value

    # Display the delta grid
    _print_grid("Delta Grid:", delta_grid)

    # Policy Improvement: pick, per cell, the action with the highest one-step
    # lookahead value. np.argmax returns the first maximum, so ties go to the
    # earliest entry of `action_labels`.
    policy_stable = True
    for i in range(grid_size[0]):
        for j in range(grid_size[1]):
            old_action = policy[i, j]
            action_values = []
            for offset in actions:
                new_i, new_j = i + offset[0], j + offset[1]
                if 0 <= new_i < grid_size[0] and 0 <= new_j < grid_size[1]:
                    action_value = rewards.get((new_i, new_j), 0) + gamma * value_function[new_i, new_j]
                else:
                    # Off-grid moves are worth the current cell's own reward
                    # entry (0 for cells that are not in `rewards`).
                    action_value = rewards.get((i, j), 0)
                action_values.append(action_value)

            # Assign the string representation of the selected action
            policy[i, j] = action_labels[int(np.argmax(action_values))]

            if old_action != policy[i, j]:
                policy_stable = False

    # Check if the policy has stabilized
    if policy_stable:
        break

# Display the final value function in a grid
_print_grid("Final Value Function:", value_function)

Iteration - 0
Value Function:
┌──────┬──────┬──────┬──────┐
│ 0.81 │ 0.90 │ 1.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.73 │ 0.00 │ 0.90 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.66 │ 0.73 │ 0.81 │ 0.73 │
└──────┴──────┴──────┴──────┘
Delta Grid:
┌──────┬──────┬──────┬──────┐
│ 0.00 │ 0.00 │ 0.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.00 │ 0.81 │ 0.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.00 │ 0.00 │ 0.00 │ 0.00 │
└──────┴──────┴──────┴──────┘
Iteration - 1
Value Function:
┌──────┬──────┬──────┬──────┐
│ 0.81 │ 0.90 │ 1.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.73 │ 0.81 │ 0.90 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.66 │ 0.73 │ 0.81 │ 0.73 │
└──────┴──────┴──────┴──────┘
Delta Grid:
┌──────┬──────┬──────┬──────┐
│ 0.00 │ 0.00 │ 0.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.00 │ 0.00 │ 0.00 │ 0.00 │
├──────┼──────┼──────┼──────┤
│ 0.00 │ 0.00 │ 0.00 │ 0.00 │
└──────┴──────┴──────┴──────┘
Final Value Function:
┌──────┬──────┬──────┬──────┐
│ 0.81 │ 0.90 │ 1.00 │ 0