# Policy Iteration and Value Iteration – 4×4 Gridworld

The environment is a fixed 4×4 grid with the goal at cell (3, 3); the notebook takes no user input.

In [16]:
import numpy as np

# Side length of the square grid; cells are addressed as (row, col).
N = 4
# Cell that pays the goal reward and is skipped by the update sweeps.
GOAL = (3, 3)
# Discount factor applied to the value of the next state.
GAMMA = 0.9
# Number of outer rounds each algorithm runs.
EPISODES = 3

# (row, col) offset applied by each move.
ACTION_DELTA = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
# Action labels in the order they are tried: U, D, L, R.
ACTIONS = list(ACTION_DELTA)

In [17]:
def is_valid(i, j):
    """Return True when row ``i`` and column ``j`` both lie in ``[0, N)``."""
    return all(0 <= coord < N for coord in (i, j))


def get_valid_actions(state):
    """List the actions that keep the agent on the grid from ``state``.

    Args:
        state: ``(row, col)`` tuple.

    Returns:
        Labels from ``ACTIONS``, in ``ACTIONS`` order, whose target cell
        passes ``is_valid``.
    """
    row, col = state
    return [
        act
        for act in ACTIONS
        if is_valid(row + ACTION_DELTA[act][0], col + ACTION_DELTA[act][1])
    ]


def transition(state, action):
    """Return the cell reached by taking ``action`` from ``state``.

    Args:
        state: ``(row, col)`` tuple.
        action: Key of ``ACTION_DELTA``.

    Returns:
        The shifted ``(row, col)`` tuple when it is on the grid; otherwise
        ``state`` itself (a move off the grid leaves the agent in place).
    """
    row, col = state
    d_row, d_col = ACTION_DELTA[action]
    target = (row + d_row, col + d_col)
    return target if is_valid(*target) else state


def reward(state):
    """Return 10 when ``state`` equals ``GOAL`` and -1 for any other state."""
    if state == GOAL:
        return 10
    return -1



In [18]:
def initialize_policy():
    """Build the uniform-random starting policy.

    Returns:
        Dict mapping every ``(row, col)`` cell of the ``N x N`` grid
        (``GOAL`` included) to ``{action: probability}``, with probability
        split equally over the cell's valid actions.
    """
    uniform = {}
    for cell in ((r, c) for r in range(N) for c in range(N)):
        moves = get_valid_actions(cell)
        uniform[cell] = dict.fromkeys(moves, 1 / len(moves))
    return uniform

In [19]:

# Policy Evaluation
def policy_evaluation(policy, theta=1e-4):
    """Iteratively evaluate ``policy`` and return its state-value table.

    Sweeps the grid in row-major order and updates values in place, so each
    update already sees values refreshed earlier in the same sweep. Sweeping
    stops once the largest change in a sweep drops below ``theta``.

    Args:
        policy: Dict mapping ``(row, col)`` to ``{action: probability}``.
            Must contain every non-goal cell.
        theta: Convergence threshold on the largest per-sweep value change.
            Defaults to ``1e-4``, the tolerance that used to be hard-coded.

    Returns:
        ``N x N`` NumPy array of state values; the ``GOAL`` entry stays 0.

    Raises:
        ValueError: If ``theta`` is not positive (the stopping test
            ``delta < theta`` could then never succeed).
    """
    if theta <= 0:
        raise ValueError("theta must be positive")

    V = np.zeros((N, N))

    while True:
        delta = 0
        for i in range(N):
            for j in range(N):
                # The goal is never updated, so its value remains 0.
                if (i, j) == GOAL:
                    continue

                # Expected one-step return under the policy's action mix.
                v = 0
                for a, p in policy[(i, j)].items():
                    ns = transition((i, j), a)
                    v += p * (reward(ns) + GAMMA * V[ns])

                delta = max(delta, abs(v - V[i, j]))
                V[i, j] = v

        if delta < theta:
            break

    return V


In [20]:

# Policy Improvement
def policy_improvement(V):
    """Derive the greedy policy for the value table ``V``.

    Args:
        V: Value table indexable by ``(row, col)`` tuples.

    Returns:
        Dict mapping every non-goal cell to ``{action: probability}``, where
        probability is shared equally among the valid actions whose one-step
        lookahead value exactly equals the maximum. ``GOAL`` has no entry.
    """
    greedy = {}

    for cell in ((r, c) for r in range(N) for c in range(N)):
        if cell == GOAL:
            continue

        # One-step lookahead value of each action available in this cell.
        action_values = {}
        for act in get_valid_actions(cell):
            nxt = transition(cell, act)
            action_values[act] = reward(nxt) + GAMMA * V[nxt]

        top = max(action_values.values())
        winners = [act for act, val in action_values.items() if val == top]
        greedy[cell] = dict.fromkeys(winners, 1 / len(winners))

    return greedy

In [21]:

# Policy Iteration (3 episodes)
def policy_iteration():
    """Run ``EPISODES`` rounds of policy improvement from the uniform policy.

    Each round makes the policy greedy with respect to the current value
    table and then re-evaluates it, so the returned table always belongs to
    the returned policy.

    Returns:
        Tuple ``(policy, V)``: the final policy dict and the value table
        produced by evaluating that same policy.
    """
    policy = initialize_policy()
    # Evaluate up front so V is defined even when EPISODES is 0.
    V = policy_evaluation(policy)

    for ep in range(EPISODES):
        print(f"\n--- Policy Iteration Episode {ep+1} ---")
        policy = policy_improvement(V)
        # Re-evaluate after improving so V never lags one policy behind.
        V = policy_evaluation(policy)

    return policy, V


In [22]:

# Value Iteration
def value_iteration():
    """Run value iteration and extract the greedy policy from the result.

    Each episode sweeps the grid in row-major order, updating values in
    place, until the largest change in a sweep is below ``1e-4``. ``V`` is
    carried over between episodes, so episodes after the first start from a
    table the inner loop has already converged.

    Returns:
        Tuple ``(policy, V)``: the greedy policy for ``V`` as built by
        ``policy_improvement``, and the ``N x N`` NumPy value table (the
        ``GOAL`` entry stays 0).
    """
    V = np.zeros((N, N))

    for ep in range(EPISODES):
        print(f"\n--- Value Iteration Episode {ep+1} ---")
        while True:
            delta = 0
            for i in range(N):
                for j in range(N):
                    # The goal is never updated, so its value remains 0.
                    if (i, j) == GOAL:
                        continue

                    # Bellman optimality backup: best one-step lookahead.
                    values = []
                    for a in get_valid_actions((i, j)):
                        ns = transition((i, j), a)
                        values.append(reward(ns) + GAMMA * V[ns])

                    new_v = max(values)
                    delta = max(delta, abs(new_v - V[i, j]))
                    V[i, j] = new_v

            if delta < 1e-4:
                break

    # Greedy extraction is the same computation policy_improvement performs,
    # so reuse it instead of keeping a second copy of that loop here.
    policy = policy_improvement(V)

    return policy, V

In [23]:
# Run both algorithms
pi_policy, pi_V = policy_iteration()
vi_policy, vi_V = value_iteration()

# One print per section: heading and result separated by a newline.
print("\nOptimal Policy (Policy Iteration):", pi_policy, sep="\n")
print("\nOptimal Policy (Value Iteration):", vi_policy, sep="\n")
print("\nValue Function (Policy Iteration):", pi_V, sep="\n")
print("\nValue Function (Value Iteration):", vi_V, sep="\n")


--- Policy Iteration Episode 1 ---

--- Policy Iteration Episode 2 ---

--- Policy Iteration Episode 3 ---

--- Value Iteration Episode 1 ---

--- Value Iteration Episode 2 ---

--- Value Iteration Episode 3 ---

Optimal Policy (Policy Iteration):
{(0, 0): {'D': 0.5, 'R': 0.5}, (0, 1): {'D': 0.5, 'R': 0.5}, (0, 2): {'D': 0.5, 'R': 0.5}, (0, 3): {'D': 1.0}, (1, 0): {'D': 0.5, 'R': 0.5}, (1, 1): {'D': 0.5, 'R': 0.5}, (1, 2): {'D': 0.5, 'R': 0.5}, (1, 3): {'D': 1.0}, (2, 0): {'D': 0.5, 'R': 0.5}, (2, 1): {'D': 0.5, 'R': 0.5}, (2, 2): {'D': 0.5, 'R': 0.5}, (2, 3): {'D': 1.0}, (3, 0): {'R': 1.0}, (3, 1): {'R': 1.0}, (3, 2): {'R': 1.0}}

Optimal Policy (Value Iteration):
{(0, 0): {'D': 0.5, 'R': 0.5}, (0, 1): {'D': 0.5, 'R': 0.5}, (0, 2): {'D': 0.5, 'R': 0.5}, (0, 3): {'D': 1.0}, (1, 0): {'D': 0.5, 'R': 0.5}, (1, 1): {'D': 0.5, 'R': 0.5}, (1, 2): {'D': 0.5, 'R': 0.5}, (1, 3): {'D': 1.0}, (2, 0): {'D': 0.5, 'R': 0.5}, (2, 1): {'D': 0.5, 'R': 0.5}, (2, 2): {'D': 0.5, 'R': 0.5}, (2, 3): {'D': 

In [24]:
# Repeats the run-and-report steps of the preceding cell: both algorithms
# are recomputed from scratch and the four results are printed again.
pi_policy, pi_V = policy_iteration()
vi_policy, vi_V = value_iteration()

print("\nOptimal Policy (Policy Iteration):", pi_policy, sep="\n")
print("\nOptimal Policy (Value Iteration):", vi_policy, sep="\n")
print("\nValue Function (Policy Iteration):", pi_V, sep="\n")
print("\nValue Function (Value Iteration):", vi_V, sep="\n")


--- Policy Iteration Episode 1 ---

--- Policy Iteration Episode 2 ---

--- Policy Iteration Episode 3 ---

--- Value Iteration Episode 1 ---

--- Value Iteration Episode 2 ---

--- Value Iteration Episode 3 ---

Optimal Policy (Policy Iteration):
{(0, 0): {'D': 0.5, 'R': 0.5}, (0, 1): {'D': 0.5, 'R': 0.5}, (0, 2): {'D': 0.5, 'R': 0.5}, (0, 3): {'D': 1.0}, (1, 0): {'D': 0.5, 'R': 0.5}, (1, 1): {'D': 0.5, 'R': 0.5}, (1, 2): {'D': 0.5, 'R': 0.5}, (1, 3): {'D': 1.0}, (2, 0): {'D': 0.5, 'R': 0.5}, (2, 1): {'D': 0.5, 'R': 0.5}, (2, 2): {'D': 0.5, 'R': 0.5}, (2, 3): {'D': 1.0}, (3, 0): {'R': 1.0}, (3, 1): {'R': 1.0}, (3, 2): {'R': 1.0}}

Optimal Policy (Value Iteration):
{(0, 0): {'D': 0.5, 'R': 0.5}, (0, 1): {'D': 0.5, 'R': 0.5}, (0, 2): {'D': 0.5, 'R': 0.5}, (0, 3): {'D': 1.0}, (1, 0): {'D': 0.5, 'R': 0.5}, (1, 1): {'D': 0.5, 'R': 0.5}, (1, 2): {'D': 0.5, 'R': 0.5}, (1, 3): {'D': 1.0}, (2, 0): {'D': 0.5, 'R': 0.5}, (2, 1): {'D': 0.5, 'R': 0.5}, (2, 2): {'D': 0.5, 'R': 0.5}, (2, 3): {'D': 