In [2]:
# Simplified Grid World Value Iteration
ROWS = 4
COLS = 4
GAMMA = 0.9   # discount applied to the successor state's value
THETA = 1e-4  # sweeps stop once the largest value change is below this
TERMINAL = (3, 3)
# Action label -> (row delta, column delta). Insertion order matters:
# ties between equally good actions are resolved in favor of the first key.
ACTIONS = {
    'U': (-1, 0),
    'D': (1, 0),
    'L': (0, -1),
    'R': (0, 1),
}
# Action label -> glyph used when printing the policy.
ARROW = {
    'U': '^',
    'D': 'v',
    'L': '<',
    'R': '>',
}

def next_state(s, a):
    """Return the cell reached by taking action ``a`` from cell ``s``.

    ``s`` is a ``(row, col)`` tuple and ``a`` is a key of ``ACTIONS``.
    The terminal cell is absorbing: every action leaves the agent there.
    A move that would leave the grid is clamped to the border, so the
    agent stays put along that axis.
    """
    if s == TERMINAL:
        return s
    row, col = s
    row_step, col_step = ACTIONS[a]
    # Clamp each coordinate into [0, size - 1].
    clamped_row = max(0, min(ROWS - 1, row + row_step))
    clamped_col = max(0, min(COLS - 1, col + col_step))
    return (clamped_row, clamped_col)

# Every grid cell, enumerated row by row.
states = [(row, col) for row in range(ROWS) for col in range(COLS)]
# All value estimates start at zero; the terminal cell is never updated.
V = dict.fromkeys(states, 0)

# Value Iteration
# Synchronous sweeps: each sweep reads only the previous sweep's estimates
# and writes into a fresh dict. Iteration stops once the largest change
# seen in a sweep drops below THETA.
converged = False
while not converged:
    updated = dict(V)
    largest_change = 0
    for state in states:
        if state == TERMINAL:
            continue
        # Each move costs 1; the successor's value is discounted by GAMMA.
        backups = [-1 + GAMMA * V[next_state(state, action)] for action in ACTIONS]
        updated[state] = max(backups)
        largest_change = max(largest_change, abs(V[state] - updated[state]))
    V = updated
    converged = largest_change < THETA

# Policy Extraction
# Greedy with respect to the converged values; ties go to the action that
# appears first in ACTIONS.
policy = {}
for state in states:
    if state == TERMINAL:
        policy[state] = 'T'
    else:
        action_values = {
            action: -1 + GAMMA * V[next_state(state, action)]
            for action in ACTIONS
        }
        policy[state] = ARROW[max(action_values, key=action_values.get)]

# Display
print("Value Function:")
for row in range(ROWS):
    value_cells = [f"{V[(row, col)]:6.2f}" for col in range(COLS)]
    print(" ".join(value_cells))
print("\nPolicy:")
for row in range(ROWS):
    policy_cells = [f"{policy[(row, col)]:>2}" for col in range(COLS)]
    print(" ".join(policy_cells))


Value Function:
 -4.69  -4.10  -3.44  -2.71
 -4.10  -3.44  -2.71  -1.90
 -3.44  -2.71  -1.90  -1.00
 -2.71  -1.90  -1.00   0.00

Policy:
 v  v  v  v
 v  v  v  v
 v  v  v  v
 >  >  >  T
