In [119]:
import numpy as np

In [120]:
GRID_SIZE=10
TERMINAL_STATES= [(0,5),(0,7),(1,1),(1,2),(2,9),(2,6),(3,3),(3,4),(4,0),(4,8),(5,1),(5,4),(6,7),(6,8),(7,3),(7,5),(8,0),(8,6),(9,2),(9,4),(9,9)]
ACTIONS = {
    0:(-1,-1), #UL
    1:(-1,0),  #U
    2:(-1,1),  #UR
    3:(0,-1),  #L
    4:(0,1),   #R
    5:(1,-1),  #DL
    6:(1,0),   #D
    7:(1,1)    #DR
}
STATE =[(i,j) for i in range(10) for j in range(10)]

In [121]:
def next_state(state,action):
    i,j = state
    p,q = ACTIONS[action]
    if state in TERMINAL_STATES:
        return state
    else:
        if i+p==max(min(i+p,9),0) and j+q==max(min(j+q,9),0):
            return (i+p,j+q)
        else:
            return (i,j)

def reward(i_,j_):
    if (i_,j_)==(9,9):
        return 1
    else:
        return 0

In [122]:
def policy_iteration(gamma=0.9, THETA=0.0001):
    policy = np.zeros(100)
    V = np.zeros((10,10))
    while True:
        
        while True:
            delta=0
            new_V=np.zeros((10,10))
            for s in range(100):
                if STATE[s] in TERMINAL_STATES:
                    continue
                i,j = STATE[s]
                v=0
                for action in ACTIONS : 
                    i_ , j_ = next_state((i,j),action)
                    v+=0.125*(reward(i_,j_)+gamma*V[i_,j_])
                new_V[i,j]=v
            V=new_V.copy()
            delta = max(delta, abs(v-V[i,j]))
            if delta<THETA:
                break

        policy_stable=True
        for s in range(100):
            if STATE[s] in TERMINAL_STATES:
                continue
            i,j = STATE[s]
            old_action = policy[s]
            a_val= []
            for action in ACTIONS:
                i_, j_ = next_state((i, j), action)
                val = reward(i_,j_) + gamma * V[i_, j_]
                a_val.append(val)
            best_action = np.argmax(a_val)
            policy[s] = best_action
            if old_action != best_action:
                policy_stable = False
        if policy_stable:
            break
    return policy.reshape((10,10)), V

In [123]:
policy_pi, V_pi = policy_iteration()
print("Policy Iteration Policy:\n", policy_pi)


Policy Iteration Policy:
 [[6. 4. 7. 7. 7. 0. 7. 0. 5. 5.]
 [7. 0. 0. 7. 7. 6. 7. 6. 5. 5.]
 [7. 7. 6. 4. 7. 7. 0. 7. 6. 0.]
 [7. 7. 7. 0. 0. 7. 7. 6. 7. 6.]
 [0. 7. 7. 4. 7. 7. 6. 7. 0. 6.]
 [7. 0. 7. 7. 0. 7. 6. 5. 7. 6.]
 [7. 7. 4. 7. 4. 7. 7. 0. 0. 6.]
 [4. 4. 7. 0. 7. 0. 7. 7. 7. 6.]
 [0. 2. 4. 4. 4. 7. 0. 7. 7. 6.]
 [2. 2. 0. 2. 0. 4. 4. 4. 4. 0.]]


In [124]:
policy_visual = np.empty((10, 10), dtype=str)  
for i in range(10):
    for j in range(10):
        if (i, j) in TERMINAL_STATES:
            policy_visual[i][j] = 'T'
        elif policy_pi[i][j] == 0:
            policy_visual[i][j] = '↖'  
        elif policy_pi[i][j] == 1:
            policy_visual[i][j] = '↑'  
        elif policy_pi[i][j] == 2:
            policy_visual[i][j] = '↗'
        elif policy_pi[i][j] == 3:
            policy_visual[i][j] = '←'
        elif policy_pi[i][j] == 4:
            policy_visual[i][j] = '→'
        elif policy_pi[i][j] == 5:
            policy_visual[i][j] = '↙'
        elif policy_pi[i][j] == 6:
            policy_visual[i][j] = '↓'
        elif policy_pi[i][j] == 7:
            policy_visual[i][j] = '↘'
policy_visual

array([['↓', '→', '↘', '↘', '↘', 'T', '↘', 'T', '↙', '↙'],
       ['↘', 'T', 'T', '↘', '↘', '↓', '↘', '↓', '↙', '↙'],
       ['↘', '↘', '↓', '→', '↘', '↘', 'T', '↘', '↓', 'T'],
       ['↘', '↘', '↘', 'T', 'T', '↘', '↘', '↓', '↘', '↓'],
       ['T', '↘', '↘', '→', '↘', '↘', '↓', '↘', 'T', '↓'],
       ['↘', 'T', '↘', '↘', 'T', '↘', '↓', '↙', '↘', '↓'],
       ['↘', '↘', '→', '↘', '→', '↘', '↘', 'T', 'T', '↓'],
       ['→', '→', '↘', 'T', '↘', 'T', '↘', '↘', '↘', '↓'],
       ['T', '↗', '→', '→', '→', '↘', 'T', '↘', '↘', '↓'],
       ['↗', '↗', 'T', '↗', 'T', '→', '→', '→', '→', 'T']], dtype='<U1')