In [162]:
import numpy as np


blocked_State = [(1,1)]

possible_actions = ['L','R','U','D']
terminate_states = ((1,3),(2,3))


# non - deterministic action
prob_actions = {'L':0.25,'R':0.25,'U':0.25,'D':0.25}

# environment action corresponding to Agent
environment_left = {'L':'D','R':'U','U':'L','D':'R'}
environment_right = {'L':'U','R':'D','U':'R','D':'L'}

def is_valid(i,j):
    return (i,j) not in blocked_State and i >= 0 and i < 3 and j >= 0 and j < 4

def print_values(V):
  for i in range(2,-1,-1):
    print("--- --- --- --- --- --- ---")
    for j in range(4):
      v = V[i][j]
      if v >= 0:
        print(" %.2f|" % v, end="")
      else:
        print("%.2f|" % v, end="") # -ve sign takes up an extra space
    print("")

def func(action,i,j):
    if action == 'L':
        new_state = (i,j-1)
    elif action == 'R':
        new_state = (i,j+1)
    elif action == 'U':
        new_state = (i+1,j)
    else:
        new_state = (i-1,j)   

    return new_state

def find_value_function(i,j,reward,reward_matrix,tolerance_rate=1):
    value = 0
    for action in possible_actions:
        # desired action with 0.8 probability
        state_x,state_y = func(action,i,j)
        if is_valid(state_x,state_y):
            desired_action_value = (reward_matrix[state_x][state_y] + tolerance_rate*V_pie[state_x][state_y])
        else:
            desired_action_value = (reward_matrix[i][j] + tolerance_rate*V_pie[i][j])
        
        # environment action with 0.1 probability
        state_x,state_y = func(environment_left[action],i,j)
        if is_valid(state_x,state_y):
            env_action_left_value = (reward_matrix[state_x][state_y] + tolerance_rate*V_pie[state_x][state_y])
        else:
            env_action_left_value = (reward_matrix[i][j] + tolerance_rate*V_pie[i][j])
        
        # environment action with 0.1 probability 
        state_x,state_y = func(environment_right[action],i,j)
        if is_valid(state_x,state_y):
            env_action_right_value = (reward_matrix[state_x][state_y] + tolerance_rate*V_pie[state_x][state_y])
        else:
            env_action_right_value = (reward_matrix[i][j] + tolerance_rate*V_pie[i][j])
        
        value_to_action = desired_action_value*0.8+env_action_left_value*0.1+env_action_right_value*0.1        

        value += value_to_action*prob_actions[action]

    return value

def getPolicy(reward_matrix, cell, V_pie,  tolerance_rate = 1):
    
    Q_val = [0]*4
    actions = ['L', 'R', 'U', 'D']
    
    
    for index, action in enumerate(actions):
        state_x, state_y = func(action, cell[0], cell[1])
        currQ_Val = 0
        if isValid(state_x,state_y):
            currQ_Val = (reward_matrix[state_x][state_y] + tolerance_rate*V_pie[state_x][state_y])*0.8
        else:
            currQ_Val = (reward_matrix[cell[0]][cell[1]] + tolerance_rate*V_pie[cell[0]][cell[1]])*0.8
        
        state_x,state_y = func(environment_left[action],cell[0],cell[1])
        if isValid(state_x,state_y):
            currQ_Val = (reward_matrix[state_x][state_y] + tolerance_rate*V_pie[state_x][state_y])*0.1
        else:
            currQ_Val = (reward_matrix[cell[0]][cell[1]] + tolerance_rate*V_pie[cell[0]][cell[1]])*0.1
        
        # environment action with 0.1 probability 
        state_x,state_y = func(environment_right[action],cell[0],cell[1])
        if is_valid(state_x,state_y):
            currQ_Val = (reward_matrix[state_x][state_y] + tolerance_rate*V_pie[state_x][state_y])*0.1
        else:
            currQ_Val = (reward_matrix[cell[0]][cell[1]] + tolerance_rate*V_pie[cell[0]][cell[1]])*0.1
        
        Q_val[index] = currQ_Val
        
    maxValIndex = np.argmax(Q_val)
    return actions[maxValIndex]        
    

# iterative policy evaluation
def iterative_policy_evaluation(iter,theta,reward,reward_matrix,V_pie):
    while True:
        delta = 0
        for i in range(3):
            for j in range(4):
                state = (i,j)
                if state in terminate_states or state in blocked_State:
                    continue
                v = V_pie[i][j]
                V_pie[i][j] = find_value_function(i,j,reward,reward_matrix)
                delta = max(delta,abs(v-V_pie[i][j]))
        iter += 1
        if delta < theta:
            print(f"Total Iterations :{iter}")
            break 
            
    actionsMatrix = [["" for i in range(4)] for j in range(3)]
    for i in range(3):
        for j in range(4):
            action = getPolicy(reward_matrix, [i, j], V_pie)
            actionsMatrix[i][j] = action
    print_values(V_pie)

def update_reward_matrix(reward):
    reward_matrix = [[reward for _ in range(4)] for _ in range(3)]
    reward_matrix[2][3] = 1
    reward_matrix[1][3] = -1
    return reward_matrix
    
def initialize_V_pie():
    V_pie = [[0 for _ in range(4)]for _ in range(3)]
    return V_pie    


rewards = [-0.04,-2,0.1,0.02,1]

if __name__ == "__main__":
    for reward in rewards:
        print(f"At r(S) : {reward}")
        reward_matrix = update_reward_matrix(reward)
        V_pie = initialize_V_pie()
        iterative_policy_evaluation(0,1e-7,reward,reward_matrix,V_pie)
        print("\n************************************\n")

At r(S) : -0.04
Total Iterations :267
[['U', 'U', 'L', 'L'], ['L', 'L', 'L', 'L'], ['U', 'U', 'U', 'L']]
--- --- --- --- --- --- ---
-1.23|-0.83|-0.28| 0.00|
--- --- --- --- --- --- ---
-1.47| 0.00|-0.87| 0.00|
--- --- --- --- --- --- ---
-1.55|-1.47|-1.22|-1.17|

************************************

At r(S) : -2
Total Iterations :339
[['U', 'U', 'U', 'L'], ['L', 'U', 'U', 'L'], ['U', 'U', 'U', 'L']]
--- --- --- --- --- --- ---
-59.71|-46.01|-24.32| 0.00|
--- --- --- --- --- --- ---
-65.41| 0.00|-21.94| 0.00|
--- --- --- --- --- --- ---
-63.10|-52.80|-34.49|-20.75|

************************************

At r(S) : 0.1
Total Iterations :279
[['L', 'D', 'D', 'D'], ['U', 'D', 'L', 'L'], ['R', 'D', 'D', 'D']]
--- --- --- --- --- --- ---
 2.95| 2.39| 1.44| 0.00|
--- --- --- --- --- --- ---
 3.10| 0.00| 0.63| 0.00|
--- --- --- --- --- --- ---
 2.85| 2.20| 1.15| 0.23|

************************************

At r(S) : 0.02
Total Iterations :239
[['L', 'D', 'D', 'D'], ['L', 'L', 'L', 'L'], ['L',