In [113]:
import numpy as np

In [214]:
# Environment layout: a line of states whose first and last entries are the
# terminal states (see generate_visualization below).
# NOTE(review): num_actions is not read by the cells visible here -- presumably
# it documents the two moves ('left' / 'right'); confirm before removing.
num_states, num_actions = 6, 2

# Rewards collected in the left terminal state, the right terminal state, and
# every non-terminal state respectively.
terminal_left_reward, terminal_right_reward = 100, 40
each_step_reward = 0

# Discount factor applied once per step.
gamma = 0.5

# Probability of going in the wrong direction.
misstep_prob = 0

def _discounted_return(rewards, gamma):
    """Discounted sum of a reward sequence: sum over t of gamma**t * rewards[t].

    Args:
        rewards: Rewards in the order they are collected, starting with the
            reward of the state the agent is currently in.
        gamma: Discount factor applied once per step.

    Returns:
        The discounted return as a NumPy float64 scalar.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    discount_factors = np.array([gamma**t for t in range(len(rewards))])
    return np.sum(rewards * discount_factors)


def generate_visualization(
    terminal_left_reward,
    terminal_right_reward,
    each_step_reward,
    gamma,
    misstep_prob,
    n_states=None,
    ):
    """Print state values, greedy actions and Q(s, a) for a 1-D chain of states.

    States are numbered 1..n_states from left to right. State 1 and state
    n_states are terminal; every other state yields ``each_step_reward``.
    The reward of a state is collected while the agent is in it, so a path's
    return is ``R(s0) + gamma * R(s1) + gamma**2 * R(s2) + ...``.

    For each non-terminal state, Q(s, a) is the better of two deterministic
    candidate paths:

    * take ``a`` and keep going in that direction until the terminal state;
    * take ``a`` once, step back, then go the opposite way until the terminal
      state. This path is only considered when the neighbour reached by ``a``
      is not terminal, because the episode would otherwise already be over.

    Limitations:
        * Transitions are treated as deterministic. ``misstep_prob`` is
          accepted for interface compatibility but is NOT used in the
          computation; a warning is emitted when it is non-zero.
        * Only the candidate paths listed above are examined.

    Args:
        terminal_left_reward: Reward of the leftmost (terminal) state.
        terminal_right_reward: Reward of the rightmost (terminal) state.
        each_step_reward: Reward of every non-terminal state.
        gamma: Discount factor.
        misstep_prob: Probability of going in the wrong direction (unused,
            see Limitations).
        n_states: Total number of states including both terminals. When
            omitted, the notebook-level ``num_states`` is used, which is the
            value the function always used before this parameter existed.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If fewer than two states are requested.
    """
    if n_states is None:
        # Backward-compatible default: fall back to the notebook-level constant.
        n_states = num_states
    if n_states < 2:
        raise ValueError(
            f"n_states must be at least 2 (two terminal states), got {n_states}"
        )
    if misstep_prob != 0:
        import warnings

        warnings.warn(
            "misstep_prob is not used: returns are computed for deterministic "
            "moves only.",
            stacklevel=2,
        )

    # float32 is kept so the printed 'step rewards' line looks the same as before.
    reward_steps = np.array([terminal_left_reward] +
                            [each_step_reward] * (n_states - 2) +
                            [terminal_right_reward]).astype(np.float32)

    q_left_values = []   # Q(s, 'left') per state
    q_right_values = []  # Q(s, 'right') per state
    state_values = []    # max over actions of Q(s, a) per state
    actions = []         # greedy action per state, 'None' for terminal states

    for state in range(1, n_states + 1):
        if state == 1:
            q_left = q_right = terminal_left_reward
            action = 'None'
        elif state == n_states:
            q_left = q_right = terminal_right_reward
            action = 'None'
        else:
            idx = state - 1  # 0-based position of `state` in reward_steps

            # Rewards met when walking straight to either terminal state,
            # starting with the reward of the current state.
            to_left = reward_steps[:state][::-1]
            to_right = reward_steps[idx:]
            all_left = _discounted_return(to_left, gamma)
            all_right = _discounted_return(to_right, gamma)

            # One step left, back to `state`, then all the way right.
            # Impossible when the left neighbour is the terminal state; -inf
            # (rather than 0) keeps the impossible path from winning the max
            # when the real returns are negative.
            if state - 1 == 1:
                left_then_right = -np.inf
            else:
                left_then_right = _discounted_return(
                    [reward_steps[idx], reward_steps[idx - 1], *to_right], gamma
                )

            # One step right, back to `state`, then all the way left.
            if state + 1 == n_states:
                right_then_left = -np.inf
            else:
                right_then_left = _discounted_return(
                    [reward_steps[idx], reward_steps[idx + 1], *to_left], gamma
                )

            q_left = np.max([all_left, left_then_right])
            q_right = np.max([all_right, right_then_left])
            # Ties go to 'left'.
            action = 'left' if q_left >= q_right else 'right'

        q_left_values.append(q_left)
        q_right_values.append(q_right)
        actions.append(action)
        # .item() yields plain Python numbers so the printed list does not
        # depend on how the installed NumPy version renders its scalars.
        state_values.append(np.max([q_left, q_right]).item())

    # NOTE: the list printed under the 'optimal policy' label holds the best
    # achievable return per state; the policy itself is the 'actions' list.
    print('----------------- optimal policy -----------------')
    print(f'optimal policy: {state_values}')
    print(f'actions: {actions}')
    print(f'step rewards: {reward_steps}')

    print('----------------- state-action value function (Q(s, a)) -----------------')
    left_list = [float("{:0.2f}".format(q)) for q in q_left_values]
    right_list = [float("{:0.2f}".format(q)) for q in q_right_values]
    state_action = list(zip(left_list, right_list))
    print(f'state-action: {state_action}')
    print(f'step rewards: {reward_steps}')

In [215]:
# Run the analysis with the notebook-level settings, binding each one by name
# so the call does not depend on argument order.
generate_visualization(
    terminal_left_reward=terminal_left_reward,
    terminal_right_reward=terminal_right_reward,
    each_step_reward=each_step_reward,
    gamma=gamma,
    misstep_prob=misstep_prob,
)

----------------- optimal policy -----------------
optimal policy: [100, 50.0, 25.0, 12.5, 20.0, 40]
actions: ['None', 'left', 'left', 'left', 'right', 'None']
step rewards: [100.   0.   0.   0.   0.  40.]
----------------- state-action value function (Q(s, a)) -----------------
state-action: [(100.0, 100.0), (50.0, 12.5), (25.0, 6.25), (12.5, 10.0), (6.25, 20.0), (40.0, 40.0)]
step rewards: [100.   0.   0.   0.   0.  40.]
