Value Iteration:

Helps find the optimal policy: the policy that, at each state, takes the action that maximizes the expected return.
To achieve this, we need to find the optimal values of the states that may follow the present state. At the beginning of the algorithm, we initialize an estimate of the value of every state. Then, we iterate through all the states that are not the goal state, updating their values according to the formula: $$V(s) = \max_{a \in A} \sum_{s', r} p(s', r \mid s, a) \left[ r + \gamma V(s') \right]$$




In [None]:
# Value table for the 5x5 grid: one estimate per (row, col) state, all starting at zero.
state_values = np.zeros((5, 5))

In [None]:
def value_iteration(policy_probs, state_values, theta=1e-6, gamma=0.99,
                    *, grid_shape=(5, 5), n_actions=4):
    """Run value iteration, updating ``state_values`` and ``policy_probs`` in place.

    Every sweep visits each ``(row, col)`` state of the grid, evaluates each
    action with a one-step lookahead through the global ``env``, stores the
    best lookahead value as the new state value, and writes a one-hot
    (greedy) action distribution for that state into ``policy_probs``.
    Sweeps repeat until the largest value change in a sweep is no greater
    than ``theta``.

    Args:
        policy_probs: Container indexed by ``(row, col)``; receives a
            length-``n_actions`` one-hot array for each state.
        state_values: Container indexed by ``(row, col)`` and by the next
            state returned from ``env.simulate_step``; read and overwritten.
        theta: Convergence threshold on the largest per-sweep value change.
        gamma: Discount factor applied to the next state's value.
        grid_shape: ``(n_rows, n_cols)`` of the state grid to sweep.
        n_actions: Number of actions, tried as ``0 .. n_actions - 1``.

    Returns:
        None. Results are written into the two containers.
    """
    n_rows, n_cols = grid_shape
    delta = float("inf")
    while delta > theta:
        delta = 0
        for row in range(n_rows):
            for col in range(n_cols):
                state = (row, col)
                old_value = state_values[state]

                best_action = None
                max_qsa = float("-inf")
                for action in range(n_actions):
                    # A single simulated step per action: the transition is
                    # treated as deterministic here.
                    next_state, reward, _, _ = env.simulate_step(state, action)
                    qsa = reward + gamma * state_values[next_state]

                    # Strict '>' keeps the lowest-indexed action on ties.
                    if qsa > max_qsa:
                        max_qsa = qsa
                        best_action = action

                # Build the one-hot distribution once, after the best action
                # is known, instead of re-allocating on every improvement.
                action_probs = np.zeros(n_actions)
                action_probs[best_action] = 1.

                state_values[state] = max_qsa
                policy_probs[state] = action_probs

                delta = max(delta, abs(max_qsa - old_value))

Policy Iteration: a process that alternately improves the estimated values and the policy

In [None]:
def policy_evaluation(policy_probs, state_values, theta=1e-6, gamma=0.99,
                      *, grid_shape=(5, 5)):
    """Evaluate the policy in ``policy_probs``, updating ``state_values`` in place.

    Every sweep visits each ``(row, col)`` state and replaces its value with
    the probability-weighted sum, over the actions listed for that state, of
    ``reward + gamma * value(next_state)``. The next state and reward come
    from a one-step lookahead through the global ``env``. Sweeps repeat until
    the largest value change in a sweep is no greater than ``theta``.

    Args:
        policy_probs: Container indexed by ``(row, col)``; each entry is an
            iterable of action probabilities whose position is the action id.
        state_values: Container indexed by ``(row, col)`` and by the next
            state returned from ``env.simulate_step``; read and overwritten.
        theta: Convergence threshold on the largest per-sweep value change.
        gamma: Discount factor applied to the next state's value.
        grid_shape: ``(n_rows, n_cols)`` of the state grid to sweep.

    Returns:
        None. Results are written into ``state_values``.
    """
    n_rows, n_cols = grid_shape
    delta = float("inf")
    while delta > theta:
        delta = 0

        for row in range(n_rows):
            for col in range(n_cols):
                state = (row, col)
                old_value = state_values[state]
                new_value = 0.

                for action, prob in enumerate(policy_probs[state]):
                    next_state, reward, _, _ = env.simulate_step(state, action)
                    new_value += prob * (reward + gamma * state_values[next_state])

                # Written back immediately, so later states in the same sweep
                # already see this updated value.
                state_values[state] = new_value

                delta = max(delta, abs(old_value - new_value))

In [None]:
def policy_improvement(policy_probs, state_values, gamma=0.99,
                       *, grid_shape=(5, 5), n_actions=4):
    """Make ``policy_probs`` greedy with respect to ``state_values``, in place.

    For each ``(row, col)`` state, every action is scored with a one-step
    lookahead through the global ``env`` (``reward + gamma * value(next_state)``)
    and the state's policy entry is replaced by a one-hot distribution on the
    best-scoring action.

    Args:
        policy_probs: Container indexed by ``(row, col)``; each entry must
            support ``.argmax()`` and is overwritten with a one-hot array of
            length ``n_actions``.
        state_values: Container indexed by the next state returned from
            ``env.simulate_step``; only read.
        gamma: Discount factor applied to the next state's value.
        grid_shape: ``(n_rows, n_cols)`` of the state grid to sweep.
        n_actions: Number of actions, tried as ``0 .. n_actions - 1``.

    Returns:
        True if no state's greedy action differs from the argmax of its
        previous policy entry, False otherwise.
    """
    n_rows, n_cols = grid_shape
    policy_stable = True

    for row in range(n_rows):
        for col in range(n_cols):
            state = (row, col)
            old_action = policy_probs[state].argmax()

            new_action = None
            max_qsa = float("-inf")

            for action in range(n_actions):
                next_state, reward, _, _ = env.simulate_step(state, action)
                qsa = reward + gamma * state_values[next_state]

                # Strict '>' keeps the lowest-indexed action on ties.
                if qsa > max_qsa:
                    new_action = action
                    max_qsa = qsa

            action_probs = np.zeros(n_actions)
            action_probs[new_action] = 1.
            policy_probs[state] = action_probs

            if new_action != old_action:
                policy_stable = False

    return policy_stable


In [None]:
def policy_iteration(policy_probs, state_values, theta=1e-6, gamma=0.99):
    """Alternate policy evaluation and policy improvement until the policy is stable.

    Each round evaluates the current policy, plots the resulting values,
    improves the policy greedily, and plots the policy. The loop ends after
    the first round in which ``policy_improvement`` reports no change.
    ``policy_probs`` and ``state_values`` are updated in place by the helpers;
    ``frame`` is read from the enclosing (global) scope for plotting.
    """
    while True:
        policy_evaluation(policy_probs, state_values, theta, gamma)
        plot_values(state_values, frame)

        stable = policy_improvement(policy_probs, state_values, gamma)
        plot_policy(policy_probs, frame)

        if stable:
            break

Essentially, dynamic programming computes in advance the result of taking every action in every state, without having to actually perform the actions.