In [30]:
# Author: Yilin ZHENG
# Reference: https://github.com/llSourcell/navigating_a_virtual_world_with_dynamic_programming
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
env = gym.make('FrozenLake-v0')
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [22]:
# Action mapping
action_mapping = {
    0: '\u2191',  # ↑ 
    1: '\u2192',  # →
    2: '\u2193',  # ↓
    3: '\u2190',  # ←
}

In [4]:
# Utilities
def one_step_lookahead(env, state, V, discount):
    n_actions = env.nA
    action_values = np.zeros(shape=n_actions)
    for action in range(n_actions):
        for prob, next_state, reward, done in env.P[state][action]:
            action_values[action] += prob * (reward + discount * V[next_state])
    return action_values

# Exercise 1

## 1. Value Iteration

### Algorithnm:

    choose initial estimate of optimal value function

    repeat until change in values is sufficently small {

        for each state {
    
            calculate the maximum expected value of neighboring state for each possible action
        
            use maximal value of the list to update estimate of optimal value function
      
        } each state

    } convergence

    calculate optimal value frunction from Bellmans' Equation


In [10]:
def value_iteration(env, discount=1.0, theta=1e-9, max_iter=1e9):
    V = np.zeros(env.nS)
    for i in range(int(max_iter)):
        delta = 0
        for state in range(env.nS):
            action_value = one_step_lookahead(env, state, V, discount)
            best_action_value = np.max(action_value)
            delta = max(delta, np.abs(V[state] - best_action_value))
            V[state] = best_action_value
        if delta < theta:
            print(f'Value Iteration converged at iteration #{i}.')
            break
    policy = np.zeros([env.nS, env.nA])
    for state in range(env.nS):
        action_value = one_step_lookahead(env, state, V, discount)
        best_action = np.argmax(action_value)
        policy[state, best_action] = 1.0
    return policy, V

## 2. Policy Iteration

### Alogorithm:

    choose initial policy & value function

    repeat until policy is stable {
      1. Policy evaluation:

      repeat until change in values is sufficiently small {

        for each state {

          calculate the value of neighboring states

          when taking actions according to current policy.

          update estimate of optimal value function.

        } each state

      } converge

      2. Policy improvement:

      New policy according to Bellmans Equation,

      assuming V^* ≈ current V^π

    } policy


In [11]:
def policy_eval(policy, env, discount=1.0, theta=1e-9, max_iter=1e9):
    eval_iter = 1
    V = np.zeros(env.nS)
    for i in range(int(max_iter)):
        delta = 0
        for state in range(env.nS):
            v = 0
            for action, action_prob in enumerate(policy[state]):
                for state_prob, next_state, reward, done in env.P[state][action]:
                    v += action_prob * state_prob * (reward + discount * V[next_state])
            delta = max(delta, np.abs(V[state] - v))
            V[state] = v
        eval_iter += 1
        if delta < theta:
            print(f'Policy evaluated in {eval_iter} iterations.')
            return V

def policy_iteration(env, discount=1.0, max_iter=1e9):
    policy = np.ones([env.nS, env.nA]) / env.nA
    eval_policies = 1
    for i in range(int(max_iter)):
        stable_policy = True
        V = policy_eval(policy, env, discount=discount)
        for state in range(env.nS):
            current_action = np.argmax(policy[state])
            action_value = one_step_lookahead(env, state, V, discount)
            best_action = np.argmax(action_value)
            if current_action != best_action:
                stable_policy = True
            policy[state] = np.eye(env.nA)[best_action]
        eval_policies += 1
        if stable_policy:
            print(f'Evaluated {eval_policies} policies.')
            return policy, V

# Exercise 2

In [27]:
def play_episodes(env, n_episodes, policy):
    wins = 0
    total_reward = 0
    for episode in range(n_episodes):
        done = False
        state = env.reset()
        while not done:
            action = np.argmax(policy[state])
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            state = next_state
            if done and reward == 1.0:
                wins += 1
    average_reward = total_reward / n_episodes
    return wins, total_reward, average_reward

n_episodes = 20000

solvers = [('Value Iteration', value_iteration), ('Policy Iteration', policy_iteration)]

for iter_name, iter_func in solvers:
    env = gym.make('FrozenLake-v0')
    policy, V = iter_func(env.env)
    print(f'Final policy derived by {iter_name}:')
    print(' '.join([action_mapping[action] for action in np.argmax(policy, axis=1)]))
    wins, total_reward, average_reward = play_episodes(env, n_episodes, policy)
    print(f'Number of wins over {n_episodes} episodes: {wins}')
    print(f'Average reward over {n_episodes} episodes: {average_reward}\n')

Value Iteration converged at iteration #523.
Final policy derived by Value Iteration:
↑ ← ← ← ↑ ↑ ↑ ↑ ← → ↑ ↑ ↑ ↓ → ↑
Number of wins over 20000 episodes: 14736
Average reward over 20000 episodes: 0.7368

Policy evaluated in 66 iterations.
Evaluated 2 policies.
Final policy derived by Policy Iteration:
↑ ← ↑ ← ↑ ↑ ↑ ↑ ← → ↑ ↑ ↑ ↓ → ↑
Number of wins over 20000 episodes: 14632
Average reward over 20000 episodes: 0.7316

