In [1]:
import gym 
import numpy as np
# Build the FrozenLake environment once, reset it, and render the grid as a
# quick sanity check that the environment loads correctly.
env = gym.make("FrozenLake-v0")
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [2]:
# Legend for the rendered grid:
# S is the starting state
# G is the goal
# F is a frozen part of the lake that we can walk over
# H is a hole


# Let's look at the action space and the observation space
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)

Action space:  Discrete(4)
Observation space:  Discrete(16)


In [3]:
# Keep the sizes of the two discrete spaces (their `.n` attribute) in
# module-level constants and echo them.
SZ_ACTION_SPACE = env.action_space.n
SZ_OBS_SPACE = env.observation_space.n
print("Size of Action Space is ",SZ_ACTION_SPACE)
print("Size of Observation space is ",SZ_OBS_SPACE)

Size of Action Space is  4
Size of Observation space is  16


In [4]:
# Action name -> action index, in the order given by the documentation
# (index 0 first).
actions = {
    name: index
    for index, name in enumerate(('Left', 'Down', 'Right', 'Up'))
}

In [5]:
def run_episode(env, policy, gamma = 1.0, render = False):
    """Play one episode following `policy` and return its discounted return.

    Parameters
    ----------
    env : environment exposing `reset()`, `step(action)` and `render()`;
        `step` must return a 4-tuple `(obs, reward, done, info)`.
    policy : indexable by observation; each entry is converted with `int()`
        before being passed to `env.step`.
    gamma : discount factor; the reward of step `t` (starting at 0) is
        weighted by `gamma ** t`.
    render : when true, `env.render()` is called before every step.

    Returns
    -------
    The sum of discounted rewards collected until `done` is reported.
    """
    state = env.reset()
    discounted_return = 0
    t = 0
    finished = False
    while not finished:
        if render:
            env.render()
        action = int(policy[state])
        state, reward, finished, _ = env.step(action)
        discounted_return += (gamma ** t) * reward
        t += 1
    return discounted_return

In [6]:
def evaluate_policy(env, policy, gamma = 1.0, n = 100):
    """Estimate how good `policy` is by averaging over `n` episodes.

    Each episode is played with `run_episode` (rendering disabled) using the
    given discount `gamma`; the mean of the per-episode returns is returned.
    """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(run_episode(env, policy, gamma, False))
    return np.mean(episode_returns)

In [7]:
def extract_policy(v, gamma = 1.0, env = None):
    """Return the policy that is greedy with respect to the state values `v`.

    For every state `s` and action `a` the one-step look-ahead value
    `sum(p * (r + gamma * v[s_]))` is computed over the transitions listed in
    `env.P[s][a]`, and the action with the largest value is selected
    (`np.argmax`, so ties go to the lowest action index).

    Parameters
    ----------
    v : indexable by state; the current state-value estimates.
    gamma : discount factor applied to the successor state's value.
    env : environment exposing `nS`, `nA` and the transition table `P`, where
        `P[s][a]` is an iterable of `(prob, next_state, reward, done)` tuples.
        When omitted, the module-level `env` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    A float `numpy` array of length `env.nS` holding one action index per
    state.
    """
    if env is None:
        # Backward-compatible fallback for callers that do not pass an
        # environment: read the notebook's module-level `env` binding.
        env = globals()["env"]
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        q_sa = np.zeros(env.nA)
        for a in range(env.nA):
            # The `done` flag of each transition is not used here.
            q_sa[a] = sum(p * (r + gamma * v[s_]) for p, s_, r, _ in env.P[s][a])
        policy[s] = np.argmax(q_sa)
    return policy

In [8]:
def compute_policy_v(env, policy, gamma=1.0):
    """Iteratively evaluate `policy` and return its state-value function.

    Starting from all-zero values, every sweep recomputes each state's value
    from the previous sweep's values using the transitions of the action the
    policy picks in that state. Sweeping stops once the summed absolute
    change over all states is at most `1e-10`.

    Parameters
    ----------
    env : environment exposing `nS` and the transition table `P`, where
        `P[s][a]` is an iterable of `(prob, next_state, reward, done)` tuples.
    policy : indexable by state; each entry is an action index. Float-valued
        entries (such as those in the array built by `extract_policy`) are
        accepted and converted with `int()`.
    gamma : discount factor applied to the successor state's value.

    Returns
    -------
    A float `numpy` array of length `env.nS` with the value of each state.
    """
    v = np.zeros(env.nS)
    eps = 1e-10
    while True:
        prev_v = np.copy(v)
        for s in range(env.nS):
            # Cast to a real integer so the action is a valid index/key
            # regardless of the policy array's dtype.
            policy_a = int(policy[s])
            # The `done` flag of each transition is not used here.
            v[s] = sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][policy_a])
        if np.sum(np.fabs(prev_v - v)) <= eps:
            # value converged
            break
    return v

In [9]:
def policy_iteration(env, gamma = 1.0):
    """Run policy iteration on `env` and return the resulting policy.

    Starting from a uniformly random policy, each iteration evaluates the
    current policy with `compute_policy_v` and then replaces it with the
    greedy policy from `extract_policy`. The loop stops (and prints a
    message) as soon as the greedy policy equals the current one.

    Parameters
    ----------
    env : environment exposing `nS` and `nA`; it is passed to
        `compute_policy_v`.
    gamma : discount factor used for both evaluation and improvement.

    Returns
    -------
    An array with one action index per state. If no fixed point is reached
    within 200000 iterations, the last policy computed is returned without
    a message.

    Notes
    -----
    `extract_policy` is called without an environment here, so it reads the
    module-level `env`; that must be the same environment passed to this
    function.
    """
    policy = np.random.choice(env.nA, size=(env.nS))  # initialize a random policy
    max_iterations = 200000
    # The caller's `gamma` is used as given; it must not be overwritten here.
    for i in range(max_iterations):
        old_policy_v = compute_policy_v(env, policy, gamma)
        new_policy = extract_policy(old_policy_v, gamma)
        if np.all(policy == new_policy):
            print ('Policy-Iteration converged at step %d.' %(i+1))
            break
        policy = new_policy
    return policy

In [16]:
# Solve the 8x8 FrozenLake with policy iteration, then estimate the resulting
# policy's average undiscounted return over 10000 episodes.
env_name  = 'FrozenLake8x8-v0'
# Rebinds the module-level `env`, which extract_policy reads when it is
# called without an environment.
env = gym.make(env_name)
optimal_policy = policy_iteration(env, gamma = 1.0)
# evaluate_policy already returns the mean over episodes, so `scores` is a
# single number and the np.mean below leaves it unchanged.
scores = evaluate_policy(env, optimal_policy, gamma = 1.0,n = 10000)
print('Average scores = ', np.mean(scores))

Policy-Iteration converged at step 12.
Average scores =  0.8811
