In [1]:
import gym 
import numpy as np
# Create the FrozenLake environment, reset it to its start state, and render
# the grid once as a quick sanity check that everything loads.
env = gym.make("FrozenLake-v0")
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [2]:
# S is the initial (start) state
# G is the goal
# F is a frozen part of the lake that we can walk over
# H is a hole


# Let's look at the action space and the observation space
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)

Action space:  Discrete(4)
Observation space:  Discrete(16)


In [3]:
# Keep the number of discrete actions and observations around as constants.
SZ_ACTION_SPACE, SZ_OBS_SPACE = env.action_space.n, env.observation_space.n
print("Size of Action Space is ", SZ_ACTION_SPACE)
print("Size of Observation space is ", SZ_OBS_SPACE)

Size of Action Space is  4
Size of Observation space is  16


In [4]:
# Action name -> integer code, as listed in the environment documentation.
actions = {
    name: code
    for code, name in enumerate(('Left', 'Down', 'Right', 'Up'))
}

In [24]:
def run_episode(env, policy, gamma = 1.0, render = False):
    """Play a single episode by following ``policy`` and return its discounted reward.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple ``(obs, reward, done, info)``) and ``render()``.
        policy: Indexable by observation; ``int(policy[obs])`` is the action taken.
        gamma: Discount factor; the reward of step ``k`` (0-based) is weighted
            by ``gamma ** k``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The sum of discounted rewards collected until ``done`` is reported.

    Side effects:
        Prints the number of steps the episode took.
    """
    state = env.reset()
    discounted_return = 0
    steps_taken = 0
    finished = False

    while not finished:
        if render:
            env.render()

        action = int(policy[state])
        state, reward, finished, _ = env.step(action)

        # Weight by the pre-increment step index so the first reward is undiscounted.
        discounted_return += (gamma ** steps_taken * reward)
        steps_taken += 1

    print("Took ",steps_taken," Steps.")
    return discounted_return

In [17]:
def evaluate_policy(env, policy, gamma = 1.0,  n = 100):
    """Estimate a policy's quality as its mean episode return.

    Runs ``n`` independent episodes with ``run_episode`` (rendering disabled)
    and returns ``np.mean`` of the per-episode discounted rewards.
    """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(run_episode(env, policy, gamma = gamma, render = False))
    return np.mean(episode_returns)


In [18]:
def extract_policy(v, gamma = 1.0, env = None):
    """Derive the greedy policy for a given state-value function.

    For every state, the expected one-step return of each action is computed
    from the environment's transition table and the action with the largest
    value is chosen (ties resolve to the lowest action index, per ``np.argmax``).

    Args:
        v: Sequence of state values, indexed by state.
        gamma: Discount factor applied to the value of the successor state.
        env: Environment exposing ``nS``, ``action_space.n`` and a transition
            table ``P[s][a]`` of ``(prob, next_state, reward, done)`` tuples.
            When omitted, the module-level ``env`` is used, which preserves the
            original two-argument calling convention.

    Returns:
        A float ``numpy`` array of length ``env.nS`` holding the chosen action
        index for each state.

    Raises:
        NameError: If ``env`` is not passed and no module-level ``env`` exists.
    """
    if env is None:
        # Backward-compatible fallback: earlier callers relied on the
        # notebook-global environment rather than passing one in.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError("name 'env' is not defined") from None

    n_actions = env.action_space.n
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        q_sa = np.zeros(n_actions)
        for a in range(n_actions):
            # Expected return of taking action a in state s.
            for p, s_, r, _ in env.P[s][a]:
                q_sa[a] += (p * (r + gamma * v[s_]))
        policy[s] = np.argmax(q_sa)
    return policy

In [19]:
def value_iteration(env, gamma = 1.0, max_iterations = 100000, eps = 1e-20):
    """Compute state values by value iteration over the environment's model.

    Each sweep applies the Bellman optimality backup
    ``v[s] = max_a sum(p * (r + gamma * v_prev[s']))`` to every state, using
    the values from the previous sweep. Iteration stops once the summed
    absolute change over all states is at most ``eps``, or after
    ``max_iterations`` sweeps.

    Args:
        env: Environment exposing ``nS``, ``nA`` and a transition table
            ``P[s][a]`` of ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to successor-state values. With the
            default of 1.0 the backup is undiscounted.
        max_iterations: Upper bound on the number of sweeps.
        eps: Convergence threshold on the summed absolute value change.

    Returns:
        A ``numpy`` array of length ``env.nS`` with the estimated state values.

    Side effects:
        Prints a message with the sweep count when convergence is reached.
    """
    v = np.zeros(env.nS)
    for i in range(max_iterations):
        prev_v = np.copy(v)
        for s in range(env.nS):
            # The discount must be applied here; without it the gamma
            # argument would have no effect on the computed values.
            q_sa = [
                sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][a])
                for a in range(env.nA)
            ]
            v[s] = max(q_sa)
        if (np.sum(np.fabs(prev_v - v)) <= eps):
            print ('Value-iteration converged at iteration# %d.' %(i+1))
            break
    return v

In [20]:
# Solve the 8x8 variant: run value iteration, take the greedy policy with
# respect to the resulting values, then average its return over 1000 episodes.
env_name = 'FrozenLake8x8-v0'
gamma = 1.0
env = gym.make(env_name)

optimal_v = value_iteration(env, gamma=gamma)
policy = extract_policy(optimal_v, gamma=gamma)
policy_score = evaluate_policy(env, policy, gamma=gamma, n=1000)
print('Policy average score = ', policy_score)

Value-iteration converged at iteration# 2357.
Policy average score =  0.853


In [42]:
# Re-evaluate the same policy over 100 fresh episodes.
accuracy = evaluate_policy(env, policy, gamma=gamma, n=100)
print("Accuracy : ", accuracy)

Took  200  Steps.
Took  153  Steps.
Took  118  Steps.
Took  196  Steps.
Took  18  Steps.
Took  85  Steps.
Took  145  Steps.
Took  200  Steps.
Took  107  Steps.
Took  79  Steps.
Took  94  Steps.
Took  103  Steps.
Took  106  Steps.
Took  74  Steps.
Took  67  Steps.
Took  67  Steps.
Took  136  Steps.
Took  128  Steps.
Took  54  Steps.
Took  89  Steps.
Took  55  Steps.
Took  104  Steps.
Took  78  Steps.
Took  200  Steps.
Took  200  Steps.
Took  125  Steps.
Took  65  Steps.
Took  93  Steps.
Took  54  Steps.
Took  35  Steps.
Took  70  Steps.
Took  160  Steps.
Took  75  Steps.
Took  68  Steps.
Took  84  Steps.
Took  107  Steps.
Took  181  Steps.
Took  135  Steps.
Took  93  Steps.
Took  60  Steps.
Took  90  Steps.
Took  139  Steps.
Took  166  Steps.
Took  60  Steps.
Took  27  Steps.
Took  67  Steps.
Took  131  Steps.
Took  200  Steps.
Took  200  Steps.
Took  116  Steps.
Took  79  Steps.
Took  125  Steps.
Took  145  Steps.
Took  200  Steps.
Took  78  Steps.
Took  87  Steps.
Took  197  Steps.
To

## The maximum number of steps shown above is 200 because the environment automatically terminates an episode after 200 moves, so an episode can never run (and render) indefinitely.