In [3]:
import gym 
import numpy as np
# Sanity check: build the small FrozenLake environment once, reset it and
# render it so the grid layout (printed below this cell) can be inspected.
env = gym.make("FrozenLake-v0")
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
# Legend for the grid rendered above:
#   S - start tile (the initial state)
#   G - goal tile
#   F - frozen part of the lake; safe to walk over
#   H - hole


# Look at the action space and the observation space of the environment.
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)

Action space:  Discrete(4)
Observation space:  Discrete(16)


In [5]:
# Keep the sizes of the two discrete spaces in named constants and report them.
SZ_ACTION_SPACE, SZ_OBS_SPACE = env.action_space.n, env.observation_space.n
print("Size of Action Space is ", SZ_ACTION_SPACE)
print("Size of Observation space is ", SZ_OBS_SPACE)

Size of Action Space is  4
Size of Observation space is  16


In [6]:
#From the documentation 
# Action name -> integer action code (order taken from the documentation).
actions = {
    name: code
    for code, name in enumerate(('Left', 'Down', 'Right', 'Up'))
}

In [7]:
def run_episode(env, policy, gamma = 1.0, render = False, verbose = True):
    """Play a single episode by following ``policy`` and return its discounted return.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` (returns the first observation),
        ``step(action)`` (returns a 4-tuple ``(obs, reward, done, info)``)
        and, when ``render`` is True, ``render()``.
    policy : indexable
        ``policy[obs]`` is the action to take in observation ``obs``; it is
        converted with ``int`` before being handed to ``env.step``.
    gamma : float
        Discount factor. The reward collected at 0-based step ``t`` is
        weighted by ``gamma ** t``.
    render : bool
        When True, ``env.render()`` is called before every step.
    verbose : bool
        When True (the default, matching the historical behaviour) the
        episode length is printed once the episode ends. Pass False when
        running many episodes so the output is not flooded.

    Returns
    -------
    The sum of the discounted rewards collected during the episode.
    """
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    done = False
    while not done:
        if render:
            env.render()
        obs, reward, done, _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_idx * reward)
        step_idx += 1

    if verbose:
        print("Took ",step_idx," Steps.")
    return total_reward

In [8]:
def evaluate_policy(env, policy, gamma = 1.0,  n = 100):
    """Run ``n`` episodes with ``policy`` on ``env`` and return the mean episode return.

    Each episode is played through ``run_episode`` with rendering disabled
    and the given discount factor ``gamma``.
    """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(run_episode(env, policy, gamma = gamma, render = False))
    return np.mean(episode_returns)


In [17]:
def extract_policy(v, gamma = 1.0, env = None):
    """Return the greedy policy with respect to the state-value function ``v``.

    For every state ``s`` the one-step look-ahead value of each action is
    computed as ``sum(p * (r + gamma * v[s'])`` over the transitions listed
    in ``env.P[s][a]``, and the action with the largest value is selected
    (``np.argmax``, so ties go to the lowest action index).

    Parameters
    ----------
    v : indexable
        ``v[s]`` is the value of state ``s``.
    gamma : float
        Discount factor applied to the value of the successor state.
    env : object, optional
        Environment providing ``nS``, ``action_space.n`` and the transition
        table ``P``, where ``P[s][a]`` yields 4-tuples whose first three
        items are (probability, next state, reward); the fourth is ignored.
        When omitted, the notebook-level ``env`` variable is used, which is
        what this function always did before the parameter existed.

    Returns
    -------
    numpy.ndarray of length ``env.nS`` (float dtype) holding one action
    index per state.
    """
    if env is None:
        # Backward compatibility: older calls pass no environment and rely
        # on the module/notebook global named ``env``.
        env = globals().get("env")
        if env is None:
            raise NameError(
                "extract_policy needs an environment: pass env=... "
                "or define a global `env`"
            )

    n_actions = env.action_space.n
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        q_sa = np.zeros(n_actions)
        for a in range(n_actions):
            for p, s_, r, _ in env.P[s][a]:
                q_sa[a] += (p * (r + gamma * v[s_]))
        policy[s] = np.argmax(q_sa)
    return policy

In [18]:
def value_iteration(env, gamma = 1.0, max_iterations = 100000, eps = 1e-20):
    """Compute state values by repeated Bellman optimality backups.

    Every sweep rebuilds the value of each state from the previous sweep's
    values (a synchronous update):

        v[s] = max_a sum(p * (r + gamma * prev_v[s']))

    where the sum runs over the transitions in ``env.P[s][a]``.

    Parameters
    ----------
    env : object
        Environment providing ``nS`` (number of states), ``nA`` (number of
        actions) and the transition table ``P``, where ``P[s][a]`` yields
        4-tuples whose first three items are (probability, next state,
        reward); the fourth item is ignored.
    gamma : float
        Discount factor applied to the successor state's value.
    max_iterations : int
        Upper bound on the number of sweeps.
    eps : float
        The loop stops as soon as the summed absolute change of all state
        values in one sweep is ``<= eps``.

    Returns
    -------
    numpy.ndarray of length ``env.nS`` with the final state values. If the
    tolerance is not reached within ``max_iterations`` sweeps, the values of
    the last sweep are returned and no convergence message is printed.
    """
    v = np.zeros(env.nS)
    for i in range(max_iterations):
        prev_v = np.copy(v)
        for s in range(env.nS):
            # The discount factor must scale the successor value; without it
            # the `gamma` argument would have no effect on the result.
            q_sa = [
                sum(p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.P[s][a])
                for a in range(env.nA)
            ]
            v[s] = max(q_sa)
        if (np.sum(np.fabs(prev_v - v)) <= eps):
            print ('Value-iteration converged at iteration# %d.' %(i+1))
            break
    return v

In [19]:
# Solve the 8x8 variant: compute the state values, derive the greedy policy
# from them, then estimate that policy's average return over 1000 episodes.
env_name  = 'FrozenLake8x8-v0'
gamma = 1.0
# NOTE: this rebinds the notebook-level `env`. extract_policy() is called
# below without an environment argument and reads that global, so this
# assignment has to run before it.
env = gym.make(env_name)
optimal_v = value_iteration(env, gamma);
policy = extract_policy(optimal_v, gamma)
policy_score = evaluate_policy(env, policy, gamma, n=1000)
print('Policy average score = ', policy_score)

Value-iteration converged at iteration# 2357.
Took  191  Steps.
Took  113  Steps.
Took  121  Steps.
Took  164  Steps.
Took  110  Steps.
Took  98  Steps.
Took  148  Steps.
Took  55  Steps.
Took  91  Steps.
Took  68  Steps.
Took  80  Steps.
Took  200  Steps.
Took  68  Steps.
Took  106  Steps.
Took  53  Steps.
Took  134  Steps.
Took  200  Steps.
Took  61  Steps.
Took  85  Steps.
Took  101  Steps.
Took  108  Steps.
Took  153  Steps.
Took  136  Steps.
Took  156  Steps.
Took  82  Steps.
Took  83  Steps.
Took  128  Steps.
Took  79  Steps.
Took  200  Steps.
Took  166  Steps.
Took  200  Steps.
Took  57  Steps.
Took  190  Steps.
Took  135  Steps.
Took  200  Steps.
Took  171  Steps.
Took  188  Steps.
Took  66  Steps.
Took  73  Steps.
Took  34  Steps.
Took  70  Steps.
Took  90  Steps.
Took  130  Steps.
Took  98  Steps.
Took  63  Steps.
Took  74  Steps.
Took  95  Steps.
Took  56  Steps.
Took  151  Steps.
Took  140  Steps.
Took  31  Steps.
Took  94  Steps.
Took  37  Steps.
Took  35  Steps.
Took  76 

Took  200  Steps.
Took  128  Steps.
Took  48  Steps.
Took  141  Steps.
Took  35  Steps.
Took  64  Steps.
Took  41  Steps.
Took  64  Steps.
Took  56  Steps.
Took  127  Steps.
Took  130  Steps.
Took  88  Steps.
Took  28  Steps.
Took  51  Steps.
Took  49  Steps.
Took  200  Steps.
Took  72  Steps.
Took  59  Steps.
Took  82  Steps.
Took  99  Steps.
Took  83  Steps.
Took  75  Steps.
Took  200  Steps.
Took  163  Steps.
Took  200  Steps.
Took  98  Steps.
Took  112  Steps.
Took  141  Steps.
Took  37  Steps.
Took  200  Steps.
Took  139  Steps.
Took  104  Steps.
Took  155  Steps.
Took  54  Steps.
Took  157  Steps.
Took  92  Steps.
Took  104  Steps.
Took  161  Steps.
Took  49  Steps.
Took  132  Steps.
Took  200  Steps.
Took  76  Steps.
Took  58  Steps.
Took  152  Steps.
Took  145  Steps.
Took  61  Steps.
Took  200  Steps.
Took  120  Steps.
Took  110  Steps.
Took  76  Steps.
Took  200  Steps.
Took  52  Steps.
Took  135  Steps.
Took  84  Steps.
Took  94  Steps.
Took  97  Steps.
Took  175  Steps.
Too

In [20]:
# Re-estimate the policy's mean episode return on a fresh batch of 100 episodes.
print("Accuracy : ", evaluate_policy(env, policy, gamma, n=100))

Took  200  Steps.
Took  79  Steps.
Took  200  Steps.
Took  135  Steps.
Took  200  Steps.
Took  77  Steps.
Took  146  Steps.
Took  103  Steps.
Took  200  Steps.
Took  130  Steps.
Took  118  Steps.
Took  56  Steps.
Took  71  Steps.
Took  76  Steps.
Took  184  Steps.
Took  200  Steps.
Took  118  Steps.
Took  65  Steps.
Took  72  Steps.
Took  182  Steps.
Took  175  Steps.
Took  146  Steps.
Took  93  Steps.
Took  200  Steps.
Took  51  Steps.
Took  200  Steps.
Took  135  Steps.
Took  94  Steps.
Took  150  Steps.
Took  67  Steps.
Took  54  Steps.
Took  200  Steps.
Took  71  Steps.
Took  122  Steps.
Took  48  Steps.
Took  83  Steps.
Took  116  Steps.
Took  79  Steps.
Took  168  Steps.
Took  173  Steps.
Took  89  Steps.
Took  42  Steps.
Took  63  Steps.
Took  143  Steps.
Took  53  Steps.
Took  200  Steps.
Took  194  Steps.
Took  27  Steps.
Took  134  Steps.
Took  105  Steps.
Took  146  Steps.
Took  133  Steps.
Took  74  Steps.
Took  60  Steps.
Took  200  Steps.
Took  200  Steps.
Took  200  Step

## The maximum number of steps shown above is 200 because the environment automatically terminates an episode after 200 moves, so an episode can never run indefinitely.