In [2]:
import gym
import numpy as np

In [3]:
# Instantiate the 'FrozenLake-v0' environment through gym.make.
env = gym.make('FrozenLake-v0')

[2018-10-18 22:45:44,947] Making new env: FrozenLake-v0


In [16]:
# Switch to the underlying (unwrapped) environment object; the cells below read
# its transition table `env.P`.
env = env.unwrapped

In [4]:
# Size of the observation space; used below as the number of states.
print(env.observation_space.n)

16


In [6]:
# Size of the action space; used below as the number of actions.
print(env.action_space.n)

4


In [None]:
# Define value_iteration(), a function that returns the optimal value function (the value table)

In [9]:
# One value estimate per state, initialised to zero.
value_table = np.zeros(env.observation_space.n)
# Number of times the outer loop in the next cell runs.
no_of_iterations = 100000
# Discount factor applied to the value of the next state in the update below.
gamma = 0.1

In [21]:
# Snapshot the current value table so the sweep in the next cell can read the
# previous estimates while it writes new ones into `value_table`.
# NOTE(review): the loop body here is only the copy -- the per-state sweep lives
# in a separate cell, so it is not repeated `no_of_iterations` times.
# value_iteration() further down contains the complete loop.
for i in range(no_of_iterations):
    updated_value_table = np.copy(value_table)

In [22]:
# One sweep over all states: each state's new value is the largest expected
# return over its actions, computed from the previous estimates held in
# `updated_value_table`.
for state in range(env.observation_space.n):
    Q_value = []
    for action in range(env.action_space.n):
        # Each entry of env.P[state][action] is a 4-tuple; the first three fields
        # are used as (probability, next state, reward), the fourth is ignored.
        next_states_rewards = []
        for next_sr in env.P[state][action]:
            trans_prob, next_state, reward_prob, _ = next_sr
            next_states_rewards.append(trans_prob * (reward_prob + gamma * updated_value_table[next_state]))
        # Record the action's Q value once, after all of its transitions have been
        # accumulated, so that partial sums never take part in the max below.
        Q_value.append(np.sum(next_states_rewards))
    value_table[state] = max(Q_value)

In [None]:
# Optimal value function

In [28]:
def value_iteration(env, gamma=0.1, no_of_iterations=100000, threshold=1e-20):
    """Compute a state-value table by value iteration.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and ``P[state][action]``, an iterable of 4-tuples whose first three
            fields are used as (transition probability, next state, reward);
            the fourth field is ignored.
        gamma: Discount factor applied to the next state's value.
        no_of_iterations: Maximum number of sweeps over the state space.
        threshold: Stop as soon as the summed absolute change of the value
            table over one sweep is at most this value.

    Returns:
        Tuple ``(value_table, Q_value)``: the value table as a NumPy array with
        one entry per state, and the list of per-action Q values computed for
        the last state in the final sweep (empty if no sweep ran).
    """
    value_table = np.zeros(env.observation_space.n)
    # Bound up front so the return statement is valid even when no sweep runs.
    Q_value = []
    for i in range(no_of_iterations):
        # Every update in this sweep reads the estimates of the previous sweep.
        updated_value_table = np.copy(value_table)
        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward_prob, _ = next_sr
                    next_states_rewards.append(
                        trans_prob * (reward_prob + gamma * updated_value_table[next_state])
                    )
                # Append once per action, after all transitions are summed;
                # appending inside the transition loop would add partial sums
                # that can exceed the true Q value when a reward is negative.
                Q_value.append(np.sum(next_states_rewards))
            value_table[state] = max(Q_value)
        if np.sum(np.fabs(updated_value_table - value_table)) <= threshold:
            print('Value-iteration converged')
            break
    return (value_table, Q_value)

In [30]:
# Run value iteration: `vt` is the returned value table, `qv` the list of Q
# values returned as the second element of the tuple.
vt, qv = value_iteration(env=env)

Value-iteration converged


In [27]:
# Optimal policy function
# The purpose of this function is to extract the optimal policy from the output of the function above.
# The function above calculates the optimal values; this one computes the Q values from them and picks the action with the highest Q value in each state.

In [33]:
def extract_policy(env, value_table, gamma=0.1):
    """Derive the greedy policy for a given state-value table.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and ``P[state][action]``, an iterable of 4-tuples whose first three
            fields are used as (transition probability, next state, reward);
            the fourth field is ignored.
        value_table: Indexable by state; holds the value of each state.
        gamma: Discount factor applied to the next state's value.

    Returns:
        NumPy float array with one entry per state: the index of the action
        with the highest Q value in that state (first one on ties).
    """
    policy = np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        # One Q entry per action (not per state), so argmax can only ever
        # return a valid action index.
        Q_table = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward_prob, _ = next_sr
                Q_table[action] += trans_prob * (reward_prob + gamma * value_table[next_state])
        policy[state] = np.argmax(Q_table)
    return policy
        
# Extract the policy from `vt`, the converged table returned by
# value_iteration(), rather than from the scratch global `value_table` built in
# the step-by-step cells.
the_policy = extract_policy(env=env, value_table=vt)


In [34]:
# Display the extracted policy (one action index per state).
the_policy

array([1., 3., 2., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])

In [None]:
# After extracting the policy from the Q-value table,
# how do we check that the policy is actually good?
# We need to do policy iteration.

In [43]:
def policy_iteration(env):
    """Re-run value iteration and policy extraction until the policy repeats.

    Each round calls ``value_iteration(env)`` and ``extract_policy(env, ...)``
    with their default settings and compares the resulting policy with the one
    from the previous round; the first round is compared against an all-zeros
    policy.

    NOTE(review): nothing is carried from one round into the next, so as far as
    this file shows every round recomputes the same policy and the loop stops
    after at most two rounds. This is not the usual evaluate/improve
    policy-iteration loop; the behaviour is kept as is so results do not change.

    Args:
        env: Environment accepted by ``value_iteration`` and ``extract_policy``.

    Returns:
        The policy array produced by the last call to ``extract_policy``.
    """
    # All-zeros starting point for the comparison (not a random policy).
    previous_policy = np.zeros(env.observation_space.n)
    no_of_iterations = 20000
    for i in range(no_of_iterations):
        # Only the value table is needed; the second return value is discarded.
        new_val_table = value_iteration(env=env)[0]
        new_policy = extract_policy(env=env, value_table=new_val_table)
        # Converged when this round's policy equals the previous round's.
        if np.all(previous_policy == new_policy):
            print("Policy iteration converged")
            break
        previous_policy = new_policy
    return new_policy

# Run policy_iteration() on the environment and keep the returned policy.
output = policy_iteration(env=env)

Value-iteration converged
Value-iteration converged
Policy iteration converged


In [44]:
# Display the policy returned by policy_iteration().
output 

array([1., 3., 2., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])