# Find the optimal path for our agent in the FrozenLake problem

# Value Iteration

In [1]:
import gym
import numpy as np

# Initialising gym environment

In [2]:
# Create the FrozenLake environment. `env` is kept as a module-level name
# because extract_policy() below reads it as a global.
# NOTE(review): newer Gym releases appear to register this environment as
# "FrozenLake-v1" — confirm against the installed Gym version.
env = gym.make("FrozenLake-v0")

In [3]:
# Print the grid; in the output below S/F/H/G mark the map tiles.
env.render()


SFFF
FHFH
FFFH
HFFG


# Value Iteration Function

In [4]:
def value_iteration(env, gamma=1.0, max_iterations=10000, threshold=1e-20):
    """Compute state values for ``env`` with synchronous value iteration.

    Each sweep backs up every state from a frozen copy of the previous
    sweep's values, taking the maximum expected return over all actions.
    Sweeping stops early once the summed absolute change across all states
    is at most ``threshold``; a message with the sweep count is printed in
    that case.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition model ``P`` where
            ``P[state][action]`` is an iterable of 4-tuples
            ``(probability, next_state, reward, _)``.  The fourth element
            is ignored here.
        gamma: Discount factor applied to the successor state's value.
        max_iterations: Upper bound on the number of sweeps.
        threshold: Convergence tolerance on the summed absolute change of
            the value table between two consecutive sweeps.

    Returns:
        A NumPy float array with one value per state.  If convergence is
        not reached within ``max_iterations`` sweeps, the values from the
        last sweep are returned without any message.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    value_table = np.zeros(n_states)

    for sweep in range(max_iterations):
        # Freeze last sweep's values so every backup in this sweep reads
        # the same table (synchronous update).
        previous_values = np.copy(value_table)

        for state in range(n_states):
            q_values = []
            for action in range(n_actions):
                # One-step look-ahead: expected reward plus discounted
                # value of the successor, weighted by transition probability.
                expected_returns = [
                    trans_prob * (reward + gamma * previous_values[next_state])
                    for trans_prob, next_state, reward, _ in env.P[state][action]
                ]
                # np.sum is kept (rather than the builtin) so floating-point
                # results match the original implementation.
                q_values.append(np.sum(expected_returns))

            value_table[state] = max(q_values)

        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            print("value Iteration converged at iterations %d." %(sweep+1))
            break

    return value_table


In [9]:
# Run value iteration with the default discount factor (gamma=1.0).
optimal_value_function=value_iteration(env)

value Iteration converged at iterations 1373.


In [10]:
# Display the resulting value table (one entry per state).
optimal_value_function

array([0.82352941, 0.82352941, 0.82352941, 0.82352941, 0.82352941,
       0.        , 0.52941176, 0.        , 0.82352941, 0.82352941,
       0.76470588, 0.        , 0.        , 0.88235294, 0.94117647,
       0.        ])

# Extracting the policy

In [5]:
def extract_policy(value_table, gamma=1.0, environment=None):
    """Return the greedy policy implied by a state-value table.

    For every state, the expected return of each action is computed with a
    one-step look-ahead through the transition model ``P[state][action]``,
    and the action with the highest expected return is selected.

    Args:
        value_table: State values, indexable by state number.
        gamma: Discount factor applied to the successor state's value.
        environment: Environment exposing ``observation_space.n``,
            ``action_space.n`` and ``P`` (``P[state][action]`` is an
            iterable of ``(probability, next_state, reward, _)`` tuples).
            When omitted, the module-level ``env`` is used, which is what
            the original two-argument signature relied on.

    Returns:
        A NumPy float array with one entry per state holding the index of
        the selected action.  Ties go to the lowest action index, as
        ``np.argmax`` returns the first maximum.
    """
    if environment is None:
        # Backward-compatible fallback to the notebook's global environment.
        environment = env

    n_states = environment.observation_space.n
    n_actions = environment.action_space.n
    policy = np.zeros(n_states)

    for state in range(n_states):
        q_values = np.zeros(n_actions)

        # Accumulate the expected return of every action in this state.
        for action in range(n_actions):
            for trans_prob, next_state, reward, _ in environment.P[state][action]:
                q_values[action] += trans_prob * (reward + gamma * value_table[next_state])

        # Choose the best action once per state, after all Q-values are
        # complete.  With no actions the entry stays 0, as before.
        if n_actions:
            policy[state] = np.argmax(q_values)

    return policy

In [6]:
# Recompute the value table, this time passing the discount factor explicitly.
optimal_value_function = value_iteration(env = env, gamma = 1.0)

value Iteration converged at iterations 1373.


In [7]:
# Derive the greedy policy from the value table, using the same gamma.
optimal_policy = extract_policy(optimal_value_function, gamma = 1.0)

In [8]:
# Display the policy: one action index per state.
optimal_policy

array([0., 3., 3., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])