In [2]:
import numpy as np
import gymnasium as gym

In [3]:
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

In [33]:
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode= "ansi", desc=generate_random_map(size=8))

In [34]:
# Initialize variables
max_iterations = 10000  # Max number of iterations
gamma = 0.99  # Discount factor
theta = 1e-6  # Convergence threshold
num_states = env.observation_space.n
num_actions = env.action_space.n
p_kernel = env.env.env.env.P
V = np.zeros(num_states)  # Value function

In [35]:
print(env.env.env.env.__dict__)

{'desc': array([[b'S', b'F', b'F', b'F', b'F', b'F', b'H', b'F'],
       [b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F'],
       [b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F'],
       [b'F', b'F', b'H', b'H', b'F', b'H', b'F', b'F'],
       [b'F', b'F', b'F', b'F', b'H', b'F', b'F', b'H'],
       [b'H', b'F', b'F', b'F', b'H', b'H', b'F', b'F'],
       [b'H', b'H', b'F', b'H', b'F', b'H', b'F', b'F'],
       [b'F', b'H', b'H', b'F', b'F', b'H', b'F', b'G']], dtype='|S1'), 'nrow': 8, 'ncol': 8, 'reward_range': (0, 1), 'initial_state_distrib': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'P': {0: {0: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 8, 0.0, False)], 1: [(0.3333333333333333, 0, 0.0, Fals

In [36]:
def value_iteration():
    """Write value iteration function here."""
    global V
    
    for _ in range(max_iterations):
        delta = 0
        new_V = np.zeros(num_states)
        for s in range(num_states):
            # Q(s, a) = Expected reward + Expected future reward for taking action a in state s
            # Right now, we are in state s, so q_values = [Q(s, 0), Q(s, 1), ..., Q(s, num_actions)]
            q_values = np.zeros(num_actions)
            for a in range(num_actions):
                for prob, next_state, reward, done in p_kernel[s][a]:
                    q_values[a] += prob * (reward + gamma * V[next_state])
            new_V[s] = np.max(q_values)
            delta = max(delta, np.abs(new_V[s] - V[s]))
        
        V = new_V
        
        if delta < theta:
            print(delta)
            break

In [37]:
value_iteration()

9.753822059754835e-07


In [38]:
def extract_policy():
    """Write code to extract the optimal policy from the value function."""
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        q_values = np.zeros(num_actions)
        for a in range(num_actions):
            for prob, next_state, reward, done in p_kernel[s][a]:
                q_values[a] += prob * (reward + gamma * V[next_state])
        policy[s] = np.argmax(q_values)
    
    return policy

In [39]:
policy = extract_policy()
print("Optimal Policy:")
print(policy.reshape((8, 8)))


Optimal Policy:
[[3 2 2 2 2 0 0 2]
 [3 2 2 2 2 2 1 1]
 [3 3 3 3 3 3 2 1]
 [3 0 0 0 0 0 2 3]
 [3 3 1 0 0 1 0 0]
 [0 2 3 0 0 0 2 1]
 [0 0 0 0 0 0 2 1]
 [0 0 0 0 0 0 2 0]]


In [40]:
env.reset()
print(env.render())


[41mS[0mFFFFFHF
FFFFFFFF
FFFFFFFF
FFHHFHFF
FFFFHFFH
HFFFHHFF
HHFHFHFF
FHHFFHFG



In [41]:
V.reshape((8, 8)).round(3)

array([[0.187, 0.193, 0.2  , 0.208, 0.214, 0.218, 0.   , 0.239],
       [0.187, 0.192, 0.199, 0.207, 0.217, 0.228, 0.242, 0.246],
       [0.185, 0.189, 0.195, 0.204, 0.215, 0.231, 0.258, 0.258],
       [0.169, 0.158, 0.   , 0.   , 0.071, 0.   , 0.282, 0.266],
       [0.143, 0.121, 0.066, 0.03 , 0.   , 0.109, 0.331, 0.   ],
       [0.   , 0.056, 0.049, 0.026, 0.   , 0.   , 0.612, 0.737],
       [0.   , 0.   , 0.016, 0.   , 0.   , 0.   , 0.786, 0.884],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.884, 0.   ]])