In [37]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from gymnasium.envs.registration import register
import random
import numpy as np

In [39]:
# Generate a random 20x20 FrozenLake layout used by every later cell.
# `p` is forwarded as-is; presumably it is the probability that a tile is
# frozen — confirm against the gymnasium docs.
# NOTE(review): no seed is passed here, so each run presumably produces a
# different map and the recorded outputs below cannot be reproduced — confirm
# and consider seeding.
lake_map = generate_random_map(size=20, p=0.8)

In [40]:
# Register a custom FrozenLake environment id backed by the generated map,
# then instantiate it with the "ansi" render mode.
# NOTE(review): re-running this cell registers the same id again; presumably
# gymnasium only warns about the override — confirm.
# NOTE(review): no `max_episode_steps` is passed to `register`, so episodes
# may have no step limit — confirm, since a non-terminating policy would then
# never finish an episode.
register(
    id="FrozenLake20x20Custom-v1",
    entry_point="gymnasium.envs.toy_text.frozen_lake:FrozenLakeEnv",
    kwargs={
        # Explicit layout produced by `generate_random_map` above.
        "desc": lake_map,
        # NOTE(review): with an explicit `desc`, `map_name` is presumably
        # ignored by FrozenLakeEnv — confirm.
        "map_name": "FrozenLake20x20Custom",
        # Presumably disables stochastic slipping, making moves deterministic — confirm.
        "is_slippery": False
    }
)
env = gym.make("FrozenLake20x20Custom-v1", render_mode="ansi")

In [41]:
# Read the sizes of the discrete observation and action spaces and report them.
n_states = env.observation_space.n
n_actions = env.action_space.n
print(f"States: {n_states}, Actions: {n_actions}")
# Cast the environment's stored map array to str before printing it as a grid.
desc = env.unwrapped.desc.astype(str)
print(desc)

States: 400, Actions: 4
[['S' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
  'F' 'F']
 ['F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
  'F' 'H']
 ['F' 'F' 'H' 'F' 'F' 'H' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'F'
  'F' 'F']
 ['H' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'F' 'F' 'F' 'F' 'H' 'F'
  'H' 'F']
 ['F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'F' 'F'
  'F' 'F']
 ['F' 'F' 'H' 'F' 'F' 'H' 'F' 'F' 'H' 'F' 'F' 'H' 'F' 'F' 'H' 'H' 'F' 'F'
  'F' 'F']
 ['H' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'H' 'F' 'F' 'F' 'F'
  'F' 'F']
 ['F' 'F' 'F' 'F' 'F' 'H' 'F' 'H' 'F' 'F' 'F' 'H' 'F' 'H' 'F' 'F' 'H' 'F'
  'F' 'F']
 ['F' 'F' 'H' 'H' 'H' 'F' 'F' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'F' 'F'
  'F' 'F']
 ['F' 'F' 'F' 'H' 'H' 'F' 'F' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
  'F' 'F']
 ['H' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'H' 'H'
  'F' 'F']
 ['F' 'H' 'F' 'H' 'F' 'F' 'F' 'F' 'F' 'F'

In [42]:
def value_iteration(env, gamma=0.9, theta=0.001):
    """Solve a tabular MDP with value iteration.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition model ``env.unwrapped.P`` where ``P[s][a]`` is an
            iterable of ``(prob, next_state, reward, terminated)`` tuples.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold; sweeping stops once the largest change
            of any state value within a sweep is below this number.

    Returns:
        tuple: ``(policy, V)`` where ``policy`` is an int array holding the
        greedy action for every state and ``V`` is a float array with the
        converged state values.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # The model is constant while planning, so look it up only once.
    model = env.unwrapped.P

    def action_values(s):
        """Return the one-step lookahead value of every action in state ``s``."""
        return [
            sum(
                # A transition that ends the episode has no future return, so
                # the next state's value must not be bootstrapped from.
                p * (r + (0.0 if terminated else gamma * V[s_]))
                for p, s_, r, terminated in model[s][a]
            )
            for a in range(n_actions)
        ]

    V = np.zeros(n_states)
    while True:
        delta = 0
        # In-place sweep: later states already see values updated in this sweep.
        for s in range(n_states):
            max_q = max(action_values(s))
            delta = max(delta, abs(max_q - V[s]))
            V[s] = max_q
        if delta < theta:
            break

    # Extract the greedy policy; np.argmax keeps the first action on ties.
    policy = np.zeros(n_states, dtype=int)
    for s in range(n_states):
        policy[s] = np.argmax(action_values(s))
    return policy, V

In [43]:
def policy_iteration(env, gamma=0.9, theta=0.001):
    """Solve a tabular MDP with policy iteration.

    Alternates iterative policy evaluation (until the largest value change in
    a sweep drops below ``theta``) with greedy policy improvement, stopping
    when an improvement pass changes no action.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition model ``env.unwrapped.P`` where ``P[s][a]`` is an
            iterable of ``(prob, next_state, reward, terminated)`` tuples.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold for the policy-evaluation sweeps.

    Returns:
        tuple: ``(policy, V)`` where ``policy`` is an int array with one
        action per state and ``V`` holds the values of that policy.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # The model is constant while planning, so look it up only once.
    model = env.unwrapped.P

    def backup(s, a):
        """Return the expected one-step return of taking ``a`` in ``s`` under ``V``."""
        return sum(
            # A transition that ends the episode has no future return, so the
            # next state's value must not be bootstrapped from.
            p * (r + (0.0 if terminated else gamma * V[s_]))
            for p, s_, r, terminated in model[s][a]
        )

    # Start from a random policy drawn from NumPy's global random state.
    policy = np.random.choice(n_actions, size=n_states)
    V = np.zeros(n_states)
    while True:
        # Policy evaluation: in-place sweeps, warm-started from the previous V.
        while True:
            delta = 0
            for s in range(n_states):
                v = backup(s, policy[s])
                delta = max(delta, abs(v - V[s]))
                V[s] = v
            if delta < theta:
                break

        # Policy improvement: act greedily w.r.t. V (first action wins ties).
        # NOTE(review): with values only accurate to `theta`, near-tied actions
        # could in principle keep swapping between passes — confirm whether an
        # iteration cap is wanted.
        policy_stable = True
        for s in range(n_states):
            old_action = policy[s]
            policy[s] = np.argmax([backup(s, a) for a in range(n_actions)])
            if old_action != policy[s]:
                policy_stable = False
        if policy_stable:
            break
    return policy, V


In [44]:
# Solve the environment with both planners and print each policy laid out
# like the map, one action index per tile.
# The grid shape is read from the environment's own map array instead of
# repeating the map size, so changing the size in the map-generation cell
# cannot break the reshape.
grid_shape = env.unwrapped.desc.shape

policy_vi, V_vi = value_iteration(env)
print("Value Iteration Policy:\n", policy_vi.reshape(grid_shape))

policy_pi, V_pi = policy_iteration(env)
print("Policy Iteration Policy:\n", policy_pi.reshape(grid_shape))

Value Iteration Policy:
 [[1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0]
 [1 1 2 1 1 2 1 2 1 1 1 1 1 0 2 2 2 1 0 0]
 [2 1 0 1 1 0 1 0 1 1 1 2 1 0 0 3 0 1 0 1]
 [0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1]
 [1 1 2 1 1 2 1 2 2 1 1 2 1 0 0 0 1 1 1 0]
 [2 1 0 1 1 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0]
 [0 1 2 2 2 2 1 0 1 1 1 2 1 0 1 0 2 1 1 0]
 [1 1 0 2 3 0 1 0 2 1 1 0 1 0 1 0 0 1 1 0]
 [1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0]
 [2 1 1 0 0 1 1 0 0 1 1 1 1 1 1 0 2 2 1 0]
 [0 2 1 2 1 1 1 0 1 1 2 1 1 1 1 0 0 0 1 0]
 [1 0 1 0 1 1 1 1 1 1 0 2 1 1 1 0 0 1 0 0]
 [2 1 1 1 1 1 1 1 1 1 1 0 2 1 1 1 0 1 0 0]
 [0 2 2 2 2 2 2 2 2 2 2 1 0 1 2 2 2 1 0 0]
 [2 2 2 2 2 2 2 3 0 3 0 2 1 1 0 3 0 1 0 0]
 [2 2 2 3 0 2 2 2 2 3 0 0 2 1 0 0 0 2 1 0]
 [3 0 2 3 0 0 0 3 0 3 0 0 0 2 2 2 1 0 1 0]
 [2 2 2 2 2 2 2 3 0 0 3 0 0 2 3 0 1 1 1 1]
 [0 2 2 2 2 2 2 3 0 2 3 0 0 3 0 2 2 1 1 1]
 [2 3 0 2 2 2 2 3 0 3 0 3 0 3 0 0 0 2 2 0]]
Policy Iteration Policy:
 [[1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0]
 [1 1 2 1 1 2 1 2 1 1 1 1 1 0 2 2 2 1 0 0]
 [

In [45]:
def evaluate_policy(env, policy, episodes=100, max_steps=None):
    """Roll out a tabular policy and return its mean total reward per episode.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        policy: Indexable mapping from observation to action
            (``policy[obs]``).
        episodes: Number of episodes to run.
        max_steps: Optional cap on steps per episode. ``None`` (default)
            means no cap, so the episode only ends when the environment
            reports ``terminated`` or ``truncated``.

    Returns:
        The sum of all rewards collected divided by ``episodes``.

    Raises:
        ZeroDivisionError: If ``episodes`` is 0.
    """
    total_rewards = 0
    for _episode in range(episodes):
        obs, _info = env.reset()
        steps = 0
        done = False
        while not done:
            obs, reward, terminated, truncated, _info = env.step(policy[obs])
            total_rewards += reward
            steps += 1
            # An episode ends on termination OR truncation (e.g. a time
            # limit); checking only `terminated` would loop forever on a
            # truncated episode.
            done = bool(terminated or truncated)
            if max_steps is not None and steps >= max_steps:
                done = True
    return total_rewards / episodes

# Report the mean per-episode reward of each planner's policy, using
# evaluate_policy's default of 100 episodes.
print("VI Avg Reward:", evaluate_policy(env, policy_vi))
print("PI Avg Reward:", evaluate_policy(env, policy_pi))

VI Avg Reward: 1.0
PI Avg Reward: 1.0
