In [5]:
import numpy as np
import gym

def _action_values(transitions, V, state, n_actions, gamma):
    """Return the array of Q(state, a) for every action under the estimate V."""
    q_values = np.zeros(n_actions)
    for a in range(n_actions):
        for prob, next_state, reward, done in transitions[state][a]:
            # A transition flagged `done` ends the episode, so only its
            # immediate reward counts; nothing is bootstrapped from next_state.
            future = 0.0 if done else gamma * V[next_state]
            q_values[a] += prob * (reward + future)
    return q_values


def _action_symbols(n_actions):
    """Return one display symbol per action (arrows when there are at most 4)."""
    arrows = ['<', 'v', '>', '^']  # Left, Down, Right, Up
    if n_actions <= len(arrows):
        return arrows
    # More actions than arrows: fall back to the action index as its symbol.
    return [str(a) for a in range(n_actions)]


def _blank_policy_display(n_states, symbols):
    """Return a '-'-filled array used to print the policy.

    The array is a square grid when n_states is a perfect square and a flat
    row otherwise, so non-square state spaces can still be displayed.
    """
    side = int(np.sqrt(n_states))
    shape = (side, side) if side * side == n_states else (n_states,)
    width = max(len(symbol) for symbol in symbols)
    return np.full(shape, '-', dtype=f'<U{width}')


def value_iteration(env, gamma=0.99, theta=1e-8):
    """
    Perform Value Iteration to find the optimal policy.

    The state values are updated in place (each sweep immediately reuses the
    values it has already refreshed) until the largest change in a sweep
    drops below theta. After every sweep, and once more at the end, the
    current greedy policy is printed.

    Parameters:
    env: Environment exposing `observation_space.n`, `action_space.n` and a
         transition table `P` where `P[s][a]` is an iterable of
         `(prob, next_state, reward, done)` tuples. If the environment has
         an `unwrapped` attribute, the table is read from it, because a
         wrapper is not guaranteed to forward `P`.
    gamma: Discount factor.
    theta: Threshold for convergence.

    Returns:
    policy: Integer array with the greedy action for every state.
    V: Optimal state-value function.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = getattr(env, "unwrapped", env).P

    # Initialize state-value function to zeros
    V = np.zeros(n_states)

    symbols = _action_symbols(n_actions)

    while True:
        delta = 0
        policy_matrix = _blank_policy_display(n_states, symbols)

        for s in range(n_states):
            q_values = _action_values(transitions, V, s, n_actions, gamma)

            max_value = np.max(q_values)
            delta = max(delta, abs(max_value - V[s]))
            V[s] = max_value

            # Only states that already have a positive value get a symbol;
            # everything else stays '-'. `.flat` indexes row-major, which
            # matches (s // side, s % side) on the square grid.
            if max_value > 0:
                policy_matrix.flat[s] = symbols[np.argmax(q_values)]

        print("\nPolicy directions after iteration:")
        print(policy_matrix)

        if delta < theta:
            break

    # Extract policy
    policy = np.zeros(n_states, dtype=int)
    policy_matrix = _blank_policy_display(n_states, symbols)

    for s in range(n_states):
        q_values = _action_values(transitions, V, s, n_actions, gamma)

        # Unlike the per-sweep display, every state gets a symbol here; when
        # all action values tie, argmax picks the lowest action index.
        policy[s] = np.argmax(q_values)
        policy_matrix.flat[s] = symbols[policy[s]]

    print("\nFinal Optimal Policy Directions:")
    print(policy_matrix)

    return policy, V

# Build the slippery Frozen Lake environment
env = gym.make("FrozenLake-v1", is_slippery=True)

# Solve it with Value Iteration
optimal_policy, optimal_values = value_iteration(env)

# Report the greedy policy first, then the converged state values
for heading, result in (
    ("\nOptimal Policy:", optimal_policy),
    ("\nOptimal State-Value Function:", optimal_values),
):
    print(heading)
    print(result)



Policy directions after iteration:
[['-' '-' '-' '-']
 ['-' '-' '-' '-']
 ['-' '-' '-' '-']
 ['-' '-' 'v' '-']]

Policy directions after iteration:
[['-' '-' '-' '-']
 ['-' '-' '-' '-']
 ['-' '-' '<' '-']
 ['-' 'v' 'v' '-']]

Policy directions after iteration:
[['-' '-' '-' '-']
 ['-' '-' '<' '-']
 ['-' 'v' '<' '-']
 ['-' '>' 'v' '-']]

Policy directions after iteration:
[['-' '-' '<' '<']
 ['-' '-' '<' '-']
 ['v' 'v' '<' '-']
 ['-' '>' 'v' '-']]

Policy directions after iteration:
[['-' 'v' '<' '^']
 ['<' '-' '<' '-']
 ['^' 'v' '<' '-']
 ['-' '>' 'v' '-']]

Policy directions after iteration:
[['v' '^' '>' '^']
 ['<' '-' '<' '-']
 ['^' 'v' '<' '-']
 ['-' '>' 'v' '-']]

Policy directions after iteration:
[['v' '^' '<' '^']
 ['<' '-' '<' '-']
 ['^' 'v' '<' '-']
 ['-' '>' 'v' '-']]

Policy directions after iteration:
[['v' '^' '<' '^']
 ['<' '-' '<' '-']
 ['^' 'v' '<' '-']
 ['-' '>' 'v' '-']]

Policy directions after iteration:
[['>' '^' '<' '^']
 ['<' '-' '<' '-']
 ['^' 'v' '<' '-']
 ['