In [8]:
import gymnasium as gym
import numpy as np
import time

def policy_iteration(env, discount_factor=0.99, theta=1e-8, max_iterations=1000):
    """Alternate in-place policy evaluation and greedy improvement on a tabular model.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and
            ``unwrapped.P[state][action]``, an iterable of 4-tuples whose first
            three entries are read as (probability, next_state, reward); the
            fourth entry is ignored.
        discount_factor: Weight applied to the successor state's value in each backup.
        theta: An evaluation pass ends once the largest per-state change in a
            sweep drops below this threshold.
        max_iterations: Maximum number of evaluate/improve rounds.

    Returns:
        Tuple ``(policy, V)``: an integer NumPy array holding one action per
        state and a float NumPy array holding one value per state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.unwrapped.P

    def backup(state, action, values):
        # Expected one-step return of taking `action` in `state` under `values`.
        return sum(
            prob * (reward + discount_factor * values[successor])
            for prob, successor, reward, _ in transitions[state][action]
        )

    def policy_evaluation(policy, V, theta=1e-8):
        # Sweeps update V in place, so later states in a sweep already see the
        # refreshed values of earlier states.
        converged = False
        while not converged:
            largest_change = 0
            for state in range(n_states):
                previous = V[state]
                V[state] = backup(state, policy[state], V)
                largest_change = max(largest_change, abs(previous - V[state]))
            converged = largest_change < theta
        return V

    def policy_improvement(policy, V):
        changed = False
        for state in range(n_states):
            previous_action = policy[state]
            action_values = [backup(state, action, V) for action in range(n_actions)]
            # Ties resolve to the lowest-numbered action.
            policy[state] = action_values.index(max(action_values))
            if policy[state] != previous_action:
                changed = True
        return policy, not changed

    policy = np.zeros(n_states, dtype=int)
    V = np.zeros(n_states)

    for _ in range(max_iterations):
        V = policy_evaluation(policy, V, theta)
        policy, policy_stable = policy_improvement(policy, V)
        if policy_stable:
            break

    return policy, V

def value_iteration(env, discount_factor=0.99, theta=1e-8, max_iterations=1000):
    """Run in-place value-iteration sweeps, then read off a greedy policy.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and
            ``unwrapped.P[state][action]``, an iterable of 4-tuples whose first
            three entries are read as (probability, next_state, reward); the
            fourth entry is ignored.
        discount_factor: Weight applied to the successor state's value in each backup.
        theta: Sweeping stops once the largest per-state change in a sweep is
            below this threshold.
        max_iterations: Maximum number of sweeps over the state space.

    Returns:
        Tuple ``(policy, V)``: an integer NumPy array holding one action per
        state and a float NumPy array holding one value per state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.unwrapped.P

    def action_values(state, values):
        # Expected one-step return of every action available in `state`.
        return [
            sum(
                prob * (reward + discount_factor * values[successor])
                for prob, successor, reward, _ in transitions[state][action]
            )
            for action in range(n_actions)
        ]

    V = np.zeros(n_states)
    for _ in range(max_iterations):
        largest_change = 0
        for state in range(n_states):
            previous = V[state]
            # Written straight back into V so later states in the same sweep
            # already use the updated value.
            V[state] = max(action_values(state, V))
            largest_change = max(largest_change, abs(previous - V[state]))
        if largest_change < theta:
            break

    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        candidates = action_values(state, V)
        # Ties resolve to the lowest-numbered action.
        policy[state] = candidates.index(max(candidates))
    return policy, V

def run_episodes(env, policy, num_episodes=1000, seed=None):
    """Roll out a fixed policy and count the episodes whose last reward is 1.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Indexable mapping from state to action.
        num_episodes: Number of episodes to simulate.
        seed: Optional seed forwarded to the first ``env.reset`` call so that
            the rollouts are reproducible. When ``None`` (the default) the
            environment is reset without arguments, exactly as before.

    Returns:
        Tuple ``(wins, average_reward)``: the number of episodes whose final
        step returned a reward equal to 1, and the total reward divided by
        ``num_episodes``. Returns ``(0, 0.0)`` when ``num_episodes`` is not
        positive instead of dividing by zero.
    """
    if num_episodes <= 0:
        return 0, 0.0

    wins = 0
    total_reward = 0
    for episode in range(num_episodes):
        # Only the first reset is seeded; later resets continue the same
        # random stream rather than replaying one identical episode.
        if episode == 0 and seed is not None:
            state, _ = env.reset(seed=seed)
        else:
            state, _ = env.reset()
        done = False
        while not done:
            action = policy[state]
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
        # A win is judged from the reward of the episode's final step.
        if reward == 1:
            wins += 1
    return wins, total_reward / num_episodes

def compare_methods(env_name, discount_factor=0.99, theta=1e-8, max_iterations=1000, num_episodes=1000, seed=None):
    """Solve an environment with policy iteration and value iteration and print a comparison.

    Each solver is timed on its own (rollouts are not included in the timing),
    its resulting policy is rolled out for ``num_episodes`` episodes, and the
    win counts, average rewards and solve times are printed together with a
    verdict line for each of the three metrics.

    Args:
        env_name: Environment id handed to ``gym.make``.
        discount_factor: Forwarded to both solvers.
        theta: Convergence threshold forwarded to both solvers.
        max_iterations: Iteration cap forwarded to both solvers.
        num_episodes: Number of evaluation episodes per policy.
        seed: Optional seed. When given, the environment is re-seeded before
            each policy's rollouts so both policies are evaluated on the same
            random stream and differences are not just sampling noise. When
            ``None`` (the default) no seeding is done.
    """
    env = gym.make(env_name)

    def timed_solve(solver):
        # perf_counter is monotonic and high resolution, unlike wall-clock time.
        started = time.perf_counter()
        policy, _ = solver(env, discount_factor, theta, max_iterations)
        return policy, time.perf_counter() - started

    def evaluate(policy):
        if seed is not None:
            # Re-seed so every policy starts from the same random stream.
            env.reset(seed=seed)
        return run_episodes(env, policy, num_episodes)

    try:
        print("Running Policy Iteration...")
        pi_policy, pi_time = timed_solve(policy_iteration)
        pi_wins, pi_avg_reward = evaluate(pi_policy)

        print("Running Value Iteration...")
        vi_policy, vi_time = timed_solve(value_iteration)
        vi_wins, vi_avg_reward = evaluate(vi_policy)
    finally:
        # Release the environment even if a solver or rollout fails.
        env.close()

    print("\nResults:")
    print(f"Policy Iteration - Wins: {pi_wins}/{num_episodes}, Avg Reward: {pi_avg_reward:.4f}, Time: {pi_time:.4f}s")
    print(f"Value Iteration  - Wins: {vi_wins}/{num_episodes}, Avg Reward: {vi_avg_reward:.4f}, Time: {vi_time:.4f}s")

    if pi_wins > vi_wins:
        print("\nPolicy Iteration performed better in terms of wins.")
    elif vi_wins > pi_wins:
        print("\nValue Iteration performed better in terms of wins.")
    else:
        print("\nBoth methods performed equally in terms of wins.")

    if pi_avg_reward > vi_avg_reward:
        print("Policy Iteration achieved higher average reward.")
    elif vi_avg_reward > pi_avg_reward:
        print("Value Iteration achieved higher average reward.")
    else:
        print("Both methods achieved equal average reward.")

    # For time, the smaller value is the better one.
    if pi_time < vi_time:
        print("Policy Iteration was faster.")
    elif vi_time < pi_time:
        print("Value Iteration was faster.")
    else:
        print("Both methods took equal time.")

if __name__ == "__main__":
    # Run the comparison on FrozenLake with explicit solver and rollout settings.
    compare_methods(
        "FrozenLake-v1",
        discount_factor=0.99,
        theta=1e-8,
        max_iterations=1000,
        num_episodes=1000,
    )


Running Policy Iteration...
Running Value Iteration...

Results:
Policy Iteration - Wins: 726/1000, Avg Reward: 0.7260, Time: 0.1034s
Value Iteration  - Wins: 744/1000, Avg Reward: 0.7440, Time: 0.1195s

Value Iteration performed better in terms of wins.
Value Iteration achieved higher average reward.
Policy Iteration was faster.


In [None]:
import gymnasium as gym
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

def policy_iteration(env, discount_factor=0.99, theta=1e-8, max_iterations=1000):
    """Alternate in-place policy evaluation and greedy improvement on a tabular model.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and
            ``unwrapped.P[state][action]``, an iterable of 4-tuples whose first
            three entries are read as (probability, next_state, reward); the
            fourth entry is ignored.
        discount_factor: Weight applied to the successor state's value in each backup.
        theta: An evaluation pass ends once the largest per-state change in a
            sweep drops below this threshold.
        max_iterations: Maximum number of evaluate/improve rounds.

    Returns:
        Tuple ``(policy, V)``: an integer NumPy array holding one action per
        state and a float NumPy array holding one value per state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.unwrapped.P

    def backup(state, action, values):
        # Expected one-step return of taking `action` in `state` under `values`.
        return sum(
            prob * (reward + discount_factor * values[successor])
            for prob, successor, reward, _ in transitions[state][action]
        )

    def policy_evaluation(policy, V, theta=1e-8):
        # Sweeps update V in place, so later states in a sweep already see the
        # refreshed values of earlier states.
        converged = False
        while not converged:
            largest_change = 0
            for state in range(n_states):
                previous = V[state]
                V[state] = backup(state, policy[state], V)
                largest_change = max(largest_change, abs(previous - V[state]))
            converged = largest_change < theta
        return V

    def policy_improvement(policy, V):
        changed = False
        for state in range(n_states):
            previous_action = policy[state]
            action_values = [backup(state, action, V) for action in range(n_actions)]
            # Ties resolve to the lowest-numbered action.
            policy[state] = action_values.index(max(action_values))
            if policy[state] != previous_action:
                changed = True
        return policy, not changed

    policy = np.zeros(n_states, dtype=int)
    V = np.zeros(n_states)

    for _ in range(max_iterations):
        V = policy_evaluation(policy, V, theta)
        policy, policy_stable = policy_improvement(policy, V)
        if policy_stable:
            break

    return policy, V

def value_iteration(env, discount_factor=0.99, theta=1e-8, max_iterations=1000):
    """Run in-place value-iteration sweeps, then read off a greedy policy.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and
            ``unwrapped.P[state][action]``, an iterable of 4-tuples whose first
            three entries are read as (probability, next_state, reward); the
            fourth entry is ignored.
        discount_factor: Weight applied to the successor state's value in each backup.
        theta: Sweeping stops once the largest per-state change in a sweep is
            below this threshold.
        max_iterations: Maximum number of sweeps over the state space.

    Returns:
        Tuple ``(policy, V)``: an integer NumPy array holding one action per
        state and a float NumPy array holding one value per state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.unwrapped.P

    def action_values(state, values):
        # Expected one-step return of every action available in `state`.
        return [
            sum(
                prob * (reward + discount_factor * values[successor])
                for prob, successor, reward, _ in transitions[state][action]
            )
            for action in range(n_actions)
        ]

    V = np.zeros(n_states)
    for _ in range(max_iterations):
        largest_change = 0
        for state in range(n_states):
            previous = V[state]
            # Written straight back into V so later states in the same sweep
            # already use the updated value.
            V[state] = max(action_values(state, V))
            largest_change = max(largest_change, abs(previous - V[state]))
        if largest_change < theta:
            break

    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        candidates = action_values(state, V)
        # Ties resolve to the lowest-numbered action.
        policy[state] = candidates.index(max(candidates))
    return policy, V

def run_episodes(env, policy, num_episodes=1000, seed=None):
    """Roll out a fixed policy and count the episodes whose last reward is 1.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Indexable mapping from state to action.
        num_episodes: Number of episodes to simulate.
        seed: Optional seed forwarded to the first ``env.reset`` call so that
            the rollouts are reproducible. When ``None`` (the default) the
            environment is reset without arguments, exactly as before.

    Returns:
        Tuple ``(wins, average_reward)``: the number of episodes whose final
        step returned a reward equal to 1, and the total reward divided by
        ``num_episodes``. Returns ``(0, 0.0)`` when ``num_episodes`` is not
        positive instead of dividing by zero.
    """
    if num_episodes <= 0:
        return 0, 0.0

    wins = 0
    total_reward = 0
    for episode in range(num_episodes):
        # Only the first reset is seeded; later resets continue the same
        # random stream rather than replaying one identical episode.
        if episode == 0 and seed is not None:
            state, _ = env.reset(seed=seed)
        else:
            state, _ = env.reset()
        done = False
        while not done:
            action = policy[state]
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
        # A win is judged from the reward of the episode's final step.
        if reward == 1:
            wins += 1
    return wins, total_reward / num_episodes

def plot_value_function(V, title, shape=(4, 4)):
    """Show a state-value array as an annotated heatmap.

    Args:
        V: Array of state values; it is reshaped to ``shape`` before plotting.
        title: Text placed above the heatmap.
        shape: Grid dimensions used for the reshape.
    """
    grid = V.reshape(shape)
    plt.figure(figsize=(6, 6))
    # heatmap returns the Axes it drew on; set the title on that handle.
    heatmap_ax = sns.heatmap(
        grid,
        annot=True,
        cmap='coolwarm',
        linewidths=0.5,
        fmt=".2f",
    )
    heatmap_ax.set_title(title)
    plt.show()

def compare_methods(env_name, discount_factor=0.99, theta=1e-8, max_iterations=1000, num_episodes=1000, seed=None):
    """Solve an environment with policy iteration and value iteration, then plot and print a comparison.

    Each solver is timed on its own (rollouts are not included in the timing)
    and its resulting policy is rolled out for ``num_episodes`` episodes. Both
    value functions are drawn as heatmaps, the three metrics are drawn as bar
    charts, and a textual summary with one verdict line per metric is printed.

    Args:
        env_name: Environment id handed to ``gym.make``.
        discount_factor: Forwarded to both solvers.
        theta: Convergence threshold forwarded to both solvers.
        max_iterations: Iteration cap forwarded to both solvers.
        num_episodes: Number of evaluation episodes per policy.
        seed: Optional seed. When given, the environment is re-seeded before
            each policy's rollouts so both policies are evaluated on the same
            random stream and differences are not just sampling noise. When
            ``None`` (the default) no seeding is done.
    """
    env = gym.make(env_name)

    def timed_solve(solver):
        # perf_counter is monotonic and high resolution, unlike wall-clock time.
        started = time.perf_counter()
        policy, values = solver(env, discount_factor, theta, max_iterations)
        return policy, values, time.perf_counter() - started

    def evaluate(policy):
        if seed is not None:
            # Re-seed so every policy starts from the same random stream.
            env.reset(seed=seed)
        return run_episodes(env, policy, num_episodes)

    try:
        print("Running Policy Iteration...")
        pi_policy, pi_V, pi_time = timed_solve(policy_iteration)
        pi_wins, pi_avg_reward = evaluate(pi_policy)

        print("Running Value Iteration...")
        vi_policy, vi_V, vi_time = timed_solve(value_iteration)
        vi_wins, vi_avg_reward = evaluate(vi_policy)
    finally:
        # Release the environment even if a solver or rollout fails.
        env.close()

    # Plot value functions
    plot_value_function(pi_V, "Policy Iteration Value Function")
    plot_value_function(vi_V, "Value Iteration Value Function")

    # Bar chart comparison: wins, average reward and seconds differ by orders
    # of magnitude, so each metric gets its own y-axis.
    metrics = [
        ("Wins", pi_wins, vi_wins),
        ("Avg Reward", pi_avg_reward, vi_avg_reward),
        ("Time Taken", pi_time, vi_time),
    ]
    fig, axes = plt.subplots(1, len(metrics), figsize=(12, 4))
    for ax, (metric, pi_value, vi_value) in zip(axes, metrics):
        ax.bar(
            ["Policy Iteration", "Value Iteration"],
            [pi_value, vi_value],
            color=["blue", "red"],
        )
        ax.set_title(metric)
        ax.set_ylabel("Performance")
    fig.suptitle("Comparison of Policy Iteration vs Value Iteration")
    fig.tight_layout()
    plt.show()

    # Print results
    print("\nResults:")
    print(f"Policy Iteration - Wins: {pi_wins}/{num_episodes}, Avg Reward: {pi_avg_reward:.4f}, Time: {pi_time:.4f}s")
    print(f"Value Iteration  - Wins: {vi_wins}/{num_episodes}, Avg Reward: {vi_avg_reward:.4f}, Time: {vi_time:.4f}s")

    if pi_wins > vi_wins:
        print("\nPolicy Iteration performed better in terms of wins.")
    elif vi_wins > pi_wins:
        print("\nValue Iteration performed better in terms of wins.")
    else:
        print("\nBoth methods performed equally in terms of wins.")

    if pi_avg_reward > vi_avg_reward:
        print("Policy Iteration achieved higher average reward.")
    elif vi_avg_reward > pi_avg_reward:
        print("Value Iteration achieved higher average reward.")
    else:
        print("Both methods achieved equal average reward.")

    # For time, the smaller value is the better one.
    if pi_time < vi_time:
        print("Policy Iteration was faster.")
    elif vi_time < pi_time:
        print("Value Iteration was faster.")
    else:
        print("Both methods took equal time.")

if __name__ == "__main__":
    # Run the comparison on FrozenLake with explicit solver and rollout settings.
    compare_methods(
        "FrozenLake-v1",
        discount_factor=0.99,
        theta=1e-8,
        max_iterations=1000,
        num_episodes=1000,
    )


Running Policy Iteration...
