In [None]:
#! pip install 'gymnasium[FrozenLake-v1]'
#!pip install gymnasium --upgrade
#! gymnasium install frozenlake-v1
!pip install pygame
!pip install gymnasium

In [3]:
# Import libraries,
import random
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import IPython
from IPython import display
import time

# MonteCarlo is used in the policy_evaluation function to estimate 
#the state value function V for a given policy. It performs iterative updates based on the 
#Bellman equation, updating the value estimates for each state based on the estimated values of 
#its successor states.
def policy_evaluation(env, policy, num_episodes, max_episode_length, GAMMA):
    num_states = env.observation_space.n
    print('----policy_evaluation.num_states----', num_states)
    V = np.zeros(num_states)
    print('----policy_evaluation.V----',V)
    for episode in range(num_episodes):
        state = env.reset()
        episode_rewards = []

        for t in range(max_episode_length):
            if isinstance(state, tuple):
                state = state[0]
            action = policy[state]
            next_state, reward, terminated, truncated, info = env.step(action)
            episode_rewards.append(reward)

            if terminated:
                break

            state = next_state

        G = 0
        for t in reversed(range(len(episode_rewards))):
            reward = episode_rewards[t]
            G = GAMMA * G + reward
            V[state] = (V[state] * t + G) / (t + 1)
    print('----policy_evaluation.V----',V)
    return V

#Monte Carlo methods are used in the policy_improvement function to estimate the action 
#value function Q(s, a) for each state-action pair. It interacts with the environment multiple times, 
#collecting episodes of experience, and then estimates the Q-values based on the cumulative rewards 
#obtained for each action in each state.

def policy_improvement(env, V, num_episodes, max_episode_length):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    policy = np.zeros(num_states, dtype=int)

    for state in range(num_states):
        q_values = np.zeros(num_actions)

        for action in range(num_actions):
            episode_rewards = []

            for _ in range(num_episodes):
                current_state = env.reset()

                for _ in range(max_episode_length):
#                     print('----current_state---',current_state)
                    if isinstance(current_state, tuple):
                        current_state = current_state[0]
                    if current_state == state:
                        next_state, reward, terminated, truncated, info = env.step(action)
                        episode_rewards.append(reward)
                        current_state = next_state
                        if terminated:
                            break
                    else:
                        _, _, terminated, _, _ = env.step(0)
                        if terminated:
                            break
            q_values[action] = sum(episode_rewards)

        policy[state] = np.argmax(q_values)

    return policy

def animate_gameplay(env, policy):
    state = env.reset()
    episode_reward = 0
    
    plt.figure()
    img = plt.imshow(env.render())
    plt.axis('off')
    
    while True:
        if isinstance(state, tuple):
            state = state[0]
        action = policy[state]
        next_state, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
        img.set_data(env.render())
        IPython.display.display(plt.gcf())
        IPython.display.clear_output(wait=True)
        time.sleep(0.9)  # Add a small delay between frames
        
        state = next_state
    
    plt.close()
    print("Episode Reward:", episode_reward)
    
def main():
    # Create the environment
    env = gym.make('FrozenLake-v1', render_mode='rgb_array')

    # Initialize the value function and policy
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)
    
    policy = np.random.randint(0, num_actions, size=num_states)
    print('-----main.policy----',policy)
    # Define the number of episodes and maximum episode length
    num_episodes = 1000
    max_episode_length = 10
    GAMMA = 0.9
    
    # Initialize the policy randomly
    num_states = env.observation_space.n
    policy = np.random.randint(0, env.action_space.n, size=num_states)

    # Policy Evaluation
    V = policy_evaluation(env, policy, num_episodes, max_episode_length, GAMMA)

    # Policy Improvement
    policy = policy_improvement(env, V, num_episodes, max_episode_length)
   
    # Print the final value function and policy
    # The output of value function resemble the frozen lake 
    print("Value Function:")
    print(V)
    print("Policy:")
    print(policy)
    
    #animate_gameplay(env, policy)
   

In [4]:
if __name__ == "__main__":
  main()

-----main.policy---- [3 3 0 0 2 0 2 1 1 0 1 3 1 2 0 0]
----policy_evaluation.num_states---- 16
----policy_evaluation.V---- [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
----policy_evaluation.V---- [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Value Function:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Policy:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
