## Frozen Lake

Importing the required libraries:

In [163]:
import os
import random

import gymnasium as gym
import imageio
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

We can have two sizes of environment:
- `map_name="4x4"`: a 4x4 grid version
- `map_name="8x8"`: an 8x8 grid version

The environment has two modes:
- `is_slippery=False`: The agent always moves in the intended direction due to the non-slippery nature of the frozen lake (deterministic).
- `is_slippery=True`: The agent may not always move in the intended direction due to the slippery nature of the frozen lake (stochastic).

Creating the FrozenLake-v1 environment using:
- 8x8 map.
- non-slippery version.
- render_mode = "rgb_array"

In [164]:
# Non-slippery (deterministic) 8x8 FrozenLake; render_mode="rgb_array" lets the
# training loop collect env.render() frames for the saved videos.
env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=False, render_mode="rgb_array")

You can create your own custom grid like this:<br>
`desc=["SFFF", "FHFH", "FFFH", "HFFG"]`<br>
`gym.make('FrozenLake-v1', desc=desc, is_slippery=True)`

Let's see what the environment looks like:

In [165]:
# Report the observation space and one random draw from it, one item per line.
print(
    "_____OBSERVATION SPACE_____ ",
    f"Observation Space: {env.observation_space}",
    f"Sample Observation: {env.observation_space.sample()}",  # a random observation
    sep="\n",
)

_____OBSERVATION SPACE_____ 
Observation Space: Discrete(64)
Sample Observation: 53


The observation space is `Discrete(64)`: each observation is an integer encoding the agent’s current position as `current_row * ncols + current_col` (where both the row and the column start at 0).

In [166]:
# Report the number of actions and one random draw from the action space.
print(
    " _____ACTION SPACE_____ ",
    f"Action Space Shape: {env.action_space.n}",
    f"Action Space Sample: {env.action_space.sample()}",  # a random action
    sep="\n",
)

 _____ACTION SPACE_____ 
Action Space Shape: 4
Action Space Sample: 3


The action space (the set of possible actions the agent can take) is discrete with 4 actions available:
- 0: GO LEFT
- 1: GO DOWN
- 2: GO RIGHT
- 3: GO UP

Reward function:
- Reach goal: +1
- Reach hole: 0
- Reach frozen tile: 0

#### Create and Initialize Q-Table

In [167]:
# Q-table dimensions are read from the environment's two discrete spaces.
state_space, action_space = env.observation_space.n, env.action_space.n

print(f"There are {state_space} possible states (rows)")
print(f"There are {action_space} possible actions (columns)")

There are 64 possible states (rows)
There are 4 possible actions (columns)


Create a Q-table of shape `(state_space, action_space)` with every value initialized to 0.

In [168]:
def initialize_q_table(state_space, action_space):
    """Build an all-zero Q-table.

    :param state_space: number of rows (one per state)
    :param action_space: number of columns (one per action)
    :return: NumPy array of zeros with shape (state_space, action_space)
    """
    return np.zeros(shape=(state_space, action_space))

# Fresh all-zero table: one row per state, one column per action.
Qtable_FrozenLake = initialize_q_table(state_space, action_space)

#### Defining the Greedy Policy

In [169]:
def greedy_policy(Qtable, state):
    """Pure exploitation: pick the best-valued action for `state`.

    :param Qtable: table indexed as Qtable[state] -> per-action values
    :param state: index of the current state
    :return: index of the action with the highest value in that row
             (np.argmax returns the first index when several values tie)
    """
    action_values = Qtable[state]
    return np.argmax(action_values)

#### Defining Epsilon Greedy Policy

In [170]:
def epsilon_greedy_policy(Qtable, state, epsilon):
    """Select an action with an epsilon-greedy rule.

    A number is drawn uniformly from [0, 1]. If it is greater than `epsilon`
    the greedy action for `state` is returned (exploitation); otherwise a
    random action is sampled (exploration).

    :param Qtable: table of state-action values
    :param state: index of the current state
    :param epsilon: exploration threshold compared against the uniform draw
    :return: the chosen action
    """
    explore = not (random.uniform(0, 1) > epsilon)
    if explore:
        # NOTE: the random action comes from the module-level `env`,
        # which is not an argument of this function.
        return env.action_space.sample()

    # Exploit: defer to the greedy policy for the best-valued action.
    return greedy_policy(Qtable, state)

#### Defining the HyperParameters

In [171]:
# --- Training ---
n_training_episodes = 400_000   # total number of training episodes
learning_rate = 0.8             # step size used in the Q-value update

# --- Environment ---
env_id = "FrozenLake-v1"        # name of the environment
max_steps = 100                 # maximum number of steps per episode
gamma = 0.90                    # discount rate

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1.0               # exploration probability at the start
min_epsilon = 0.05              # minimum exploration probability
decay_rate = 1e-5               # exponential decay rate of the exploration probability

# --- Video recording ---
save_video = True               # whether training episodes are recorded to video
save_frequency = 100_000        # record every `save_frequency` episodes (and the last one)
fps = 3                         # frames per second of the saved videos

#### Creating the training loop method

In [172]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable, save_video, save_frequency, fps):
    """Train a tabular Q-learning agent on `env`, updating `Qtable` in place.

    Each episode uses an epsilon-greedy behaviour policy whose epsilon decays
    exponentially from `max_epsilon` towards `min_epsilon`. After every step
    the table is updated with
    Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)].

    NOTE: `learning_rate` and `gamma` are not parameters; they are read from
    module-level variables at call time, as is `epsilon_greedy_policy`.

    :param n_training_episodes: number of episodes to run
    :param min_epsilon: lower bound of the exploration probability
    :param max_epsilon: exploration probability at episode 0
    :param decay_rate: exponential decay rate applied per episode
    :param env: environment exposing reset(), step(action) and render()
    :param max_steps: maximum number of steps per episode
    :param Qtable: state-action value table; mutated in place
    :param save_video: if truthy, record selected episodes to ./Videos/
    :param save_frequency: record every `save_frequency`-th episode
                           (the final episode is always recorded)
    :param fps: frame rate passed to imageio when writing a video
    :return: the same `Qtable` object that was passed in
    """
    for episode in tqdm(range(n_training_episodes)):
        # Reduce epsilon because we need less and less exploration as we proceed
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

        # Decide once per episode whether this episode is recorded.
        record = save_video and (episode % save_frequency == 0 or episode == n_training_episodes - 1)
        frames = []

        # Reset the environment
        state, info = env.reset()
        if record:
            frames.append(env.render())  # initial frame, before any action

        for _ in range(max_steps):
            # Choose the action At using epsilon greedy policy
            action = epsilon_greedy_policy(Qtable, state, epsilon)

            # Take action At and observe state s' and reward r
            new_state, reward, terminated, truncated, info = env.step(action)

            # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max(Q(s',a')) - Q(s,a)]
            td_target = reward + gamma * np.max(Qtable[new_state])
            Qtable[state][action] = Qtable[state][action] + learning_rate * (td_target - Qtable[state][action])

            if record:
                frames.append(env.render())

            # If terminated or truncated, finish the episode
            if terminated or truncated:
                break

            # Our next state is the new state
            state = new_state

        if record:
            plt.close()  # closes the current matplotlib figure, kept from the original
            vid_name = f'./Videos/training{episode}.mp4'
            # Create the output directory if needed so the write does not fail on a fresh checkout.
            os.makedirs(os.path.dirname(vid_name), exist_ok=True)
            imageio.mimsave(vid_name, frames, fps=fps)  # Save the frames as a video file

    return Qtable

#### Train the Q-Learning agent

In [173]:
# `train` updates the table it is given in place and returns that same object,
# so re-running this cell continues from the current values rather than from zeros.
Qtable_FrozenLake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_FrozenLake, save_video, save_frequency, fps)

100%|██████████| 400000/400000 [02:45<00:00, 2419.19it/s]


In [174]:
# Display the learned state-action values (one row per state, one column per action).
Qtable_FrozenLake

array([[0.22876792, 0.25418658, 0.25418658, 0.22876792],
       [0.22876792, 0.28242954, 0.28242954, 0.25418658],
       [0.25418658, 0.3138106 , 0.3138106 , 0.28242954],
       [0.28242954, 0.34867844, 0.34867844, 0.3138106 ],
       [0.3138106 , 0.38742049, 0.38742049, 0.34867844],
       [0.34867844, 0.43046721, 0.43046721, 0.38742049],
       [0.38742049, 0.4782969 , 0.4782969 , 0.43046721],
       [0.43046721, 0.531441  , 0.4782969 , 0.4782969 ],
       [0.25418658, 0.28242954, 0.28242954, 0.22876792],
       [0.25418658, 0.3138106 , 0.3138106 , 0.25418658],
       [0.28242954, 0.34867844, 0.34867844, 0.28242954],
       [0.3138106 , 0.        , 0.38742049, 0.3138106 ],
       [0.34867844, 0.43046721, 0.43046721, 0.34867844],
       [0.38742049, 0.4782969 , 0.4782969 , 0.38742049],
       [0.43046721, 0.531441  , 0.531441  , 0.43046721],
       [0.4782969 , 0.59049   , 0.531441  , 0.4782969 ],
       [0.28242954, 0.3138106 , 0.3138106 , 0.25418658],
       [0.28242954, 0.34867844,

In [175]:
# def record_video(env, Qtable, out_directory, fps=1):
#   """
#   Generate a replay video of the agent
#   :param env
#   :param Qtable: Qtable of our agent
#   :param out_directory
#   :param fps: frames per second of the output video (with taxi-v3 and frozenlake-v1 we use 1)
#   """
#   images = []  
#   terminated = False
#   truncated = False
#   state, info = env.reset(seed=random.randint(0,500))
#   img = env.render()
#   images.append(img)
#   while not (terminated or truncated):
#     # Take the action (index) that have the maximum expected future reward given that state
#     action = np.argmax(Qtable[state][:])
#     state, reward, terminated, truncated, info = env.step(action) # We directly put next_state = state for recording logic
#     img = env.render()
#     images.append(img)
#   imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)

In [176]:
# out_directory=f"./Videos/final.mp4"
# record_video(env, Qtable_FrozenLake, out_directory, 2)