In [18]:
import gymnasium as gym
# Build the 4x4 FrozenLake environment with slipping disabled; render() is
# used later in the notebook to collect frames, hence render_mode="rgb_array".
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode="rgb_array",
)

In [19]:
# Inspect the action space: `.n` is the count of discrete actions and
# `.sample()` draws one action index at random.
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())

Action Space Shape 4
Action Space Sample 2


In [20]:
# Read the sizes of the discrete state and action spaces; these two numbers
# give the Q-table its dimensions (rows = states, columns = actions).
state_space = env.observation_space.n
print("There are ", state_space, " possible states")

action_space = env.action_space.n
print("There are ", action_space, " possible actions")

There are  16  possible states
There are  4  possible actions


In [21]:
import numpy as np
def initialize_q_table(state_space, action_space):
    """Create a zero-filled Q-table.

    Args:
        state_space: Number of states (rows of the table).
        action_space: Number of actions (columns of the table).

    Returns:
        A ``numpy.ndarray`` of shape ``(state_space, action_space)`` in which
        every entry is ``0.0``.
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)

In [22]:
# Allocate the all-zero Q-table for FrozenLake, sized from the state and
# action counts read from the environment.
Qtable_frozenlake = initialize_q_table(state_space, action_space)

In [23]:
# Display the untrained table: one row per state, one column per action.
Qtable_frozenlake

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [24]:
def greedy_policy(Qtable, state):
    """Pick the greedy (pure exploitation) action for ``state``.

    Args:
        Qtable: Action-value table indexed as ``Qtable[state][action]``.
        state: Index of the current state.

    Returns:
        Index of the highest-valued action in that state's row. When several
        actions share the maximum, ``np.argmax`` yields the first of them.
    """
    action_values = Qtable[state]
    return np.argmax(action_values)

In [25]:
import random
def epsilon_greedy_policy(Qtable, state, epsilon):
    """Choose an action with an epsilon-greedy rule.

    A single uniform draw from [0, 1] is compared with ``epsilon``: a draw
    strictly greater than ``epsilon`` exploits, anything else explores.

    Args:
        Qtable: Action-value table indexed as ``Qtable[state][action]``.
        state: Index of the current state.
        epsilon: Exploration threshold for the uniform draw.

    Returns:
        The greedy action from ``greedy_policy`` when exploiting, otherwise a
        random action sampled from the action space.

    Note:
        The random action is sampled from the notebook-level ``env`` object;
        the environment is not passed in as an argument.
    """
    # Exploitation: the draw landed above epsilon.
    if random.uniform(0, 1) > epsilon:
        return greedy_policy(Qtable, state)

    # Exploration: any action, regardless of its current value estimate.
    return env.action_space.sample()

In [26]:
# Hyperparameters for tabular Q-learning on FrozenLake.
# `learning_rate` and `gamma` are not passed to train(); its Q-update reads
# them from the notebook's global scope.

# Training parameters
n_training_episodes = 100  # Total training episodes
learning_rate = 0.7  # Learning rate

# Environment parameters
max_steps = 99  # Max steps per episode
gamma = 0.95  # Discounting rate

# Exploration parameters
max_epsilon = 1.0  # Exploration probability at start
min_epsilon = 0.05  # Minimum exploration probability
decay_rate = 0.0005  # Exponential decay rate for exploration prob
# NOTE(review): train() computes
#   epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
# so with 100 episodes epsilon only drops from 1.0 to roughly 0.95, i.e. the
# agent acts almost entirely at random during training. Confirm this pairing
# of n_training_episodes and decay_rate is intended.

In [27]:
import imageio

In [28]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Run tabular Q-learning on ``env`` and record every rendered frame.

    The exploration probability for each episode is
    ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)``.
    An episode runs for at most ``max_steps`` steps and ends early as soon as
    the environment reports ``terminated`` or ``truncated``.

    Args:
        n_training_episodes: Number of episodes to run.
        min_epsilon: Lower bound of the exploration probability.
        max_epsilon: Exploration probability used for episode 0.
        decay_rate: Exponential decay rate applied to the episode index.
        env: Environment providing ``reset()``, ``step(action)`` and ``render()``.
        max_steps: Maximum number of steps per episode.
        Qtable: Table indexed as ``Qtable[state][action]``; modified in place.

    Returns:
        The same ``Qtable`` object that was passed in, after training.

    Notes:
        ``learning_rate`` and ``gamma`` are read from the notebook's global
        scope, and actions are chosen through ``epsilon_greedy_policy``.
        Every frame of every episode is kept in memory and written to
        ``"training.png"`` with ``imageio.mimsave`` once training finishes.
    """
    frames = []
    for episode in range(n_training_episodes):
        # Exploration shrinks as the episode index grows.
        exploration_span = max_epsilon - min_epsilon
        epsilon = min_epsilon + exploration_span * np.exp(-decay_rate * episode)

        # New episode: reset and record the starting frame.
        state, info = env.reset()
        frames.append(env.render())

        for _ in range(max_steps):
            # Pick A_t, apply it, and observe R_{t+1} and S_{t+1}.
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            new_state, reward, terminated, truncated, info = env.step(action)
            frames.append(env.render())

            # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
            td_target = reward + gamma * np.max(Qtable[new_state])
            td_error = td_target - Qtable[state][action]
            Qtable[state][action] = Qtable[state][action] + learning_rate * td_error

            # The episode is over on either signal.
            if terminated or truncated:
                break

            state = new_state

    # NOTE(review): many frames are written to a ".png" target — confirm the
    # output format is the one intended.
    imageio.mimsave("training.png", [np.array(frame) for frame in frames], fps=10)
    return Qtable

In [29]:
# Train the agent. train() updates the table it receives in place and returns
# that same object, so re-running this cell continues from the already-trained
# values instead of starting again from zeros.
Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake)

In [30]:
# Display the learned action values: one row per state, one column per action.
Qtable_frozenlake

array([[0.35871885, 0.37761243, 0.34072933, 0.35858203],
       [0.35870311, 0.        , 0.31685548, 0.33593841],
       [0.33371125, 0.19943181, 0.28918576, 0.21796452],
       [0.30704981, 0.        , 0.17311762, 0.2456001 ],
       [0.36972831, 0.39748932, 0.        , 0.35861415],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.25893099],
       [0.        , 0.        , 0.        , 0.        ],
       [0.39647256, 0.        , 0.41851618, 0.37378631],
       [0.27630958, 0.44190262, 0.        , 0.        ],
       [0.        , 0.60515   , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.3095575 , 0.7448    , 0.40885008],
       [0.3095575 , 0.        , 0.973     , 0.        ],
       [0.        , 0.        , 0.        , 0.        ]])

In [31]:
# Roll out one episode with the greedy policy from the trained Q-table and
# collect a frame after the reset and after every step.
images = []
terminated = False
truncated = False
state, info = env.reset()
img = env.render()
images.append(img)
# The parentheses matter: `not terminated or truncated` parses as
# `(not terminated) or truncated`, which stays True once an episode has been
# truncated without terminating and therefore never leaves the loop.
while not (terminated or truncated):
    # Take the action with the highest learned value in the current state.
    action = np.argmax(Qtable_frozenlake[state])
    # The new observation is written straight back into `state` for the next lookup.
    state, reward, terminated, truncated, info = env.step(action)
    img = env.render()
    images.append(img)

In [32]:
# Write the frames collected during the evaluation rollout to disk.
# NOTE(review): several frames are saved to a ".png" target — confirm the
# output format is the one intended.
imageio.mimsave("test.png", list(map(np.array, images)), fps=5)