In [1]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import FrozenLakeEnv
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import numpy as np
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict # allows access to undefined keys
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt

In [2]:


class CustomFrozenLake(FrozenLakeEnv):
    def __init__(self, goal_reward=100, hole_penalty=-50, step_penalty=-1,stuck_penalty=-1, **kwargs):
        super().__init__(**kwargs)
        self.goal_reward = goal_reward
        self.hole_penalty = hole_penalty
        self.step_penalty = step_penalty
        self.stuck_penalty = stuck_penalty

    def step(self, action):
        prev_state = self.unwrapped.s
        
        state, reward, terminated, truncated, info = super().step(action)
        
        current_tile = self.desc[self.unwrapped.s // self.ncol, self.unwrapped.s % self.ncol]
        
        if current_tile in b'H':
            reward = self.hole_penalty  # Apply penalty for falling into a hole
        elif current_tile in b'G':
            reward = self.goal_reward  # Apply higher reward for reaching the goal
        
        elif prev_state == state :
            reward = self.stuck_penalty  # Apply small penalty for walking on frozen tiles
        else:
            reward = self.step_penalty
            
        
        return state, reward, terminated, truncated, info

In [3]:
class FrozenLakeAgent():
    def __init__(self,
                 learning_rate:float,
                 initial_epsilon:float,
                 epsilon_decay:float,
                 final_epsilon:float,
                 discount_factor:float = 0.95,
                 ):
        
    #Initialize the agent with empty dictionary of action/state values (q_values), a learning rate and an epsilon
    # discount_factor : Is for computing the Q-value namely gamma 
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
    
        self.lr = learning_rate
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.discount_factor = discount_factor
        
        self.training_error = []
    
    def choose_action(self, obs:tuple[int,int,bool])->int:
        # Return the best action with a probability of (1- epsilon) 
        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        else:
            return int(np.argmax(self.q_values[obs]))
    
    def update_q_values(self,
                        obs:tuple[int,int,bool],
                        action:int,
                        reward:float,
                        terminated:bool,
                        next_obs:tuple[int,int,bool]):
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])

        temporal_diffrence = (reward + (self.discount_factor * future_q_value))- self.q_values[obs][action]
        
        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_diffrence
        )
        self.training_error.append(temporal_diffrence)
        
    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

In [4]:
learning_rate = .1
n_episodes = 10000
start_epsilon = 1
epsilon_decay = 0.999
final_epsilon = 0.05

agent = FrozenLakeAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    final_epsilon=final_epsilon,
    epsilon_decay=epsilon_decay,
    
    
)

In [5]:
env = CustomFrozenLake(map_name="8x8", is_slippery=False, render_mode='rgb_array')


In [6]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
env = gym.wrappers.TimeLimit(env, max_episode_steps=60)

rewards = 0 
for episode in tqdm(range(n_episodes)):
    
    obs, info = env.reset()
    done = False
    
    # play one episode
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward
        # update the agent
        agent.update_q_values(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

100%|██████████| 10000/10000 [00:06<00:00, 1478.52it/s]


In [7]:
q_values = np.array([value for key, value in agent.q_values.items()])
print(np.argmax(q_values,axis=1))


[2 2 2 2 1 2 1 1 0 1 2 1 1 1 1 1 2 0 2 2 0 1 2 1 1 1 2 0 2 3 0 2 3 0 1 1 1
 1 1 2 1 0 2 3 0 1 0 0 0 2 2 0 0 1 0 1 0 3 3 0 0 2 1 2]


In the Blackjack environment, the state space is defined by three components:

- The player's current sum (ranges from 4 to 21)
- The dealer's visible card (ranges from 1 to 10, where 1 represents an Ace)
- Whether the player has a usable Ace (True or False)

So, the total number of possible states is:
(21 - 4 + 1) * 10 * 2 = 18 * 10 * 2 = 360
However, you're seeing 380 instead of 360. This is because the environment also includes some terminal states that can occur when the player's sum exceeds 21 (bust states). These additional states account for the extra 20 entries in your q_values dictionary.

In [8]:
rolling_length = 500
fig, axs = plt.subplots(ncols=3, figsize=(12, 5))
axs[0].set_title("Episode rewards")
# compute and assign a rolling average of the data to provide a smoother graph
reward_moving_average = (
    np.convolve(
        np.array(env.return_queue).flatten(), np.ones(rolling_length), mode="valid"
    )
    / rolling_length
)
axs[0].plot(range(len(reward_moving_average)), reward_moving_average)
axs[1].set_title("Episode lengths")
length_moving_average = (
    np.convolve(
        np.array(env.length_queue).flatten(), np.ones(rolling_length), mode="same"
    )
    / rolling_length
)
axs[1].plot(range(len(length_moving_average)), length_moving_average)
axs[2].set_title("Training Error")
training_error_moving_average = (
    np.convolve(np.array(agent.training_error), np.ones(rolling_length), mode="same")
    / rolling_length
)
axs[2].plot(range(len(training_error_moving_average)), training_error_moving_average)
plt.tight_layout()
plt.show()

  logger.warn(
  logger.warn(


In [9]:
print(f'total rewards = {rewards}')


total rewards = 508898


In [11]:


# env = gym.make("FrozenLake-v1", render_mode="rgb_array")
# env = gym.wrappers.TimeLimit(env, max_episode_steps=100)
obs, info = env.reset()

plt.ion()
fig, ax = plt.subplots(figsize=(8,8))
action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
img = ax.imshow(env.render())
actions = ['Move Up','Move Right','Move Down','Move Left']
rewards = 0
num_epochs= 3
for step in range(num_epochs):
    obs, info = env.reset()
    done = False
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward
        
        print(f'step {step}:  obs = {next_obs} , reward = {reward}')
        frame = env.render()
        img.set_data(frame)
        action_text.set_text(f'Step: {actions[action] }')

        fig.canvas.draw()
        fig.canvas.flush_events()
        plt.pause(.05)
        done = terminated or truncated
        obs = next_obs

plt.ioff()  # Turn off interactive mode
# plt.show()  # Keep the window open after the animation finishes
plt.close()
env.close()

step 0:  obs = 1 , reward = -1
step 0:  obs = 0 , reward = -1
step 0:  obs = 1 , reward = -1
step 0:  obs = 2 , reward = -1
step 0:  obs = 10 , reward = -1
step 0:  obs = 18 , reward = -1
step 0:  obs = 26 , reward = -1
step 0:  obs = 27 , reward = -1
step 0:  obs = 28 , reward = -1
step 0:  obs = 36 , reward = -1
step 0:  obs = 37 , reward = -1
step 0:  obs = 38 , reward = -1
step 0:  obs = 39 , reward = -1
step 0:  obs = 47 , reward = -1
step 0:  obs = 55 , reward = -1
step 0:  obs = 63 , reward = 100
step 1:  obs = 1 , reward = -1
step 1:  obs = 0 , reward = -1
step 1:  obs = 1 , reward = -1
step 1:  obs = 2 , reward = -1
step 1:  obs = 10 , reward = -1
step 1:  obs = 18 , reward = -1
step 1:  obs = 26 , reward = -1
step 1:  obs = 27 , reward = -1
step 1:  obs = 19 , reward = -50
step 2:  obs = 1 , reward = -1
step 2:  obs = 2 , reward = -1
step 2:  obs = 10 , reward = -1
step 2:  obs = 18 , reward = -1
step 2:  obs = 26 , reward = -1
step 2:  obs = 27 , reward = -1
step 2:  obs = 2

In [None]:
print(f'total rewards = {rewards}')

In [None]:
# print(f'action space shape : {env.action_space.n}') # Number of possible actions is 4
# print(f'observation space shape : {env.observation_space}') 
# #-------------- obesrvation is a tupe of 3 values : --------------
# #1) player cards value
# #2) dealer's face up card
# #3) usable ace for player, equal 1 if ace is considered an 11 without busting
# 
# print(f'reward range : {env.reward_range}') # default reward range is set to -inf +inf
# # print(f'\nEnv specs : {env.spec}') 
# print(f'\nEnv metadata : {env.metadata}') # render_modes adn render_fps