In [46]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import numpy as np
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict # allows access to undefined keys
# Use the Tk GUI backend: figures open in a separate window rather than inline.
matplotlib.use('TkAgg')  # or 'Qt5Agg' if you prefer Qt

In [47]:
class BlackjackAgent():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The agent is self-contained: it does not read any notebook-level
    environment object, so it can be created before the environment exists
    and keeps working if the global ``env`` name is later rebound.
    """

    def __init__(self,
                 learning_rate:float,
                 initial_epsilon:float,
                 epsilon_decay:float,
                 final_epsilon:float,
                 discount_factor:float = 0.95,
                 n_actions:int = 2,
                 ):
        """Create an agent with an empty Q-table.

        Args:
            learning_rate: Step size applied to each temporal-difference update.
            initial_epsilon: Starting probability of taking a random action.
            epsilon_decay: Factor epsilon is multiplied by on every
                ``decay_epsilon`` call.
            final_epsilon: Lower bound epsilon never drops below.
            discount_factor: Gamma, the weight given to the next state's value.
            n_actions: Size of the discrete action space. The default of 2
                matches this notebook's Blackjack actions (stick, hit).

        Raises:
            ValueError: If ``n_actions`` is smaller than 1.
        """
        if n_actions < 1:
            raise ValueError("n_actions must be a positive integer")
        self.n_actions = n_actions

        # Maps observation -> array of action values; a row of zeros is
        # created the first time an observation is looked up.
        self.q_values = defaultdict(lambda: np.zeros(self.n_actions))

        self.lr = learning_rate
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        self.discount_factor = discount_factor

        # One temporal-difference error per update_q_values call.
        self.training_error = []

    def choose_action(self, obs:tuple[int,int,bool])->int:
        """Pick an action for ``obs`` epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest stored value (ties resolve to
        the lowest index, as ``np.argmax`` does).
        """
        if np.random.random() < self.epsilon:
            # Explore: sample from NumPy's global generator, the same source
            # used for the epsilon test above.
            return int(np.random.randint(self.n_actions))
        # Exploit the current estimate.
        return int(np.argmax(self.q_values[obs]))

    def update_q_values(self,
                        obs:tuple[int,int,bool],
                        action:int,
                        reward:float,
                        terminated:bool,
                        next_obs:tuple[int,int,bool]):
        """Apply one Q-learning update for the transition that was just observed.

        Args:
            obs: Observation the action was taken in.
            action: Index of the action taken.
            reward: Reward returned by the environment for this step.
            terminated: True when the step ended the episode; the bootstrap
                term is then zeroed.
            next_obs: Observation returned by the step.
        """
        # The lookup happens even for terminal transitions, so terminal
        # observations also receive an (all-zero) row in the table.
        best_next_value = np.max(self.q_values[next_obs])
        future_q_value = (not terminated) * best_next_value

        temporal_difference = (
            reward + self.discount_factor * future_q_value
        ) - self.q_values[obs][action]

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

In [48]:
# Training budget and step size.
n_episodes = 10_000
learning_rate = 0.01

# Exploration schedule: start fully random and shrink towards final_epsilon.
# NOTE(review): epsilon is multiplied by epsilon_decay once per episode, so
# with 0.80 it reaches the 0.05 floor after about 14 of the 10,000 episodes.
# Confirm that such a short exploration phase is intended.
start_epsilon = 1
final_epsilon = 0.05
epsilon_decay = 0.80

agent = BlackjackAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

In [49]:
# Environment used for training.
# NOTE(review): sab=True presumably selects the Sutton & Barto rule variant;
# confirm against the Gymnasium Blackjack documentation.
env = gym.make("Blackjack-v1", sab=True)


In [50]:
# Wrap the environment so per-episode returns and lengths are recorded.
# The guard keeps this cell idempotent: running it a second time does not
# stack another RecordEpisodeStatistics wrapper on top of the first one.
if not isinstance(env, gym.wrappers.RecordEpisodeStatistics):
    env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)

rewards = 0  # running sum of every step reward seen during training
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # Play one episode, learning from each transition as it happens.
    while not done:
        action = agent.choose_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        rewards += reward

        # Only `terminated` is passed on: it decides whether the update
        # bootstraps from next_obs.
        agent.update_q_values(obs, action, reward, terminated, next_obs)

        # The episode ends on either termination or truncation.
        done = terminated or truncated
        obs = next_obs

    # Exploration is reduced once per episode, not once per step.
    agent.decay_epsilon()

100%|██████████| 10000/10000 [00:01<00:00, 5322.98it/s]


In [51]:
# Number of distinct observations that have a row in the Q-table. Rows are
# created lazily on first lookup, so this counts looked-up observations only.
print(len(agent.q_values))


359


In the Blackjack environment, the state space is defined by three components:

- The player's current sum (ranges from 4 to 21)
- The dealer's visible card (ranges from 1 to 10, where 1 represents an Ace)
- Whether the player has a usable Ace (True or False)

So, the total number of possible states is:
(21 - 4 + 1) * 10 * 2 = 18 * 10 * 2 = 360
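However, `len(agent.q_values)` above reports 359 rather than 360, and the two numbers are not expected to match. The dictionary only holds observations that were actually looked up during training, so combinations that were never visited (or that cannot occur, such as a usable Ace with a sum below 12) are missing. At the same time, `update_q_values` looks up `next_obs` even on terminal transitions, so bust observations (player sum above 21) are added to the dictionary as extra entries.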

In [52]:
def moving_average(values, window):
    """Return the rolling mean of ``values`` over ``window`` consecutive samples.

    Only windows that lie entirely inside the data are kept (``mode="valid"``),
    so the curve has no zero-padded ramps at its ends. The window is clamped
    to the data length, and empty input yields an empty array.
    """
    data = np.array(values, dtype=float).flatten()
    if data.size == 0:
        return data
    window = max(1, min(int(window), data.size))
    return np.convolve(data, np.ones(window), mode="valid") / window


rolling_length = 500
fig, axs = plt.subplots(ncols=3, figsize=(12, 5))

# All three curves are smoothed the same way so they can be compared directly.
axs[0].set_title("Episode rewards")
reward_moving_average = moving_average(env.return_queue, rolling_length)
axs[0].plot(range(len(reward_moving_average)), reward_moving_average)
axs[0].set_xlabel("Episode")

axs[1].set_title("Episode lengths")
length_moving_average = moving_average(env.length_queue, rolling_length)
axs[1].plot(range(len(length_moving_average)), length_moving_average)
axs[1].set_xlabel("Episode")

# The training error holds one entry per Q-value update, i.e. per
# environment step, so its x-axis is much longer than the episode axes.
axs[2].set_title("Training Error")
training_error_moving_average = moving_average(agent.training_error, rolling_length)
axs[2].plot(range(len(training_error_moving_average)), training_error_moving_average)
axs[2].set_xlabel("Update step")

plt.tight_layout()
plt.show()

In [53]:
# `rewards` here is the sum of all step rewards accumulated by the training loop.
print(f'total rewards = {rewards}')


total rewards = -1009.0


In [54]:
# Evaluation environment with frame rendering. It uses the same sab=True
# setting as the training environment, so the learned policy is replayed
# under the rules it was trained on.
# Note: this rebinds the notebook-level name `env`.
env = gym.make("Blackjack-v1", sab=True, render_mode="rgb_array")
# Reset once up front so env.render() has a dealt hand to draw.
obs, info = env.reset()

plt.ion()
fig, ax = plt.subplots(figsize=(8,8))
action_text = ax.text(510, 20, '', color='white', fontsize=12, bbox=dict(facecolor='blue', alpha=0.8))
actions = ['Stick','Hit']
img = ax.imshow(env.render())
rewards = 0  # restarted here: counts only the episodes replayed below
num_epochs= 10
try:
    # Each iteration of the outer loop is one complete episode.
    for step in range(num_epochs):
        obs, info = env.reset()
        done = False
        while not done:
            print(f'step {step}:  obs = {obs}')
            # Same epsilon-greedy policy as in training, at whatever value
            # epsilon has decayed to by now.
            action = agent.choose_action(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)
            rewards += reward

            # Swap the new frame into the existing image and label the action.
            frame = env.render()
            img.set_data(frame)
            action_text.set_text(f'Step: {actions[action] }')

            fig.canvas.draw()
            fig.canvas.flush_events()
            plt.pause(.5)
            done = terminated or truncated
            obs = next_obs
finally:
    # Always restore matplotlib state and release the environment, even if
    # the loop above is interrupted or raises.
    plt.ioff()
    plt.close(fig)
    env.close()

step 0:  obs = (13, 5, 1)
step 0:  obs = (21, 5, 1)
step 1:  obs = (13, 3, 0)
step 1:  obs = (15, 3, 0)
step 2:  obs = (20, 8, 0)
step 3:  obs = (12, 5, 0)
step 3:  obs = (20, 5, 0)
step 4:  obs = (12, 7, 0)
step 4:  obs = (19, 7, 0)
step 5:  obs = (13, 2, 0)
step 6:  obs = (17, 8, 0)
step 7:  obs = (15, 10, 1)
step 8:  obs = (21, 1, 1)
step 9:  obs = (17, 3, 0)


In [55]:
# `rewards` was reset in the rendering cell, so this is the reward summed over
# the rendered episodes only, not over training.
print(f'total rewards = {rewards}')

total rewards = 2.0


In [56]:
# Despite the "shape" label, `.n` is the number of discrete actions.
print(f'action space shape : {env.action_space.n}') # Number of possible actions is 2 (stick or hit)
print(f'observation space shape : {env.observation_space}') 
#-------------- observation is a tuple of 3 values : --------------
#1) player cards value
#2) dealer's face up card
#3) usable ace for player, equal 1 if ace is considered an 11 without busting

print(f'reward range : {env.reward_range}') # default reward range is set to -inf +inf
# print(f'\nEnv specs : {env.spec}') 
print(f'\nEnv metadata : {env.metadata}') # render_modes and render_fps

action space shape : 2
observation space shape : Tuple(Discrete(32), Discrete(11), Discrete(2))
reward range : (-inf, inf)

Env metadata : {'render_modes': ['human', 'rgb_array'], 'render_fps': 4}
