In [1]:
# Link to the tutorial this code follows:
# https://gymnasium.farama.org/introduction/train_agent/

In [2]:
# training an agent to play blackjack
from collections import defaultdict
import gymnasium as gym
import numpy as np

In [8]:
from gymnasium.core import Env
class BlackjackAgent:
  """Tabular Q-learning agent with an epsilon-greedy policy.

  Q-values are kept in a ``defaultdict`` keyed by observation; each entry is
  an array with one value per action (``env.action_space.n`` entries).
  """

  def __init__(
      self,
      env: gym.Env,
      learning_rate: float,
      initial_epsilon: float,
      epsilon_decay: float,
      final_epsilon: float,
      discount_factor: float = 0.95,
  ):
    """Set up the Q-table and the learning/exploration hyperparameters.

    Args:
      env: Environment whose ``action_space`` is used to size the Q-value
        arrays and to sample random (exploratory) actions.
      learning_rate: Step size applied to each temporal-difference update.
      initial_epsilon: Starting probability of taking a random action.
      epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``.
      final_epsilon: Lower bound that epsilon is never decayed below.
      discount_factor: Weight applied to the estimated future value.
    """
    self.env = env

    # defaultdict automatically creates a zero-filled entry for new states
    self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

    self.lr = learning_rate
    self.discount_factor = discount_factor

    # Exploration parameters
    self.epsilon = initial_epsilon
    self.epsilon_decay = epsilon_decay
    # Must be named `final_epsilon`: decay_epsilon() reads this attribute.
    self.final_epsilon = final_epsilon

    # Track learning progress: one temporal-difference error per update
    self.training_error = []

  def get_action(self, obs: tuple[int, int, bool]) -> int:
    """Choose an action for ``obs`` using the epsilon-greedy rule.

    Args:
      obs: ``(player_sum, dealer_card, usable_ace)`` where
        player_sum is the player's sum of cards,
        dealer_card is the dealer's showing card,
        usable_ace tells whether the player holds a usable ace.

    Returns:
      action: 0 (stand) or 1 (hit)
    """
    # Explore: with probability epsilon take a random action
    if np.random.random() < self.epsilon:
      return self.env.action_space.sample()

    # Exploit: take the action with the highest current Q-value estimate
    return int(np.argmax(self.q_values[obs]))

  def update_qvalues(
      self,
      obs: tuple[int, int, bool],
      action: int,
      reward: float,
      terminate: bool,
      next_obs: tuple[int, int, bool],
  ):
    """Apply one Q-learning update for a single observed transition.

    Args:
      obs: Observation in which ``action`` was taken.
      action: Index of the action that was taken.
      reward: Reward received for the transition.
      terminate: True if the episode ended with this transition.
      next_obs: Observation reached after the transition.
    """
    # Zero if the episode terminated - no future rewards are possible
    future_q_value = (not terminate) * np.max(self.q_values[next_obs])

    # Target Q-value using the Bellman equation
    target = reward + self.discount_factor * future_q_value

    # How wrong was our current estimate
    temporal_difference = target - self.q_values[obs][action]

    # Move the estimate toward the target by a learning-rate-sized step
    self.q_values[obs][action] = (
        self.q_values[obs][action] + self.lr * temporal_difference
    )

    # Track learning progress
    self.training_error.append(temporal_difference)

  def decay_epsilon(self):
    """Lower epsilon by ``epsilon_decay``, never going below ``final_epsilon``."""
    self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)