In [259]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

In [274]:
env = gym.make("Blackjack-v1",natural=False, sab=False)

In [261]:
class BlackjackAgent:
    def __init__(
        self,
        env,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Initialize a Reinforcement Learning agent with an empty dictionary
        of state-action values (q_values), a learning rate and an epsilon.

        Args:
            learning_rate: The learning rate
            initial_epsilon: The initial epsilon value
            epsilon_decay: The decay for epsilon
            final_epsilon: The final epsilon value
            discount_factor: The discount factor for computing the Q-value
        """
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, env, obs: tuple[int, int, bool]) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.
        """
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Updates the Q-value of an action."""
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)
    def predict(self,obs):
        return self.q_values[obs]



In [None]:
learning_rate = 0.01
n_episodes = 50_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1

agent = BlackjackAgent(
    env=env,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

In [289]:
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=n_episodes)
for episode in range(n_episodes):
    obs, info = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(env, obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent
        agent.update(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

KeyboardInterrupt: 

In [299]:
wins = 0
losses = 0
draws = 0
games = 10000
for _ in range(games):
    state, _ = env.reset()
    done = False
    while not done:
        action = np.argmax(agent.predict(state))
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

    if reward == 1:
        wins += 1
    elif reward == -1:
        losses += 1
    else:
        draws += 1

print("Win rate:", wins / games * 100)
print("Loss rate:", losses / games * 100)
print("Draw rate:", draws / games * 100)
print("Expected value:", (wins - losses) / games * 100)


Win rate: 43.54
Loss rate: 46.77
Draw rate: 9.69
Expected value: -3.2300000000000004


In [None]:
terminated = False
truncated = False
env = gym.make("Blackjack-v1",natural=False, sab=False", render_mode="human")
state = env.reset()
action = env.action_space.sample()
totalReward = 0
while (not terminated and not truncated):
    obs,reward ,terminated,truncated,info = env.step(action)
    totalReward +=reward
    action = model.predict(obs)[0]
    env.render()

defaultdict(<function __main__.BlackjackAgent.__init__.<locals>.<lambda>()>,
            {(17, 2, 1): array([-0.11620083,  0.0521771 ]),
             (18, 2, 1): array([0.22525827, 0.03503654]),
             (14, 10, 0): array([-0.58377607, -0.46035271]),
             (16, 10, 0): array([-0.7182416, -0.6139262]),
             (13, 10, 0): array([-0.60803243, -0.4471321 ]),
             (26, 10, 0): array([0., 0.]),
             (24, 2, 0): array([0., 0.]),
             (18, 2, 0): array([ 0.10615813, -0.68980638]),
             (17, 2, 0): array([-0.19679569, -0.58107115]),
             (13, 2, 0): array([-0.19798781, -0.33003834]),
             (22, 2, 0): array([0., 0.]),
             (19, 10, 0): array([-0.1058305 , -0.73722285]),
             (13, 7, 0): array([-0.48248027, -0.36766804]),
             (21, 7, 1): array([0.88753225, 0.34598364]),
             (15, 1, 0): array([-0.81395083, -0.61364243]),
             (8, 5, 0): array([-0.30896565,  0.06695337]),
             (25, 9