In [5]:
import random

# Blackjack Environment
class Blackjack:
    """Minimal blackjack table for one player against the dealer.

    Cards are plain integers. Ranks 2-10 are worth their face value; Jack,
    Queen and King are each stored as 10; an Ace is stored as 11. An Ace is
    re-counted as 1 when the hand would otherwise exceed 21 (see
    ``calculate_hand_value``).

    ``deck`` lists four copies of the thirteen ranks (52 entries). Cards are
    sampled from it and never removed, so every draw sees the full list.
    """

    def __init__(self):
        one_suit = list(range(2, 11)) + [10, 10, 10, 11]
        self.deck = one_suit * 4
        self.player_hand = []  # cards currently held by the player
        self.dealer_hand = []  # cards held by the dealer; index 0 is the face-up card

    def draw_card(self, hand):
        """Append one randomly chosen card from the deck to ``hand`` and return it."""
        hand.append(random.choice(self.deck))
        return hand[-1]

    def calculate_hand_value(self, hand):
        """Return the total of ``hand``, counting Aces as 1 where needed.

        Every Ace starts at 11; while the total is above 21, one Ace at a
        time is dropped to 1 (a reduction of 10) until the total fits or no
        Aces remain.
        """
        value = sum(hand)
        for _ in range(hand.count(11)):
            if value <= 21:
                break
            value -= 10
        return value

    def start_game(self):
        """Deal a fresh round and return the initial state.

        Both hands are emptied, then cards are dealt alternately (player,
        dealer, player, dealer). The returned tuple is described in
        ``get_state``.
        """
        self.player_hand, self.dealer_hand = [], []
        for _ in range(2):
            self.draw_card(self.player_hand)
            self.draw_card(self.dealer_hand)
        return self.get_state()

    def get_state(self):
        """Return ``(player total, dealer's first card, Aces in player's hand)``.

        The Ace count includes every Ace the player holds, whether it is
        currently counted as 11 or as 1.
        """
        player_total = self.calculate_hand_value(self.player_hand)
        dealer_upcard = self.dealer_hand[0]
        ace_count = self.player_hand.count(11)
        return (player_total, dealer_upcard, ace_count)

    def step(self, action):
        """Apply one player decision and return ``(state, reward, done)``.

        ``action == 0`` hits: the player draws a card and loses (reward -1,
        done) on going over 21, otherwise play continues with reward 0.
        Any other value sticks: the dealer draws until reaching at least 17,
        then the hands are compared for a reward of 1 (win), -1 (loss) or
        0 (tie), and the round is over.
        """
        if action == 0:  # hit/draw
            self.draw_card(self.player_hand)
            busted = self.calculate_hand_value(self.player_hand) > 21
            return self.get_state(), (-1 if busted else 0), busted

        # stick/stop: the dealer now plays out the hand
        while self.calculate_hand_value(self.dealer_hand) < 17:
            self.draw_card(self.dealer_hand)

        player_total = self.calculate_hand_value(self.player_hand)
        dealer_total = self.calculate_hand_value(self.dealer_hand)

        if player_total > 21:
            reward = -1
        elif dealer_total > 21 or player_total > dealer_total:
            reward = 1
        elif player_total < dealer_total:
            reward = -1
        else:
            reward = 0
        return self.get_state(), reward, True

# Monte Carlo Control
def monte_carlo_control_dealer(episodes=50000, epsilon=0.1, env=None):
    """Estimate action values with first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy policy over the current
    estimates. Each (state, action) estimate is the average undiscounted
    return observed from the first occurrence of that pair in an episode.

    Args:
        episodes: Number of episodes to simulate.
        epsilon: Probability of choosing a random action instead of the
            greedy one.
        env: Environment exposing ``start_game() -> state`` and
            ``step(action) -> (next_state, reward, done)`` with hashable
            states. Defaults to a fresh ``Blackjack()``.

    Returns:
        dict mapping every visited state to ``{0: Q(hit), 1: Q(stick)}``.
        An action never tried in a state keeps its initial value of 0.
    """
    if env is None:
        env = Blackjack()

    q_values = {}            # Q(s, a), stored as {state: {action: value}}
    return_sums = {}         # running sum of first-visit returns per (s, a)
    state_action_count = {}  # number of first visits per (s, a)

    def get_action(state):
        # Explore with probability epsilon, and always for unseen states.
        if random.random() < epsilon:
            return random.choice([0, 1])
        if state in q_values:
            return max(q_values[state], key=q_values[state].get)  # exploit
        return random.choice([0, 1])

    for _ in range(episodes):
        state = env.start_game()
        episode = []
        done = False
        while not done:
            action = get_action(state)
            next_state, reward, done = env.step(action)
            episode.append((state, action, reward))
            state = next_state

        # Walk the episode backwards, accumulating the return (discount 1).
        # Earlier time steps overwrite later ones, so each pair ends up with
        # the return that follows its FIRST occurrence.
        first_visit_return = {}
        G = 0
        for state, action, reward in reversed(episode):
            G += reward
            first_visit_return[(state, action)] = G

        for (state, action), G in first_visit_return.items():
            pair = (state, action)
            return_sums[pair] = return_sums.get(pair, 0) + G
            state_action_count[pair] = state_action_count.get(pair, 0) + 1
            # Create the per-state table only once: re-creating it when a new
            # action is first tried would erase the other action's estimate.
            action_values = q_values.setdefault(state, {0: 0, 1: 0})
            action_values[action] = return_sums[pair] / state_action_count[pair]

    return q_values

# Train with the default settings (50000 episodes, epsilon=0.1) and dump the
# whole learned table. Keys are (player total, dealer's first card, Aces in
# the player's hand); values map action 0 (hit) / 1 (stick) to its estimate.
# No random seed is set in the visible cells, so the numbers vary per run.
q_values = monte_carlo_control_dealer()
print(q_values)

{(11, 4, 0): {0: 0.22666666666666666, 1: -0.7777777777777778}, (15, 4, 0): {0: -0.37349397590361444, 1: -0.42857142857142855}, (7, 5, 0): {0: -0.05263157894736842, 1: -0.6}, (9, 7, 0): {0: 0.07692307692307693, 1: -0.3333333333333333}, (5, 10, 0): {0: -0.5209580838323353, 1: -0.6}, (12, 10, 0): {0: -0.4338292873923258, 1: -0.5706214689265536}, (18, 6, 0): {0: -0.8064516129032258, 1: 0.17479674796747968}, (16, 3, 0): {0: -0.5454545454545454, 1: -0.32710280373831774}, (15, 11, 0): {0: -0.6454545454545455, 1: -0.9230769230769231}, (12, 11, 0): {0: -0.554016620498615, 1: -1.0}, (14, 8, 0): {0: -0.68, 1: -0.3633633633633634}, (16, 11, 0): {0: -0.728, 1: -0.84}, (15, 7, 0): {0: -0.365625, 1: -0.5652173913043478}, (19, 4, 0): {0: -0.47368421052631576, 1: 0.4409937888198758}, (16, 10, 0): {0: -0.5727739726027398, 1: -0.5757575757575758}, (20, 10, 0): {0: -0.9278350515463918, 1: 0.42706838833094213}, (19, 10, 0): {0: -0.7647058823529411, 1: -0.02464788732394366}, (17, 10, 0): {0: -0.549549549549

In [6]:
def play_blackjack_user(env=None):
    """Play one interactive round of blackjack on the console.

    The player's hand and the dealer's first card are shown before each
    decision. The user types 0 to hit or 1 to stick; anything else is
    rejected and the prompt is repeated. When the round ends, both hands
    and the result are printed.

    Args:
        env: Game environment to play on. It must provide ``start_game()``,
            ``step(action)``, ``calculate_hand_value(hand)`` and the
            ``player_hand`` / ``dealer_hand`` lists. Defaults to a fresh
            ``Blackjack()``.

    Returns:
        None. All results are reported through ``print``.
    """
    if env is None:
        env = Blackjack()
    env.start_game()
    done = False

    while not done:
        print(f"Your hand: {env.player_hand}, Total: {env.calculate_hand_value(env.player_hand)}")
        print(f"Dealer showing: {env.dealer_hand[0]}")

        raw_action = input("Enter 0 to hit, 1 to stick: ")
        try:
            action = int(raw_action)
            if action not in [0, 1]:
                raise ValueError
        except ValueError:
            print("Invalid input. Please enter 0 or 1.")
            continue

        # The returned state is not needed; the hands are read from env directly.
        _, reward, done = env.step(action)

    # The loop only exits once the round is over, so reward is always set here.
    print(f"Your hand: {env.player_hand}, Total: {env.calculate_hand_value(env.player_hand)}")
    print(f"Dealer hand: {env.dealer_hand}, Total: {env.calculate_hand_value(env.dealer_hand)}")
    if reward == 1:
        print("You win!")
    elif reward == -1:
        print("You lose!")
    else:
        print("Draw!")

In [7]:
# Start one interactive round; this cell waits for keyboard input (0 = hit, 1 = stick).
play_blackjack_user()

Your hand: [10, 10], Total: 20
Dealer showing: 6
Enter 0 to hit, 1 to stick: 1
Your hand: [10, 10], Total: 20
Dealer hand: [6, 9, 10], Total: 25
You win!
