In [2]:
import numpy as np
from tqdm import trange

In [3]:
"""
The deck resets after each simulation so there will be no card counting. Hence the model will have a negative expected value

"""

# Action identifiers; they double as indices into each Node's per-action arrays.
STAND, HIT, DOUBLE_DOWN, SPLIT = range(4)

# Number of times every card of DECK is repeated to build the shoe used in training.
DECK_COUNT = 1

# One 52-card deck expressed as point values: 2-9 at face value, four
# ten-valued ranks, and the ace counted as 11. Each rank appears four times.
DECK = np.array(list(range(2, 10)) + [10] * 4 + [11]).repeat(4)

In [4]:
class Node():
    """Regret-matching bookkeeping for a single information set.

    Holds, per action, the accumulated regret and the accumulated strategy,
    and derives probability distributions from them.
    """

    def __init__(self, log_info, num_actions):
        """Create a node with zeroed accumulators.

        Args:
            log_info: Text label placed in front of the strategy when the
                node is printed.
            num_actions: Number of actions available at this node.
        """
        self.log_info = log_info
        self.num_actions = num_actions
        # Running per-action totals; both start at zero.
        self.regret_sum = np.zeros(num_actions)
        self.strategy_sum = np.zeros(num_actions)

    def __repr__(self):
        """Return the label followed by the average strategy as tab-separated whole percentages."""
        percentages = ''.join(
            format(probability * 100, '.0f') + "%\t"
            for probability in self.normalize(self.strategy_sum)
        )
        return self.log_info + "\t" + percentages

    def normalize(self, value):
        """Scale ``value`` so it sums to 1.

        Returns a uniform distribution over the node's actions when the sum
        of ``value`` is not positive.
        """
        normalizing_sum = np.sum(value)
        if normalizing_sum > 0:
            return value / normalizing_sum
        return np.ones(self.num_actions) / self.num_actions

    def get_strategy(self):
        """Return the current strategy: positive regrets normalized to a distribution."""
        return self.normalize(np.maximum(self.regret_sum, 0))

    def get_action(self):
        """Sample an action index from the average strategy.

        Returns:
            An int in ``[0, num_actions)``.
        """
        strategy = self.normalize(self.strategy_sum)
        # side='right' returns the first bucket whose cumulative probability is
        # strictly greater than the draw, so a draw that lands exactly on a
        # boundary (notably 0.0 when the leading actions have probability 0)
        # never selects a zero-probability action.
        index = np.searchsorted(np.cumsum(strategy), np.random.random(), side='right')
        # Rounding can leave the last cumulative value a hair below 1.0; clamp
        # so the result is always a valid action index.
        return min(int(index), self.num_actions - 1)

In [5]:
def get_hand_value(cards):
    """Return the blackjack total of a hand.

    Aces are encoded as 11; while the total exceeds 21, one ace at a time is
    demoted from 11 to 1.

    Args:
        cards: Card point values, as a numpy array or any sequence of ints.

    Returns:
        The hand total after ace demotion (it may still exceed 21 when the
        hand is bust).
    """
    # Coerce first: on a plain list `cards == 11` is a single False, which
    # would silently count zero aces.
    cards = np.asarray(cards)
    ace_count = np.count_nonzero(cards == 11)
    value = np.sum(cards)
    while value > 21 and ace_count > 0:
        value -= 10
        ace_count -= 1
    return value

def get_terminal_payout(player_value, dealer_cards, deck, top_card_index, action, bet):
    """Return the payout if the hand has ended, otherwise None.

    Args:
        player_value: Player's current hand total.
        dealer_cards: Array of the dealer's card values.
        deck: Shuffled card values the dealer draws from.
        top_card_index: Index in ``deck`` of the next undealt card.
        action: Action just taken, or None right after the initial deal.
        bet: Stake on the hand; a double-down is settled at twice this amount.

    Returns:
        The signed payout from the player's point of view, or None when the
        hand continues.
    """
    if action is None:
        # Fresh deal: only a two-card 21 on either side ends the hand.
        dealer_has_natural = get_hand_value(dealer_cards) == 21
        player_has_natural = player_value == 21
        if dealer_has_natural:
            return 0 if player_has_natural else -bet
        return 3/2 * bet if player_has_natural else None

    if action == DOUBLE_DOWN:
        # A double-down is settled like a stand with the stake doubled.
        action, bet = STAND, bet * 2

    if player_value > 21:
        return -bet

    if action != STAND:
        return None

    # The dealer keeps drawing from the deck until reaching at least 17.
    dealer_total = get_hand_value(dealer_cards)
    next_card = top_card_index
    while dealer_total < 17:
        dealer_cards = np.append(dealer_cards, deck[next_card])
        next_card += 1
        dealer_total = get_hand_value(dealer_cards)

    if dealer_total > 21:
        return bet
    return np.sign(player_value - dealer_total) * bet

def get_node(nodes, player_value, ace_count, can_split, dealer_card) -> Node:
    """Fetch the node for an information set, creating and storing it on first use.

    Args:
        nodes: Dict mapping information-set keys to Node objects; a new node
            is inserted when the key is missing.
        player_value: Player's hand total.
        ace_count: Number of aces still counted as 11; only whether it is
            positive enters the key and the label.
        can_split: Whether splitting is available; such nodes get 4 actions
            instead of 3.
        dealer_card: The dealer card that is part of the information set.

    Returns:
        The Node stored in ``nodes`` for this information set.
    """
    info_set_hash = str(can_split > 0) + str(ace_count > 0) + str(player_value) + str(dealer_card)
    node = nodes.get(info_set_hash)
    if node is None:
        # Build the Node only on a miss; constructing it as the default of
        # dict.get() would allocate a throwaway Node on every lookup.
        num_actions = 4 if can_split else 3
        label = str(player_value) + ("A" if ace_count > 0 else "") + "\t" + str(dealer_card)
        node = Node(label, num_actions)
        nodes[info_set_hash] = node
    return node

def counter_factual_regret(nodes, player_value, ace_count, can_split, dealer_cards, deck, top_card_index, action):
    """Apply ``action`` to the player's hand and return the resulting utility.

    If the hand is not over after the action, the node for the resulting
    information set is looked up, every action available there is evaluated
    recursively, and the node's strategy and regret sums are updated.

    Args:
        nodes: Dict of information-set key -> Node, filled in as sets are met.
        player_value: Player's hand total before ``action`` is applied.
        ace_count: Number of player aces currently counted as 11.
        can_split: Whether the node reached after this action offers SPLIT.
        dealer_cards: Array of the dealer's card values; the first element is
            the one used in the information set.
        deck: Shuffled card values.
        top_card_index: Index in ``deck`` of the next undealt card.
        action: STAND, HIT, DOUBLE_DOWN, SPLIT, or None right after the deal.

    Returns:
        The payout if the hand ended, otherwise the strategy-weighted utility
        of the node that was reached.
    """
    if action == SPLIT:
        if ace_count > 0:
            # A splittable hand that still holds an ace counted as 11 is a pair
            # of aces already reduced to soft 12. Each half is a lone ace worth
            # 11, not half of 12.
            half_value, half_aces = 11, 1
        else:
            half_value, half_aces = player_value // 2, 0
        utility = counter_factual_regret(nodes, half_value, half_aces, False, dealer_cards, deck, top_card_index, HIT)
        # top_card_index+4 is a hack. The "realistic" solution would be to synchronize top_card_index
        utility += counter_factual_regret(nodes, half_value, half_aces, False, dealer_cards, deck, top_card_index+4, HIT)
        return utility

    if action in (HIT, DOUBLE_DOWN):
        # Both actions draw exactly one card from the top of the deck.
        drawn_card = deck[top_card_index]
        player_value += drawn_card
        if drawn_card == 11:
            ace_count += 1
        top_card_index += 1

    # Demote aces from 11 to 1 while the hand would otherwise be bust.
    while player_value > 21 and ace_count > 0:
        player_value -= 10
        ace_count -= 1

    payout = get_terminal_payout(player_value, dealer_cards, deck, top_card_index, action, 1)
    if payout is not None:
        return payout

    node = get_node(nodes, player_value, ace_count, can_split, dealer_cards[0])
    strategy = node.get_strategy()
    node.strategy_sum += strategy

    utility = np.zeros(node.num_actions)
    node_utility = 0

    # Evaluate every available action; nodes reached from here never offer SPLIT.
    for next_action in range(node.num_actions):
        utility[next_action] = counter_factual_regret(nodes, player_value, ace_count, False, dealer_cards, deck, top_card_index, next_action)
        node_utility += strategy[next_action] * utility[next_action]

    # Regret of an action = its utility minus what the current strategy earns.
    node.regret_sum += utility - node_utility
    return node_utility

def train(iterations, log_node_probabilities=True):
    """Train the strategy on freshly shuffled hands and return the learned nodes.

    Each iteration shuffles the cards, deals the first two to the player and
    the next two to the dealer, and runs counter_factual_regret on the deal.
    Halfway through, every node's strategy_sum is zeroed.

    Args:
        iterations: Number of hands to play.
        log_node_probabilities: When True (the default), print each node's
            average strategy, ordered by information-set key.

    Returns:
        Dict mapping information-set keys to their Node objects.
    """
    nodes = {}
    cards = np.repeat(DECK, DECK_COUNT)

    utility = 0
    for i in trange(iterations):
        np.random.shuffle(cards)
        player_cards = cards[0:2]
        # Count the aces rather than testing membership: a pair of aces must
        # enter with two aces so that one is still worth 11 once the 22 is
        # reduced to a soft 12.
        ace_count = np.count_nonzero(player_cards == 11)
        can_split = cards[0] == cards[1]
        utility += counter_factual_regret(nodes, np.sum(player_cards), ace_count, can_split, cards[2:4], cards, 4, None)

        # Reset average strategy sum to forget bad early decisions
        if i == iterations // 2:
            for node in nodes.values():
                node.strategy_sum[:] = 0

    # Average game value is expected to be negative. The casino has the edge
    if iterations > 0:  # no hands played means there is no average to report
        print("Average game value:", utility / iterations)

    if log_node_probabilities:
        print("Player\tDealer\tSTAND\tHIT\tDD\tSPLIT")
        for info_set_hash in sorted(nodes):
            print(nodes[info_set_hash])
    return nodes

In [6]:
# Run the full training and keep the learned nodes for later cells.
nodes = train(iterations=300000)

100%|██████████| 300000/300000 [00:31<00:00, 9552.92it/s] 

Average game value: -0.005165964646326112
Player	Dealer	STAND	HIT	DD	SPLIT
10	10	0%	100%	0%	
10	11	0%	0%	100%	
10	2	0%	0%	100%	
10	3	0%	0%	100%	
10	4	0%	0%	100%	
10	5	0%	0%	100%	
10	6	0%	0%	100%	
10	7	0%	0%	100%	
10	8	0%	0%	100%	
10	9	0%	0%	100%	
11	10	0%	0%	100%	
11	11	0%	53%	47%	
11	2	0%	0%	100%	
11	3	0%	0%	100%	
11	4	0%	0%	100%	
11	5	0%	0%	100%	
11	6	0%	0%	100%	
11	7	0%	0%	100%	
11	8	0%	0%	100%	
11	9	0%	0%	100%	
12	10	0%	100%	0%	
12	11	0%	100%	0%	
12	2	5%	95%	0%	
12	3	0%	100%	0%	
12	4	100%	0%	0%	
12	5	100%	0%	0%	
12	6	45%	55%	0%	
12	7	0%	100%	0%	
12	8	0%	100%	0%	
12	9	0%	100%	0%	
13	10	0%	100%	0%	
13	11	0%	100%	0%	
13	2	99%	1%	0%	
13	3	100%	0%	0%	
13	4	100%	0%	0%	
13	5	100%	0%	0%	
13	6	100%	0%	0%	
13	7	0%	100%	0%	
13	8	0%	100%	0%	
13	9	0%	100%	0%	
14	10	0%	100%	0%	
14	11	0%	100%	0%	
14	2	100%	0%	0%	
14	3	100%	0%	0%	
14	4	100%	0%	0%	
14	5	100%	0%	0%	
14	6	100%	0%	0%	
14	7	0%	100%	0%	
14	8	0%	100%	0%	
14	9	0%	100%	0%	
15	10	0%	100%	0%	
15	11	0%	100%	0%	
15	2	100%	0%	0%	
15	3	100%	0%	0


