In [2]:
import numpy as np
import matplotlib.pyplot as plt
# Print arrays with five digits of precision and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

In [3]:
"""
Possible Kuhn Poker Games
Player 1    Player 2    Player 1    Payoff
pass        pass                    +1 to player with higher card
pass        bet         pass        +1 to player 2
pass        bet         bet         +2 to player with higher card
bet         pass                    +1 to player 1
bet         bet                     +2 to player with higher card

12 information sets:
Initial state
    King
    Queen
    Jack

1st option
    King    p1 pass
    King    p1 bet
    Queen   p1 pass
    Queen   p1 bet
    Jack    p1 pass
    Jack    p1 bet

2nd option
    King    p1 pass p2 bet
    Queen   p1 pass p2 bet
    Jack    p1 pass p2 bet
"""

# Human-readable labels; the integer codes below index into these lists.
CARDS = ["J", "Q", "K"]
ACTIONS = ["PASS", "BET"]

# Integer codes, derived from the label order so the two cannot drift apart.
# Cards are ordered by strength: J < Q < K.
J, Q, K = range(len(CARDS))
PASS, BET = range(len(ACTIONS))
NUM_ACTIONS = len(ACTIONS)

In [4]:
class Node():
    """Regret and strategy accumulators for a single information set.

    Attributes:
        card: Integer code of the card held at this information set.
        history: Sequence of action codes that led to this information set.
        num_actions: Number of actions available.
        regret_sum: Accumulated regret per action (array of length ``num_actions``).
        strategy_sum: Accumulated strategy weight per action (array of length
            ``num_actions``); normalising it yields the strategy that is
            printed and sampled from.
    """

    def __init__(self, card, history, num_actions):
        # card and history are only used for node state printing in __repr__
        self.card = card
        self.history = history

        self.num_actions = num_actions
        self.regret_sum = np.zeros(num_actions)
        self.strategy_sum = np.zeros(num_actions)

    def __repr__(self):
        """Return ``"<card> [<history>] [<action pct>, ...]"`` using the normalised strategy_sum."""
        history_labels = [ACTIONS[h] for h in self.history]
        strategy_labels = [
            ACTIONS[i] + " " + format(strat*100, '.0f') + "%"
            for i, strat in enumerate(self.normalize(self.strategy_sum))
        ]
        return CARDS[self.card] + " " + str(history_labels) + " " + str(strategy_labels)

    def normalize(self, value):
        """Scale ``value`` so it sums to 1.

        Falls back to the uniform distribution over ``num_actions`` when the
        sum is not strictly positive (e.g. before anything was accumulated).
        """
        normalizing_sum = np.sum(value)
        if normalizing_sum > 0:
            return value / normalizing_sum
        return np.ones(self.num_actions) / self.num_actions

    def get_strategy(self):
        """Return the regret-matching strategy: positive regrets, normalised."""
        return self.normalize(np.maximum(self.regret_sum, 0))

    def get_action(self):
        """Sample an action index from the normalised ``strategy_sum``.

        Draws one uniform number from NumPy's global random state and inverts
        the cumulative distribution.

        Returns:
            int: an action index in ``range(num_actions)``.
        """
        strategy = self.normalize(self.strategy_sum)
        cumulative = np.cumsum(strategy)
        # side="right" maps a draw u to the first index whose cumulative
        # probability is strictly greater than u, so an action with zero
        # probability is never chosen (with the default side="left" a draw of
        # exactly 0.0 would select action 0 even if its probability is 0).
        index = int(np.searchsorted(cumulative, np.random.random(), side="right"))
        # Floating-point round-off can leave cumulative[-1] slightly below 1.0;
        # clamp so the result is always a valid action index.
        return min(index, self.num_actions - 1)

In [5]:
def get_terminal_payout(history, player_card, opponent_card):
    """Return the payout of a finished game, or None if play continues.

    Args:
        history: Sequence of action codes (PASS / BET) played so far.
        player_card: Card code of the player the payout is computed for.
        opponent_card: Card code of the other player.

    Returns:
        None when ``history`` is not terminal; otherwise
        * +1 / -1 after two opening passes (higher card wins),
        * +1 when the last action is a pass that follows a bet,
        * +2 / -2 after two consecutive bets (higher card wins).
    """
    # No game can end before both players have acted once.
    if len(history) < 2:
        return None

    if player_card > opponent_card:
        showdown_sign = 1
    else:
        showdown_sign = -1

    if history[-1] == PASS:
        both_opened_with_pass = history[0] == PASS and history[1] == PASS
        if both_opened_with_pass:
            return showdown_sign
        # The final pass answered a bet.
        return 1

    if history[-1] == BET and history[-2] == BET:
        return 2 * showdown_sign

    # A bet that has not been answered yet: the game goes on.
    return None

def get_node(nodes, history, player_card) -> Node:
    """Fetch the Node for an information set, creating and caching it on first use.

    Args:
        nodes: Dict mapping an integer information-set key to its Node;
            a new entry is inserted when the key is missing.
        history: Sequence of action codes played so far.
        player_card: Card code of the player about to act.

    Returns:
        The Node stored in ``nodes`` for this (card, history) pair.
    """
    # Ad hoc integer key: the card code plus a position-weighted encoding of
    # the actions. For the decision histories of this game ([], [PASS], [BET],
    # [PASS, BET]) and card codes 0..2 the keys are distinct
    # (0-2, 4-6, 8-10, 22-24).
    # NOTE(review): uniqueness is not guaranteed for longer histories or more cards.
    info_set_hash = int(player_card + sum((info+1) * (i+2)**NUM_ACTIONS for i, info in enumerate(history)))

    # Build a Node only on a cache miss; passing Node(...) as the default of
    # dict.get() would construct (and discard) one on every lookup.
    node = nodes.get(info_set_hash)
    if node is None:
        node = Node(player_card, history, NUM_ACTIONS)
        nodes[info_set_hash] = node
    return node

def counter_factual_regret(nodes, cards, history, realization_weight_p0, realization_weight_p1):
    """Run one recursive counterfactual-regret pass from ``history`` downwards.

    Updates ``strategy_sum`` and ``regret_sum`` of every information-set node
    visited on the way.

    Args:
        nodes: Dict of information-set nodes, filled in through ``get_node``.
        cards: Indexable pair of card codes; ``cards[0]`` belongs to player 0
            and ``cards[1]`` to player 1.
        history: List of action codes played so far.
        realization_weight_p0: Probability that player 0's own choices lead here.
        realization_weight_p1: Probability that player 1's own choices lead here.

    Returns:
        The payout (terminal history) or expected utility (otherwise), from
        the point of view of the player whose turn it is at ``history``.
    """
    # Players alternate, so the parity of the history length tells who acts.
    acting = len(history) % 2
    other = 1 - acting

    terminal_value = get_terminal_payout(history, cards[acting], cards[other])
    if terminal_value is not None:
        return terminal_value

    if acting == 0:
        own_reach, opponent_reach = realization_weight_p0, realization_weight_p1
    else:
        own_reach, opponent_reach = realization_weight_p1, realization_weight_p0

    info_set = get_node(nodes, history, cards[acting])
    current_strategy = info_set.get_strategy()
    # The strategy sum is weighted by the acting player's own reach probability.
    info_set.strategy_sum += current_strategy * own_reach

    action_values = np.zeros(NUM_ACTIONS)
    expected_value = 0

    for choice in range(NUM_ACTIONS):
        # Only the acting player's reach probability is scaled by the choice.
        if acting == 0:
            child_p0 = current_strategy[choice] * realization_weight_p0
            child_p1 = realization_weight_p1
        else:
            child_p0 = realization_weight_p0
            child_p1 = current_strategy[choice] * realization_weight_p1

        # The child value is from the next player's perspective, hence the sign flip.
        child_value = counter_factual_regret(nodes, cards, history + [choice], child_p0, child_p1)
        action_values[choice] = -child_value
        expected_value += current_strategy[choice] * action_values[choice]

    # Regrets are weighted by the opponent's reach probability.
    info_set.regret_sum += (action_values - expected_value) * opponent_reach
    return expected_value

def train(iterations, seed=None):
    """Train on randomly dealt hands and print the resulting strategies.

    Each iteration shuffles the three cards, deals the first two to the
    players and runs one ``counter_factual_regret`` pass from the empty
    history. Afterwards the average root value and every information-set
    node (sorted by card, then history) are printed.

    Args:
        iterations: Number of training iterations; must be at least 1.
        seed: Optional seed for NumPy's global random state. When given, the
            run is reproducible; when None the global state is left untouched.

    Raises:
        ValueError: if ``iterations`` is smaller than 1.
    """
    # Validate first: with zero iterations the average below would divide by zero.
    if iterations < 1:
        raise ValueError("iterations must be at least 1, got " + repr(iterations))
    if seed is not None:
        np.random.seed(seed)

    cards = np.array([J, Q, K])
    nodes = {}

    utility = 0
    for _ in range(iterations):
        np.random.shuffle(cards)
        # Deal the first two cards of the shuffled deck; both reach weights start at 1.
        utility += counter_factual_regret(nodes, cards[:2], [], 1, 1)

    # Theoretical game optimal strategy
    # https://en.wikipedia.org/wiki/Kuhn_poker#Optimal_strategy
    print("Average game value:", utility / iterations)
    print("Theoretical nash equilibrium average game value:", -1/18)
    for node in sorted(nodes.values(), key=lambda n: (n.card, n.history)):
        print(node)

In [6]:
# Run 10,000 training iterations; train() prints the average game value and
# the strategy of every information-set node it visited.
train(10000)

Average game value: -0.05824832134121574
Theoretical nash equilibrium average game value: -0.05555555555555555
J [] ['PASS 76%', 'BET 24%']
J ['PASS'] ['PASS 65%', 'BET 35%']
J ['PASS', 'BET'] ['PASS 100%', 'BET 0%']
J ['BET'] ['PASS 100%', 'BET 0%']
Q [] ['PASS 99%', 'BET 1%']
Q ['PASS'] ['PASS 100%', 'BET 0%']
Q ['PASS', 'BET'] ['PASS 38%', 'BET 62%']
Q ['BET'] ['PASS 67%', 'BET 33%']
K [] ['PASS 19%', 'BET 81%']
K ['PASS'] ['PASS 0%', 'BET 100%']
K ['PASS', 'BET'] ['PASS 0%', 'BET 100%']
K ['BET'] ['PASS 0%', 'BET 100%']
