In [None]:
# !pip install pettingzoo 
# !pip install pygame 
# !pip install rlcard

Leduc Poker:
https://pettingzoo.farama.org/environments/classic/leduc_holdem/

Please see this site for rules and observation space and action space

CFR:
- strategy_sum is a data structure (typically a dictionary) that accumulates the strategies used by the player for each game state across many iterations of the algorithm.
- regret_sum in CFR accumulates the regrets with respect to the strategies chosen over the iterations
- regreats is utility - Expected utility: regrets[action] += utility(action) - expected_utility over all actions
- The strategy is similar to a policy, so cfr_player.get_average_strategy(state) will give you a probability of actions given the state.
    - For example: 
        - actual_utility = 3 # Received for taking 'call'
        - counterfactual_utilities = [4 (if raised), 3 (if called), 1 (if folded)]
        - regrets[0] = (4 - 3)  # Regret for not raising
        - regrets[1] = (3 - 3)  # Regret for not calling (no regret)
        - regrets[2] = (1 - 3)  # Regret for not folding
        - regreat[a] = 3 - (p(0)*4 + p(1)*3 + p(2)*(-2))
    - Not saying this is the best regret function, but it is one.


# Random Agents

Note that the environment provides an action mask, which we should utilize. If an agent takes an illegal move, it immediately loses the game.

In [17]:
import numpy as np
from pettingzoo.classic import leduc_holdem_v4
import random

# Create the Leduc Hold'em environment
env = leduc_holdem_v4.env()
env.reset()

def get_legal_action(action_space, action_mask): 
    legal_actions = [action for action, mask in enumerate(action_mask) if mask == 1] 
    return legal_actions

# Perform random actions in the environment
for agent in env.agent_iter():
    observation, reward, done, truncation, info = env.last()
    print(agent, observation, reward)
    if done:
        action = None
    else:
        action_mask = observation['action_mask']
        action_mask[2] = 0
        action = random.choice(get_legal_action(env.action_space(agent), action_mask))
        print(action)
    env.step(action)
    env.render()


env.close()


player_1 {'observation': array([1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.], dtype=float32), 'action_mask': array([1, 1, 1, 0], dtype=int8)} 0
1
player_0 {'observation': array([0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.], dtype=float32), 'action_mask': array([1, 1, 1, 0], dtype=int8)} 0
0
player_1 {'observation': array([1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.], dtype=float32), 'action_mask': array([0, 1, 1, 1], dtype=int8)} 0
3
player_0 {'observation': array([0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.], dtype=float32), 'action_mask': a

# Player_0 is CFR; player_1 is random

To calculate regret, we need to model the opponent's behavior. We don't have a highly skilled or adversarial agent available, so given the simplicity of Leduc Poker, it is reasonable to assume a random agent as the opponent (not ideal, but sufficient for our purposes). Naturally, both players will use the action mask.

Let's see if our player can discover the optimal strategy or reach a Nash equilibrium.

In [29]:
class CFRPlayer:
    def __init__(self):
        self.regret_sum = defaultdict(lambda: np.zeros(4))  # Regrets for four possible actions
        self.strategy_sum = defaultdict(lambda: np.zeros(4))  # Strategy for four actions

    def get_strategy(self, state, action_mask):
        # Get the cumulative regrets for the state
        regrets = self.regret_sum[state]
        
        # Apply regret matching only on legal actions
        positive_regrets = np.maximum(regrets, 0) * action_mask  # Mask out illegal actions
        normalizing_sum = np.sum(positive_regrets)
        
        # Define the strategy, normalized over legal actions
        strategy = (
            positive_regrets / normalizing_sum if normalizing_sum > 0 else action_mask / np.sum(action_mask)
        )
        
        # Track strategy for this state for averaging
        self.strategy_sum[state] += strategy
        return strategy

    def get_action(self, state, action_mask):
        strategy = self.get_strategy(state, action_mask)
        return np.random.choice(len(action_mask), p=strategy)  # Chooses based on masked strategy

    def update_regrets(self, state, action, utility, expected_utility):
        regrets = self.regret_sum[state]
        regrets[action] += utility - expected_utility

    def get_average_strategy(self, state):
        strategy_sum = self.strategy_sum[state]
        normalizing_sum = np.sum(strategy_sum)
        return (
            strategy_sum / normalizing_sum if normalizing_sum > 0 else np.ones(4) / 4
        )

# Updated CFR Simulation with Action Mask
def cfr_iteration(env, cfr_player):
    state_action_utility = {}
    env.reset()

    # Each player takes actions
    for agent in env.agent_iter():
        obs, reward, done, _, _ = env.last()  # Adjusted to unpack only three values
        if done:
            break

        action_mask = obs['action_mask']  # Get the action mask
        state = tuple(obs['observation'])  # State representation

        if agent == "player_0":  # CFR Player
            strategy = cfr_player.get_strategy(state, action_mask)
            action = np.random.choice(len(action_mask), p=strategy)
            env.step(action)
            # Store utilities and regrets after the fact
            state_action_utility[(state, action)] = reward

        else:  # player_1 (Random)
            legal_actions = [i for i, legal in enumerate(action_mask) if legal]
            action = random.choice(legal_actions)
            env.step(action)

    # Update CFR regrets
    for (state, action), utility in state_action_utility.items():
        expected_utility = sum(
            prob * state_action_utility.get((state, act), 0)
            for act, prob in enumerate(cfr_player.get_strategy(state, action_mask))
        )
        cfr_player.update_regrets(state, action, utility, expected_utility)
        
# Run CFR on Leduc Poker
num_iterations = 1000
cfr_player = CFRPlayer()
env = leduc_holdem_v4.env()
for _ in range(num_iterations):
    cfr_iteration(env, cfr_player)

# Show final strategy
for state in cfr_player.strategy_sum:
    print(f"State {state}: Average strategy {cfr_player.get_average_strategy(state)}")


State (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.29376774 0.31332407 0.34333796 0.04957022]
State (0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.29469347 0.3125     0.34375    0.04905653]
State (0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.42206727 0.07013946 0.46493027 0.042863  ]
State (0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.40445026 0.19109948 0.40445026 0.      

# Player_0 is CFR; player_1 always calls (if legal, otherwise fold)

Lets see if we change the opponent strategy, how does that effect our strategy

In [37]:
class CFRPlayer:
    def __init__(self):
        self.regret_sum = defaultdict(lambda: np.zeros(4))  # Regrets for four possible actions
        self.strategy_sum = defaultdict(lambda: np.zeros(4))  # Strategy for four actions

    def get_strategy(self, state, action_mask):
        # Get the cumulative regrets for the state
        regrets = self.regret_sum[state]
        
        # Apply regret matching only on legal actions
        positive_regrets = np.maximum(regrets, 0) * action_mask  # Mask out illegal actions
        normalizing_sum = np.sum(positive_regrets)
        
        # Define the strategy, normalized over legal actions
        strategy = (
            positive_regrets / normalizing_sum if normalizing_sum > 0 else action_mask / np.sum(action_mask)
        )
        
        # Track strategy for this state for averaging
        self.strategy_sum[state] += strategy
        return strategy

    def get_action(self, state, action_mask):
        strategy = self.get_strategy(state, action_mask)
        return np.random.choice(len(action_mask), p=strategy)  # Chooses based on masked strategy

    def update_regrets(self, state, action, utility, expected_utility):
        regrets = self.regret_sum[state]
        regrets[action] += utility - expected_utility

    def get_average_strategy(self, state):
        strategy_sum = self.strategy_sum[state]
        normalizing_sum = np.sum(strategy_sum)
        return (
            strategy_sum / normalizing_sum if normalizing_sum > 0 else np.ones(4) / 4
        )


def cfr_iteration(env, cfr_player):
    state_action_utility = {}
    env.reset()

    # Each player takes actions
    for agent in env.agent_iter():
        obs, reward, done, _, _ = env.last()
        if done:
            break

        action_mask = obs['action_mask']
        state = tuple(obs['observation'])

        if agent == "player_0":  # CFR Player
            strategy = cfr_player.get_strategy(state, action_mask)
            action = np.random.choice(len(action_mask), p=strategy)
            env.step(action)
            # Store utilities and regrets after the fact
            state_action_utility[(state, action)] = reward

        else: # player_1 (Call or Fold) 
            if action_mask[0]: # Check if action 0 (call) is legal 
                action = 0 
            else: 
                action = 2
            env.step(action)

    # Update CFR regrets
    for (state, action), utility in state_action_utility.items():
        expected_utility = sum(
            prob * state_action_utility.get((state, act), 0)
            for act, prob in enumerate(cfr_player.get_strategy(state, action_mask))
        )
        cfr_player.update_regrets(state, action, utility, expected_utility)
        
# Run CFR on Leduc Poker
num_iterations = 1000
cfr_player = CFRPlayer()
env = leduc_holdem_v4.env()
for _ in range(num_iterations):
    cfr_iteration(env, cfr_player)

# Show final strategy
for state in cfr_player.strategy_sum:
    print(f"State {state}: Average strategy {cfr_player.get_average_strategy(state)}")
  



State (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.01910828 0.33333333 0.33333333 0.31422505]
State (0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.01886792 0.33333333 0.33333333 0.31446541]
State (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.03985507 0.33333333 0.33333333 0.29347826]
State (0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0): Average strategy [0.01364522 0.33333333 0.33333333 0.319688

# Play game against random agent using above strategy

In [50]:
def play_games_with_avg_strategy(cfr_player, num_games):
    results = []

    for _ in range(num_games):
        env = leduc_holdem_v4.env()
        env.reset()
        game_rewards = {"player_0": 0, "player_1": 0}

        for agent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()
            done = termination or truncation
            action_mask = observation['action_mask']
            state = tuple(observation['observation'])

            if done:
                game_rewards[agent] += reward
                env.step(None)  # Ensure we call step with None for dead agents
            else:
                if agent == "player_0":  # CFR Player with average strategy
                    strategy = cfr_player.get_average_strategy(state)
                    legal_actions = [i for i, legal in enumerate(action_mask) if legal]
                    legal_strategy = [strategy[i] for i in legal_actions]
                    legal_strategy = legal_strategy / np.sum(legal_strategy)  # Normalize
                    action = np.random.choice(legal_actions, p=legal_strategy)
                    env.step(action)
                else:  # Random player
                    legal_actions = [i for i, legal in enumerate(action_mask) if legal]
                    action = random.choice(legal_actions)
                    env.step(action)

            env.render()

        # Record the final reward for player_0
        results.append(game_rewards["player_0"])
        env.close()  # Close the environment after each game

    average_reward = np.mean(results)
    print(f"Average reward for player_0 over {num_games} games: {average_reward}")

# Run the evaluation phase
num_games = 1000
play_games_with_avg_strategy(cfr_player, num_games)


Average reward for player_0 over 1000 games: 0.0245


It seems we win more games than we lose, which is promising considering the stochastic, partially observable, and adversarial nature of the game.

One might ask: If Counterfactual Regret Minimization (CFR) can handle stochastic, partially observable, adversarial games, why not apply it to games like Minesweeper and treat the environment as the adversary? There's no clear-cut answer since each problem has its unique challenges. 

While CFR is used in real poker with substantial resources, sometimes leveraging deep learning. But Calculating a comprehensive strategy for complex games often demands more resources than I can manage.