In [4]:
%pip install treys
%pip install pokerkit

Note: you may need to restart the kernel to use updated packages.
Collecting pokerkit
  Downloading pokerkit-0.5.4-py3-none-any.whl.metadata (17 kB)
Downloading pokerkit-0.5.4-py3-none-any.whl (99 kB)
Installing collected packages: pokerkit
Successfully installed pokerkit-0.5.4
Note: you may need to restart the kernel to use updated packages.


In [25]:
from pokerkit import Automation, NoLimitTexasHoldem, State, calculate_hand_strength, parse_range, Card, Deck, StandardHighHand
from typing import List
from tqdm import tqdm
from enum import Enum
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

#### Agent Definition

In [187]:
class Action(Enum):
    """The three moves an agent can request from the table.

    The integer values match the column order of the policy network's
    action head (fold, call, bet).
    """

    FOLD = 0    # give up the hand
    CALL = 1    # check or call, whichever the table state allows
    BET = 2     # complete, bet, or raise to a chosen amount

In [310]:
class Agent:
    """Base class for a player seated at the poker table.

    Holds the player's name and chip stack and provides the helpers that
    turn a pokerkit ``State`` into the feature vector consumed by a policy.
    Subclasses must override ``pi_action_generator``.
    """

    def __init__(self, name: str = None, starting_stack: int = 0):
        """Store the display name and the stack the agent starts every game with."""
        self.name = name
        self.starting_stack = starting_stack    # value restored by reset()
        self.stack = starting_stack    # current stack, updated by the game loop

    def reset(self):
        """Restore the stack to its starting value.

        Returns:
            self, so the call can be used inside a comprehension.
        """
        self.stack = self.starting_stack
        return self

    def pi_action_generator(self, state: State) -> dict:
        """Choose an action for the current table state.

        Subclasses return a dict with at least the keys ``'table_action'``
        (an ``Action``) and ``'bet_size'``.

        Raises:
            NotImplementedError: always, on the base class. Returning None here
                would only fail later, when the environment indexes the action.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement pi_action_generator()'
        )

    def get_player_state(self, state: State, player_index: int, default_min_bet: int = 4) -> List:
        """Build the 5-element feature vector for one player.

        Takes:
            state -- the pokerkit state of the round in progress
            player_index -- seat index of the player within ``state``
            default_min_bet -- value used for the minimum bet when the state
                reports none (defaults to 4, the previously hard-coded value)
        Returns:
            [win_percent, stack, pot, min_bet, max_bet]
        """
        # Estimated chance of winning for this player's hole cards
        win_percent = self._calculate_strength(state, player_index)

        stack = state.stacks[player_index]
        pot = state.total_pot_amount

        min_bet = state.min_completion_betting_or_raising_to_amount
        max_bet = state.max_completion_betting_or_raising_to_amount

        # The state reports None for these amounts in some situations
        # (presumably when no bet/raise is possible -- confirm against pokerkit);
        # substitute finite values so the feature vector stays numeric.
        if min_bet is None:
            min_bet = default_min_bet
        if max_bet is None:
            max_bet = stack

        return [win_percent, stack, pot, min_bet, max_bet]

    def _get_valid_actions(self, state: State):
        """Yield, in FOLD, CALL, BET order, each Action the state currently allows."""
        if state.can_fold():
            yield Action.FOLD
        if state.can_check_or_call():
            yield Action.CALL
        if state.can_complete_bet_or_raise_to():
            yield Action.BET

    def _calculate_strength(self, state: State, player_index: int, samples: int = 500) -> float:
        """Estimate the player's hand strength with pokerkit.

        Takes:
            state -- the pokerkit state of the round in progress
            player_index -- seat index whose hole cards are evaluated
            samples -- forwarded to pokerkit as ``sample_count``
        Returns:
            whatever ``calculate_hand_strength`` returns (used as a win fraction)
        """
        # Cards are serialised as rank+suit text and re-parsed by pokerkit.
        hole_text = ''.join([str(c.rank + c.suit) for c in state.hole_cards[player_index]])
        # Each board entry is indexed with [0] -- presumably one list per board
        # slot with a single card for single-board games; confirm against pokerkit.
        board_text = ''.join([str(c[0].rank + c[0].suit) for c in state.board_cards])
        return calculate_hand_strength(
            state.player_count,
            parse_range(hole_text),
            Card.parse(board_text),
            2,    # hole cards dealt per player
            5,    # board cards dealt in total
            Deck.STANDARD,
            (StandardHighHand,),
            sample_count=samples
        )

In [311]:
class ExampleRandomAgent(Agent):
    """Agent that picks uniformly among the currently legal actions.

    When it bets, the amount is drawn uniformly between the minimum and
    maximum amounts reported by the state and rounded to an integer.
    """

    def __init__(self, name: str = None, stack: int = 1000):
        super().__init__(name, stack)

    def pi_action_generator(self, state: State) -> dict:
        """Return a random legal action as ``{'table_action', 'bet_size'}``."""
        legal_moves = list(self._get_valid_actions(state))

        lowest_raise = state.min_completion_betting_or_raising_to_amount
        highest_raise = state.max_completion_betting_or_raising_to_amount
        picked = np.random.choice(legal_moves)

        # Only a bet carries an amount; fold and call report a size of 0.
        if picked is Action.BET:
            wager = round(np.random.uniform(lowest_raise, highest_raise))
        else:
            wager = 0

        return {'table_action': picked, 'bet_size': wager}

In [312]:
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
class PolicyNetwork(nn.Module):
    """One-hidden-layer policy network with two output heads.

    * ``action_output`` -- 3 logits, turned into probabilities for
      fold / call / bet by a softmax.
    * ``bet_output`` -- 1 logit, squashed into (0, 1) and used as the
      bet-size ratio between the minimum and maximum bet.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_layer = nn.Linear(input_size, hidden_size)    # shared hidden layer
        # Start the hidden layer close to zero so the initial policy is near uniform.
        nn.init.normal_(self.hidden_layer.weight, mean=0, std=0.01)
        nn.init.normal_(self.hidden_layer.bias, mean=0, std=0.01)
        self.action_output = nn.Linear(hidden_size, 3)    # logits for fold / call / bet
        self.bet_output = nn.Linear(hidden_size, 1)    # logit for the bet-size ratio

    def forward(self, s):
        '''Run the forward pass.
            Takes:
                s -- the state representation; the last dimension holds the features
            Returns:
                a -- action probabilities (softmax over the last dimension)
                b -- bet-size ratio in [0, 1]
        '''
        hidden = torch.relu(self.hidden_layer(s))    # pass through the hidden layer

        # dim=-1 equals the former dim=1 for (batch, features) input and also
        # accepts unbatched vectors.
        a = torch.softmax(self.action_output(hidden), dim=-1)

        # torch.sigmoid is the overflow-safe form of exp(x) / (1 + exp(x)); the
        # hand-written version yields nan once exp(x) overflows to inf.
        b = torch.sigmoid(self.bet_output(hidden))
        return a, b

In [352]:
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
class PolicyAgent(Agent):
    """Policy-gradient agent backed by a ``PolicyNetwork``.

    The network maps the 5-feature player state to fold/call/bet
    probabilities plus a bet-size ratio. Actions are sampled from that
    policy and the network is updated from batches of recorded episodes.
    """

    def __init__(self, name, stack, config):
        """
            Takes:
                name -- display name of the agent
                stack -- starting stack
                config -- dict read for 'hidden_layer_size', 'learning_rate',
                          'action_var' and 'causal_return'
        """
        super().__init__(name, stack)    #init the parent class
        self.config = config
        self.mu_s2 = PolicyNetwork(5, self.config['hidden_layer_size'])    #init the policy model
        self.optimizer = optim.Adam(self.mu_s2.parameters(), lr=self.config['learning_rate'])    #init the optimizer

    @staticmethod
    def _as_batch(features):
        '''Turn one feature vector into a float32 tensor with a leading batch dimension of 1.'''
        return torch.tensor(np.asarray(features, dtype=np.float32)).unsqueeze(0)

    def _bet_ratio_distribution(self, bet_ratio):
        '''Uniform distribution of half-width config['action_var'] around bet_ratio, clipped to [0, 1].'''
        low = torch.max(bet_ratio - self.config['action_var'], torch.tensor([0.0]))
        high = torch.min(bet_ratio + self.config['action_var'], torch.tensor([1.0]))
        return torch.distributions.Uniform(low, high)

    def action_probs(self, a, s):
        '''Compute the log probability of a recorded action. This is used for gradient updates.
            Takes:
                a -- pair [Action, bet_ratio]; bet_ratio is only read for Action.BET
                s -- the 5-element state feature vector
            Returns:
                scalar torch tensor holding log pi(a | s)
        '''
        actions, bet_ratio = self.mu_s2(self._as_batch(s))    #action probabilities and bet ratio

        if a[0] == Action.FOLD:
            return torch.log(actions[0][0])
        if a[0] == Action.CALL:
            return torch.log(actions[0][1])

        # A bet is the joint event "choose BET" and "draw this bet ratio".
        action_log_p = torch.log(actions[0][2])
        U = self._bet_ratio_distribution(bet_ratio[0][0])
        bet_log_p = U.log_prob(torch.as_tensor(a[1], dtype=torch.float32))
        return (action_log_p + bet_log_p)[0]

    def pi_action_generator(self, s: State):
        '''Sample an action for the player who is to act. This is used to generate the data.
            Takes:
                s -- the pokerkit state of the round in progress
            Returns:
                dict with 'table_action' (Action), 'bet_size' (int) and
                'bet_ratio' (float: the sampled ratio for a bet, otherwise the
                network's raw ratio)
        '''
        state = self.get_player_state(s, s.actor_index)    #get the state

        # Sampling needs no gradients; they are recomputed in action_probs.
        with torch.no_grad():
            actions, bet_ratio = self.mu_s2(self._as_batch(state))    #generate policy parameters
        actions, bet_ratio = torch.squeeze(actions), torch.squeeze(bet_ratio)    #reshape

        fold, call, bet = self._calculate_valid_action_values(s, actions)
        bet_size = 0    #init the bet size
        sampled_ratio = float(bet_ratio)    #always a plain float so callers never hold a tensor
        action_chance = np.random.uniform()    #generate a random number to decide what to do

        if action_chance < fold:
            a = Action.FOLD
        elif action_chance < fold + call:
            a = Action.CALL
        elif bet > 0:
            a = Action.BET
            min_bet_size, max_bet_size = state[-2], state[-1]    #pull out the min and max bet sizes

            U = self._bet_ratio_distribution(bet_ratio)
            sampled_ratio = float(U.sample())

            bet_size = int((sampled_ratio * (max_bet_size - min_bet_size)) + min_bet_size)    #compute the bet size
        else:
            # Rounding left the draw above fold + call although betting is not
            # legal; fall back to a legal move instead of an illegal bet.
            a = Action.CALL if call > 0 else Action.FOLD

        return {
            'table_action': a,
            'bet_size': bet_size,
            'bet_ratio': sampled_ratio
        }

    def objective(self, log_probs, episode_return, b):
        '''Compute the (negated) policy-gradient objective for one episode.
            Takes:
                log_probs -- tensor of per-step log probabilities
                episode_return -- per-step causal return, or the total return
                b -- float, the baseline subtracted from the return
        '''
        return -torch.sum(log_probs * (episode_return - b))

    def update_pi(self, batch):
        '''Run one gradient step of the policy network on a batch of episodes.
            Takes:
                batch -- list of dicts with keys 'states', 'actions',
                         'causal_return', 'total_return' and 'baseline'.
                         The final entry of 'states' is dropped before pairing
                         states with actions.
        '''
        objective = []    #init the objectives per episode
        for episode in batch:    #loop over episodes
            log_probs = [
                self.action_probs(a, s)
                for s, a in zip(episode['states'][:-1], episode['actions'])
            ]
            if not log_probs:
                # The agent never acted in this episode, so there is nothing to
                # differentiate (torch.stack would also reject an empty list).
                continue
            log_probs = torch.stack(log_probs)    #reshape to compute gradient over the whole episode

            if self.config['causal_return']:    #if we use causal returns...
                episode_return = episode['causal_return']
            else:    #if not...
                episode_return = episode['total_return']    #use the total discounted reward

            objective.append(self.objective(log_probs, episode_return, episode['baseline']))

        if not objective:    #no episode contained an action: skip the update
            return

        objective = torch.mean(torch.stack(objective))    #average over episodes

        #run the backward pass to compute gradients
        self.optimizer.zero_grad()    #zero gradients from the previous step
        objective.backward()    #compute gradients
        self.optimizer.step()    #update policy network parameters

    def _calculate_valid_action_values(self, state: State, actions: torch.Tensor):
        '''Restrict the policy's action probabilities to the legal moves.
            Takes:
                state -- the pokerkit state, used to find the legal actions
                actions -- tensor of 3 probabilities ordered fold, call, bet
            Returns:
                (fold, call, bet) floats that are 0 for illegal moves and sum
                to 1 over the legal ones
        '''
        valid_actions = set(self._get_valid_actions(state))
        probabilities = actions.detach().numpy().astype(np.float64)
        legal = np.array([
            Action.FOLD in valid_actions,
            Action.CALL in valid_actions,
            Action.BET in valid_actions,
        ])

        # Mask and renormalise so actions are sampled in proportion to the
        # network's own probabilities. The earlier max-scale plus second softmax
        # flattened them, so sampling did not follow the policy being trained.
        masked = np.where(legal, probabilities, 0.0)
        total = masked.sum()
        if total > 0:
            masked = masked / total
        elif legal.any():
            masked = legal / legal.sum()    #all legal mass underflowed: fall back to uniform

        fold, call, bet = (float(p) for p in masked)
        return fold, call, bet


#### Environment

In [353]:
class TexasHoldemEnvironment():
    """Thin wrapper around a pokerkit No-Limit Texas Hold'em game.

    ``reset_game`` builds the game definition, ``reset_round`` deals a new
    round for a list of agents, and ``step`` applies a single player action.
    """

    def __init__(self, config):
        self.config = config
        self.reset_game()

    def reset_game(self):
        """Rebuild the game definition and reset the tracked stacks."""
        # Everything except the players' decisions is automated by pokerkit.
        enabled_automations = (
            Automation.ANTE_POSTING,
            Automation.BET_COLLECTION,
            Automation.BLIND_OR_STRADDLE_POSTING,
            Automation.CARD_BURNING,
            Automation.HOLE_DEALING,
            Automation.BOARD_DEALING,
            Automation.HOLE_CARDS_SHOWING_OR_MUCKING,
            Automation.HAND_KILLING,
            Automation.CHIPS_PUSHING,
            Automation.CHIPS_PULLING
        )
        self.game = NoLimitTexasHoldem(
            automations=enabled_automations,
            ante_trimming_status=True,
            raw_antes=0,
            raw_blinds_or_straddles=self.config['blinds'],
            min_bet=self.config['min_bet'],
        )
        seat_count = self.config['player_count']
        # One starting stack per seat.
        self.stacks = np.array([self.config['starting_stack']] * seat_count)
        self.poker_round = None    # no round is dealt until reset_round()

    def reset_round(self, agents: List[Agent]):
        """Deal a fresh round in which each agent starts with its current stack."""
        agent_stacks = [agent.stack for agent in agents]
        self.poker_round = self.game(
            raw_starting_stacks=agent_stacks,
            player_count=len(agent_stacks),
        )

    def step(self, action):
        """Apply one action to the round in progress.

        Takes:
            action -- dict with 'table_action' (an Action) and 'bet_size'
        Returns:
            dict with 'state' (the round), 'reward' (config['ongoing_reward'])
            and 'done' (True once the round's status is falsy)
        """
        move, amount = action['table_action'], action['bet_size']
        table = self.poker_round

        # Forward the move to pokerkit; anything else leaves the round untouched.
        if move == Action.FOLD:
            table.fold()
        elif move == Action.CALL:
            table.check_or_call()
        elif move == Action.BET:
            table.complete_bet_or_raise_to(amount)

        finished = not table.status

        # Snapshot the final stacks once the round has ended.
        if finished:
            self.stacks = table.stacks

        return {'state': table, 'reward': self.config['ongoing_reward'], 'done': finished}

#### Running the Environment

In [354]:
# Configuration for the learning agent.
# NOTE(review): the bet ratio is squashed into (0, 1) by the network and the
# sampling window is clipped to [0, 1], so action_var=1 makes the window always
# [0, 1] regardless of the network's bet output.
agent_config = dict(
    gamma=0.1,                # discount factor applied to the reward history
    action_var=1,             # half-width of the uniform window around the bet ratio
    hidden_layer_size=16,     # neurons in the policy network's hidden layer
    learning_rate=0.001,      # Adam learning rate
    B=10,                     # number of batches (gradient updates)
    N=4,                      # games per batch
    causal_return=True,       # use per-step causal returns instead of the total return
    baseline=True,            # subtract the batch-average return as a baseline
)

# Configuration for the environment.
env_config = dict(
    player_count=4,
    blinds=(2, 4),
    min_bet=4,
    starting_stack=200,
    ongoing_reward=0.5,       # reward reported by env.step for every action
)

# The tracked (learning) player always comes first in the player list.
tracked_agent = PolicyAgent(name='Policy Agent', stack=env_config['starting_stack'], config=agent_config)
players: List[Agent] = [tracked_agent] + [
    ExampleRandomAgent(name=f'Random Agent {seat}', stack=env_config['starting_stack'])
    for seat in range(1, 4)
]

# Pick the first dealer at random.
player_offset = np.random.randint(0, len(players))

# Build the environment from its configuration.
env = TexasHoldemEnvironment(env_config)

In [362]:
# Run the training loop: B batches of N full games, one policy update per batch.
# For every decision of the tracked agent we record the state it acted on, the
# action it took and the reward it received, so states[i], actions[i] and
# rewards[i] always describe the same decision.
tracked_agent_stack_sizes = []    #per game: the tracked agent's stack after every round

for batch_index in tqdm(range(agent_config['B']), desc=f'Poker Batches of {agent_config["N"]} Games'):    #loop over batches
    baseline = 0    #init the baseline
    batch = []    #init the batch
    while len(batch) < agent_config['N']:    #collect N games that contain at least one tracked decision
        active_players = [agent.reset() for agent in players]
        game_stack_sizes = [env_config['starting_stack']]
        env.reset_game()

        states = []    #init the state history (one entry per tracked decision)
        actions = []    #init the action history
        rewards = []    #init the reward history

        # Play rounds until there is only one player left or the tracked player is eliminated
        while len(active_players) > 1 and tracked_agent in active_players:
            round_done = False   #Set the stopping condition

            # Calculate amount to offset players to get a new dealer
            player_offset = (player_offset + 1) % len(active_players)
            # Rotate players based on offset
            active_players = active_players[player_offset:] + active_players[:player_offset]

            # Reset the environment for a round of poker
            env.reset_round(active_players)

            # Play the round until the round is over
            while not round_done:
                # Get the current acting player and their index
                current_player_index = env.poker_round.actor_index
                current_player: Agent = active_players[current_player_index]
                is_tracked = current_player is tracked_agent

                if is_tracked:
                    # Record the state *before* acting, so it pairs with this action.
                    # NOTE(review): the agent recomputes its own features inside
                    # pi_action_generator; if the hand-strength estimate is sampled,
                    # the two win-percent values can differ slightly.
                    states.append(tracked_agent.get_player_state(env.poker_round, current_player_index))

                # Get the action from the current acting player
                action = current_player.pi_action_generator(env.poker_round)
                # Step the environment with the player's action
                update = env.step(action)

                # Update the stopping condition
                round_done = update['done']

                # If the current player is the tracked player, record the data
                if is_tracked:
                    actions.append([action['table_action'], action['bet_ratio']])
                    rewards.append(update['reward'] if action['table_action'] != Action.FOLD else 0)

            # Update stacks for each player
            for agent, stack in zip(active_players, env.stacks):
                agent.stack = stack

            # The most recent tracked decision receives the resulting stack as its reward.
            # If the agent has not acted yet there is no decision to credit, and adding
            # a reward anyway would put rewards and actions out of step.
            if rewards:
                rewards[-1] = tracked_agent.stack

            game_stack_sizes.append(tracked_agent.stack)    #record the stack size at the end of the round

            # Update active players in the game by removing players with stacks not above the minimum bet
            active_players = [agent for agent in players if agent.stack > env_config['min_bet']]

        tracked_agent_stack_sizes.append(game_stack_sizes)

        if not actions:
            # The tracked agent never had to act: nothing to learn from, play another game.
            continue

        # update_pi drops the final state row, so append a terminal placeholder.
        states.append(np.zeros(5))
        states = np.array(states).astype(np.float32)    #convert states to correct datatype for torch operations

        discounts = agent_config['gamma'] ** np.arange(len(rewards))
        discounted_rewards = np.asarray(rewards, dtype=np.float64) * discounts    #discount the reward history
        causal_return = np.cumsum(discounted_rewards[::-1])[::-1]    #compute the causal return
        causal_return = torch.tensor(causal_return.copy(), dtype=torch.float32)    #copy: torch rejects reversed (negative-stride) views
        total_return = discounted_rewards.sum()
        if agent_config['baseline']:    #if we'd like the agent to use baselining...
            baseline += total_return    #update the baseline with info from this episode

        batch.append({
            'states': states,
            'actions': actions,
            'rewards': rewards,
            'total_return': total_return,
            'causal_return': causal_return
        })    #add data from this episode to the batch

    for episode in batch:    #once the batch is made loop over episodes
        episode['baseline'] = baseline / agent_config['N']    #add the baseline to each one
    tracked_agent.update_pi(batch)    #run the gradient update

Poker Batches of 4 Games: 100%|██████████| 10/10 [05:00<00:00, 30.02s/it]


In [363]:
# One list per game played: the starting stack followed by the tracked agent's
# stack after each round of that game.
tracked_agent_stack_sizes

[[200, 418, 220, 220, 24, 0],
 [200, 198, 25, 37, 5, 5, 1],
 [200, 0],
 [200, 198, 0],
 [200, 7, 0],
 [200,
  530,
  427,
  226,
  118,
  236,
  144,
  140,
  63,
  112,
  116,
  118,
  212,
  187,
  191,
  94,
  92,
  94,
  75,
  71,
  37,
  39,
  37,
  33,
  25,
  21,
  19,
  15,
  2],
 [200, 180, 178, 176, 35, 33, 35, 33, 0],
 [200, 49, 49, 55, 23, 51, 49, 33, 35, 33, 35, 39, 35, 39, 0],
 [200, 0],
 [200, 740, 680, 684, 653, 506, 800],
 [200, 497, 420, 40, 42, 38, 40, 38, 42, 7, 7, 11, 7, 11, 7, 5, 0],
 [200, 200, 200, 196, 381, 389, 387, 391, 453, 455, 453, 135, 97, 194, 16, 0],
 [200, 95, 192, 322, 337, 337, 0],
 [200, 0],
 [200, 198, 194, 190, 188, 212, 410, 412, 416, 418, 41, 82, 86, 172, 176, 0],
 [200, 383, 602, 800],
 [200, 9, 1],
 [200, 19, 43, 41, 88, 84, 85, 20, 8, 4],
 [200, 196, 200, 604, 796],
 [200, 431, 427, 763, 767, 798],
 [200, 20, 0],
 [200, 2],
 [200, 198, 31, 14, 10, 1],
 [200, 604, 608, 610, 614, 616, 432, 434, 68, 0],
 [200, 0],
 [200, 270, 3],
 [200, 391, 551

### Old Code for running with Random Agents, useful for baselining stats

In [None]:
# Baseline run in which every seat is an ExampleRandomAgent.
# NOTE: this cell rebinds env_config, players, player_offset and env; re-run the
# configuration cell before training the policy agent again.
env_config = {
    'player_count': 4,
    'blinds': (2, 4),
    'min_bet': 4,
    'starting_stack': 100,
    'ongoing_reward': 0    # read by TexasHoldemEnvironment.step; the reward is not used here
}
# Define the players in the game with Agent objects
players: List[Agent] = [ExampleRandomAgent(f'Player {seat}', env_config['starting_stack']) for seat in range(env_config['player_count'])]
# Randomly select a player to be the dealer
player_offset = np.random.randint(0, len(players))
# Create the environment with the configuration
env = TexasHoldemEnvironment(env_config)
# Number of games (episodes) to play
num_games = 1000

for episode in tqdm(range(num_games)):
    # Reset all players and environment
    active_players = [agent.reset() for agent in players]
    env.reset_game()
    num_rounds = 0   # Number of rounds played in the game

    # Play rounds until there is only one player left
    while len(active_players) > 1:
        done = False
        # Calculate amount to offset players to get a new dealer
        player_offset = (player_offset + 1) % len(active_players)
        # Rotate players based on offset
        active_players = active_players[player_offset:] + active_players[:player_offset]
        # Reset the environment for a round of poker
        env.reset_round(active_players)

        # Play the round until the round is over
        while not done:
            # Get the current acting player and their action
            current_player = active_players[env.poker_round.actor_index]
            # Agents expose pi_action_generator; there is no get_action method.
            action = current_player.pi_action_generator(env.poker_round)
            # Step the environment with the player's action
            current_state = env.step(action)
            # Update the stopping condition
            done = current_state['done']

        # Update stacks for each player
        for agent, stack in zip(active_players, env.stacks):
            agent.stack = stack

        # Keep only the players whose stack is still above the minimum bet
        active_players = [agent for agent in players if agent.stack > env_config['min_bet']]
        num_rounds += 1

100%|██████████| 1000/1000 [00:29<00:00, 33.87it/s]
