# Betting Game

## Rules of the game:
1. Everyone puts 1 euro in the pot. 
2. Each player is dealt a hand: a number drawn uniformly at random from [0, 1). The active player may either bet (at least 1 euro) on holding the highest number, or fold. Whether or not he bets, play continues to the left.
3. If there was a bet from any previous player, the active player has the choice of either calling or folding. Play continues to the left.
4. Play stops once everyone has acted once. The hands are revealed, and of the remaining players (who have not folded), he with the highest number wins the pot. Side pots are calculated appropriately.

## Capabilities of the agents
- **Randomized betting:** The agents are able to mask their hand and bluff. Bluffing frequencies are learned.
- **Bets and calls reflect player position:** Bets made in early position are considered differently than those made in late position.
- **Continuous action space:** The agents are able to maximize expected winnings without discretizing their possible actions.
- **Agents exhibit reasonable behaviour:** Upon observation, the algorithm works!

## Limitations
- Player histories are not remembered. Players do not learn how to exploit particularly bad play from other agents.
- Although infrequent, some unreasonable calls are observed... 

In [123]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import models, transforms # no datasets needed
from torch import nn
import torch.nn.functional as F

In [124]:
ANTI = 1  # ante: every player pays this into the pot at the beginning of the hand.

class Player():
    """One seat at the table: a stack, a hand and the two decision networks.

    ``lead_model`` maps a lead state to a raw bet; ``follow_model`` maps a
    follow state to the expected winnings of calling. Each new instance gets
    a unique, increasing ``id`` from the class-wide counter ``num_players``.
    """
    num_players = 0

    def __init__(self, stack, follow_model, lead_model, verbose=False):
        self.stack = stack
        self.follow_model = follow_model
        self.lead_model = lead_model
        self.verbose = verbose
        Player.num_players += 1
        self.id = Player.num_players
        self._new_hand()

    def _new_hand(self):
        """Draw a fresh hand, uniform on [0, 1), and clear the folded flag."""
        self.hand = torch.rand(1)
        self.folded = False

    def anti(self):
        """Start a new hand and pay the ante (or the whole stack if smaller).

        Returns:
            The amount actually paid into the pot.
        """
        self._new_hand()
        # np.min keeps the stack a NumPy scalar from here on; other code
        # in the file calls ``.item()`` on it.
        anti = np.min((ANTI, self.stack))
        self.stack -= anti
        return anti

    def lead(self, state, play_randomly=False):
        """Decide whether to open the betting, and for how much.

        Args:
            state: the lead state (a flat sequence of numbers).
            play_randomly: if true, ignore the network and draw a bet
                uniformly from [0, stack).

        Returns:
            The bet, already deducted from the stack, or 0 when the chosen
            bet does not exceed ``ANTI`` (the caller treats that as a fold).
        """
        if play_randomly:
            # The network output would be discarded, so do not evaluate it.
            bet = np.random.rand()*self.stack
        else:
            state = torch.Tensor(state).unsqueeze(0)
            # Pure inference: no autograd graph is needed while playing.
            with torch.no_grad():
                raw_bet = self.lead_model(state)
                # Difference of softplus terms: a smooth clamp of the raw
                # output into the interval (0, stack).
                bet = F.softplus(raw_bet) - F.softplus(raw_bet - torch.Tensor([self.stack]))
            bet = bet.item()
        if bet > ANTI:
            bet = np.min((self.stack, bet))
            if self.verbose:
                print("Player {} bet {}".format(self.id, bet))
            self.stack -= bet
            return bet
        else:
            return 0

    def follow(self, state, play_randomly=False):
        """Decide whether to call an existing bet.

        Args:
            state: the follow state; its last entry is the bet to call.
            play_randomly: if true, call or fold on a fair coin flip
                instead of consulting the network.

        Returns:
            The amount called (the bet, capped by the stack and already
            deducted from it), or 0 when the player declines.
        """
        bet = state[-1]
        if play_randomly:
            expected_if_follow = np.random.rand()-0.5
        else:
            with torch.no_grad():
                expected_if_follow = self.follow_model(torch.Tensor(state).unsqueeze(0)).item()
        if expected_if_follow > 0:
            response = np.min((self.stack, bet))
            if self.verbose:
                print("Player {} accepts".format(self.id))
            self.stack -= response
            return response
        else:
            return 0

    def is_active(self):
        """Return whether the player can still act: has chips and has not folded."""
        return self.stack > 0 and not self.folded

    def fold(self):
        """Mark the player as folded; announce it only if they were still active."""
        if self.verbose and self.is_active():
            print("Player {} folds".format(self.id))
        self.folded = True


class Recorder():
    """Collects (state, bet) decisions during play and their eventual payoffs.

    ``record_initial`` is called when a decision is made; ``record_final``
    is called once the hand is settled and turns every pending decision
    into a target value (the change in that player's stack).
    """

    def __init__(self):
        self.x_state = []         # one state vector per recorded decision
        self.x_bet = []           # the amount wagered, wrapped as [bet]
        self.y = []               # realised winnings, wrapped as [value]
        self.initial_stacks = []  # pending: stack before the wager
        self.players = []         # pending: who made the decision

    def record_initial(self, state, player, bet):
        """Store a decision; the stack is reconstructed as it was before the bet."""
        self.players.append(player)
        self.initial_stacks.append(player.stack + bet)
        self.x_bet.append([bet])
        self.x_state.append(state)

    def record_final(self):
        """Resolve all pending decisions into winnings and clear the pending lists."""
        for player, start_stack in zip(self.players, self.initial_stacks):
            self.y.append([player.stack - start_stack])
        self.players = []
        self.initial_stacks = []

    def get_data(self, replicate=True):
        """Return shuffled tensors ``(states, bets, winnings)``.

        With ``replicate`` set, every recorded state is duplicated with a
        random bet below ``ANTI`` and a target of zero, and those synthetic
        rows are placed before the real ones prior to shuffling.
        """
        states = torch.Tensor(self.x_state)
        bets = torch.Tensor(self.x_bet)
        winnings = torch.Tensor(self.y)
        if replicate:
            synthetic_bets = torch.rand(len(self.x_bet), 1)*ANTI
            synthetic_winnings = torch.zeros((len(self.y), 1))
            states = torch.cat((states, states), dim=0)
            bets = torch.cat((synthetic_bets, bets), dim=0)
            winnings = torch.cat((synthetic_winnings, winnings), dim=0)
        order = torch.randperm(winnings.shape[0])
        return states[order, :], bets[order, :], winnings[order, :]


def start_hand(players, lead_index, lead_recorder=None, follow_recorder=None, p_random=0):
    """Play one complete hand: antes, one action per player, then the showdown.

    Args:
        players: the seated players. Each must provide ``anti``, ``lead``,
            ``follow``, ``fold``, ``is_active`` and the attributes ``stack``,
            ``hand`` (a one-element tensor), ``folded`` and ``verbose``.
        lead_index: seat of the first player to act, taken modulo the number
            of players.
        lead_recorder: optional recorder that receives the opening bet's
            state and is finalised after the showdown.
        follow_recorder: optional recorder that receives every call's state
            and is finalised after the showdown.
        p_random: probability, per decision, that the player acts randomly.

    Stacks are updated in place; nothing is returned.
    """
    if len(players) == 0:
        return  # nobody seated: no pot to build or settle

    ## SETUP ##
    contributions = {player: player.anti() for player in players}
    # Number of players who are still able to act in this hand.
    remaining_players = sum(1 for p in players if p.is_active())

    ## PLAY ##
    someone_led = False
    n_accepted = 0
    for seat in range(lead_index, lead_index + len(players)):
        player = players[seat % len(players)]
        play_randomly = np.random.rand() < p_random

        if not player.is_active():
            continue
        # Only active players were counted above, so only they are counted
        # down here; afterwards this is "active players still to act after me".
        remaining_players -= 1

        if not someone_led:
            # see if player should lead
            if remaining_players == 0:
                # Last to act with no bet on the table: nobody could call,
                # so the player simply stays in for the showdown.
                break
            state = [sum(contributions.values()),
                     remaining_players,
                     player.stack, # TODO: important that this is index 2...
                     player.hand-0.5,
                     torch.rand(1)-0.5 # this makes the decision randomized!
                    ]
            bet = player.lead(state, play_randomly)
            contributions[player] += bet
            if bet > ANTI:
                someone_led = True
                n_active_after_leader = remaining_players
                if lead_recorder is not None:
                    lead_recorder.record_initial(state, player, bet)
            else:
                player.fold()
        else:
            # see if player should follow
            state = [sum(contributions.values()),
                     n_active_after_leader,
                     n_accepted,
                     remaining_players,
                     player.stack,
                     player.hand-0.5,
                     bet]
            response = player.follow(state, play_randomly)
            contributions[player] += response
            if response > 0:
                n_accepted += 1
                if follow_recorder is not None:
                    follow_recorder.record_initial(state, player, response)
            else:
                player.fold()

    ## SEE WHO WINS ##
    # Settle one layer of the pot per distinct contribution level, starting
    # with the largest. Each layer is contested only by those who paid that much.
    bet_sizes = np.sort(list(set(contributions.values())))
    for j in range(len(bet_sizes) - 1, -1, -1):
        level = bet_sizes[j]
        level_below = bet_sizes[j-1] if j > 0 else 0
        big_players = [p for p, c in contributions.items() if c >= level]
        side_pot = len(big_players) * (level - level_below)
        # Plain floats keep the list homogeneous when folded (-1) and live
        # hands are mixed.
        winner_index = np.argmax([-1 if p.folded else p.hand.item() for p in big_players])
        for i, p in enumerate(big_players):
            if i == winner_index:
                p.stack += side_pot
            else:
                # Losers of this layer are marked folded so they cannot
                # claim a smaller layer later.
                p.fold()

    for player in players:
        if player.verbose:
            print("Player {} had a {}".format(player.id, player.hand.item()))

    if lead_recorder is not None:
        lead_recorder.record_final()
    if follow_recorder is not None:
        follow_recorder.record_final()
    

## Define the model

Each of the three models has the same form. Its output is either an expected winning or a number that is transformed into an agent's bet.

`winnings_model`: 
- (input) state + bet.
- (output) expected winnings.
Attempts to evaluate the expected winnings of the leading bettor given a game-state and a bet. Trained on the L2 loss between its predictions and the actual winnings experienced in-game.

`lead_model`:
- (input) state.
- (output) bet.
Attempts to produce a leading bet that maximizes the output of `winnings_model` when paired with the provided game-state.

`follow_model`:
- (input) state.
- (output) expected winnings if call.
Attempts to evaluate the expected winnings of a caller given a game-state. Trained on the L2 loss of its predictions and actual winning outcomes experienced in-game.

In [125]:
class BetNN(nn.Module):
    """Small fully connected network with a single scalar output per row.

    Two hidden ReLU layers of 20 units feed two linear heads. The result is
    the product of an unbounded head (``out1``) and a tanh-squashed head
    (``out2``).

    Args:
        n_state: number of input features.
    """
    def __init__(self, n_state):
        super().__init__()
        n_hidden = 20
        self.fc1 = nn.Linear(n_state, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.out1 = nn.Linear(n_hidden, 1)
        self.out2 = nn.Linear(n_hidden, 1)

    def forward(self, state):
        """Map ``state`` (a tensor or a nested sequence of numbers) to the output."""
        # Only convert when needed; a tensor input is used as-is so its
        # autograd history is untouched.
        if not torch.is_tensor(state):
            state = torch.Tensor(state)
        h = F.relu(self.fc1(state))
        h = F.relu(self.fc2(h))
        x = self.out1(h)
        pm = torch.tanh(self.out2(h))
        return x*pm

# Input widths match the state vectors built elsewhere in this file:
# the lead state in start_hand has 5 entries, the follow state has 7, and
# train_winnings feeds the lead state with the bet appended (5 + 1).
lead_model = BetNN(5)
follow_model = BetNN(7)
winnings_model = BetNN(6)



In [126]:
n_players = 6
def get_more_data(n_hands, l_model, f_model):
    """Generate training data by self-play.

    Plays ``n_hands`` hands among ``n_players`` players. The table is replaced
    by fresh players (stack 50) every fourth hand, and each decision is taken
    randomly with probability 0.3.

    Args:
        n_hands: number of hands to play.
        l_model: network used by every player to choose an opening bet.
        f_model: network used by every player to decide whether to call.

    Returns:
        ``(lead_recorder, follow_recorder)`` holding the recorded decisions.
    """
    lead_recorder = Recorder()
    follow_recorder = Recorder()
    for j in range(n_hands):
        if j % 4 == 0:
            # Use the models that were passed in, not the notebook globals.
            players = [Player(50, f_model, l_model) for _ in range(n_players)]
        if j % 500 == 0:
            print('{}/{} hands played'.format(j,n_hands))
        # j-1 moves the first-to-act seat by one each hand (start_hand wraps it).
        start_hand(players, j-1, lead_recorder, follow_recorder, p_random=0.3)
    return lead_recorder, follow_recorder


In [127]:
def train_winnings(lead_recorder, model, n_epochs=30, batch_size=30, lr = 0.001):
    """Fit ``model`` to predict a leader's winnings from (state, bet).

    Args:
        lead_recorder: object whose ``get_data()`` returns
            ``(x_state, x_bet, y)`` tensors with one row per sample.
        model: network taking the state with the bet appended as last column.
        n_epochs: number of passes; ``get_data()`` is called once per pass.
        batch_size: rows per optimisation step. A trailing partial batch
            is skipped.
        lr: Adam learning rate.

    Prints the mean batch loss after each epoch (``nan`` if there were too
    few samples to form a single batch).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    criterion = nn.MSELoss()
    model.train()
    for _ in range(n_epochs):
        x_state, x_bet, y = lead_recorder.get_data()

        inputs = torch.cat((x_state, x_bet), dim=1)
        n_samples = inputs.shape[0]
        total_loss = 0.0
        n_batches = 0
        for start in range(0, n_samples - batch_size + 1, batch_size):
            stop = start + batch_size

            optimizer.zero_grad()
            winnings = model(inputs[start:stop])
            loss = criterion(winnings, y[start:stop])
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1
        # Each batch loss is already a mean, so average over batches.
        print("Loss:", total_loss/n_batches if n_batches else float("nan"))


def train_lead(lead_recorder, l_model, w_model, n_epochs=30, batch_size=30, lr=0.001):
    """Train the betting network to maximise the winnings network's prediction.

    Only ``l_model`` is optimised; ``w_model`` is used as a differentiable
    critic and its parameters are not stepped here.

    Args:
        lead_recorder: object whose ``get_data(replicate=False)`` returns
            ``(x_state, x_bet, y)``. Only the states are used, and column 2
            of the state is read as the player's stack.
        l_model: network mapping a state to a raw bet.
        w_model: network mapping (state, bet) to expected winnings.
        n_epochs: number of passes; ``get_data`` is called once per pass.
        batch_size: rows per optimisation step. A trailing partial batch
            is skipped.
        lr: Adam learning rate.

    Prints the mean batch loss after each epoch (``nan`` if there were too
    few samples to form a single batch).
    """
    optimizer = torch.optim.Adam(l_model.parameters(), lr = lr)
    l_model.train()
    for _ in range(n_epochs):
        x_state = lead_recorder.get_data(replicate=False)[0]

        n_samples = x_state.shape[0]
        total_loss = 0.0
        n_batches = 0
        for start in range(0, n_samples - batch_size + 1, batch_size):
            batch = x_state[start:start + batch_size]
            optimizer.zero_grad()

            raw_bet = l_model(batch)
            stack = batch[:, 2].unsqueeze(1)
            # Same smooth clamp into (0, stack) that Player.lead applies.
            bet = F.softplus(raw_bet) - F.softplus(raw_bet - stack)
            new_state = torch.cat((batch, bet), dim=1)

            # Maximise predicted winnings, with a small penalty pulling
            # the raw output towards ANTI.
            loss = -torch.mean(w_model(new_state)) + 0.0005*F.mse_loss(raw_bet, torch.zeros_like(raw_bet) + ANTI)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1
        # Each batch loss is already a mean, so average over batches.
        print("Loss:", total_loss/n_batches if n_batches else float("nan"))

        
def train_follow(follow_recorder, f_model, n_epochs=30, batch_size=30, lr=0.001):
    """Fit ``f_model`` to predict a caller's winnings from the follow state.

    Args:
        follow_recorder: object whose ``get_data(replicate=False)`` returns
            ``(x_state, x_bet, y)``. The bet column is not used because the
            follow state already contains the bet.
        f_model: network mapping a follow state to expected winnings.
        n_epochs: number of passes; ``get_data`` is called once per pass.
        batch_size: rows per optimisation step. A trailing partial batch
            is skipped.
        lr: Adam learning rate.

    Prints the mean batch loss after each epoch (``nan`` if there were too
    few samples to form a single batch).
    """
    optimizer = torch.optim.Adam(f_model.parameters(), lr = lr)
    criterion = nn.MSELoss()
    f_model.train()
    for _ in range(n_epochs):
        x_state, _unused_bet, y = follow_recorder.get_data(replicate=False)

        n_samples = x_state.shape[0]
        total_loss = 0.0
        n_batches = 0
        for start in range(0, n_samples - batch_size + 1, batch_size):
            stop = start + batch_size

            optimizer.zero_grad()
            winnings = f_model(x_state[start:stop])
            loss = criterion(winnings, y[start:stop])
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1
        # Each batch loss is already a mean, so average over batches.
        print("Loss:", total_loss/n_batches if n_batches else float("nan"))


## Train the model

In [None]:
# Alternate between generating self-play data with the current models and
# fitting all three networks to it.
N = 20  # number of data-collection rounds
bs = 100  # batch size passed to every trainer
ne = 30  # base epoch count (winnings_model is trained for 2*ne)
lr = 0.001  # learning rate passed to every trainer
for _ in range(N):
    print("Getting more data")
    lead_recorder, follow_recorder = get_more_data(10000, lead_model, follow_model)
    # Two training sweeps over the same batch of recorded hands.
    for j in range(2):
        print("Training winnings_model")
        train_winnings(lead_recorder, winnings_model, n_epochs=2*ne, batch_size=bs, lr=lr)
        print("Training lead_model")
        # lead_model is trained against the winnings_model just updated above.
        train_lead(lead_recorder, lead_model, winnings_model, n_epochs=ne, batch_size=bs, lr=lr)
        print("Training follow_model")
        train_follow(follow_recorder, follow_model, n_epochs=ne, batch_size=bs, lr=lr)

*[Training output suppressed]*

In [129]:
# Store the weights of all three networks together in one checkpoint file.
torch.save({'winnings': winnings_model.state_dict(),
            'lead': lead_model.state_dict(),
            'follow': follow_model.state_dict()
           },
           'betting_game_checkpoint.pth')

## Test the model (watch them play)

In [130]:
# Restore the three networks from the checkpoint written by the save cell;
# the dictionary keys must match the ones used when saving.
state_dicts = torch.load('betting_game_checkpoint.pth')
winnings_model.load_state_dict(state_dicts['winnings'])
lead_model.load_state_dict(state_dicts['lead'])
follow_model.load_state_dict(state_dicts['follow'])

<All keys matched successfully>

In [277]:
# Seat a fresh table of verbose players that share the trained networks.
n_players = 6
follow_model.eval()
lead_model.eval()
players = [Player(50, follow_model, lead_model, verbose=True) for _ in range(n_players)]
start_at = 0  # seat index of the first player to act in the next hand

In [279]:
# Play one hand and show the stacks before and after it.
print([p.stack for p in players])
start_hand(players, start_at)
# Player.anti() subtracts an np.min(...) result, so stacks are NumPy scalars
# by this point and support .item().
print([p.stack.item() for p in players])
start_at += 1  # move the first-to-act seat one place for the next run of this cell

[55.0, 49, 49, 49, 49, 49]
Player 46844 folds
Player 46845 folds
Player 46846 folds
Player 46847 bet 9.136346817016602
Player 46848 folds
Player 46843 accepts
Player 46847 folds
Player 46843 had a 0.7687220573425293
Player 46844 had a 0.6723493337631226
Player 46845 had a 0.3045359253883362
Player 46846 had a 0.22577524185180664
Player 46847 had a 0.7209641337394714
Player 46848 had a 0.22134077548980713
[69.1363468170166, 48, 48, 48, 38.8636531829834, 48]


*Note: The final fold from Player 46847 is not actually a decision... It is a consequence of the procedure that decides which player wins.*