In [None]:
import numpy as np
import pandas as pd
import polars as pl
import os
import random as rnd
os.chdir('..')
os.chdir('src')
from env import BlackjackEnv


In [34]:
config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": [1],
    "actions": ["stand", "hit"],
    "num_players": 1
}
# Create environment with 6 decks (standard casino configuration)
env = BlackjackEnv(config=config)


### Initialization of Q table

In [35]:
# Generate relevant states - focusing on decision points that matter
states = []
# For hard totals (no usable ace), only track 12-21
# Below 12, the optimal play is always hit
for player_sum in range(12, 22):
    for dealer_card in range(2, 12):
        states.append((player_sum, dealer_card, 0))  # Hard total

# For soft totals (with usable ace), track 12-21
# Soft totals below 12 are impossible (A+1 = 12)
for player_sum in range(12, 22):
    for dealer_card in range(2, 12):
        states.append((player_sum, dealer_card, 1))  # Soft total

In [36]:

# Initialize Q-table with strategic initial values
q_data = {
    'State': states,
    'Action 0 (Stand)': np.zeros(len(states)),
    'Action 1 (Hit)': np.zeros(len(states))
}

# Strategic initialization: Set high values for "stand" in 20-21, high values for "hit" in 4-11
for i, state in enumerate(states):
    player_sum, _, _ = state
    if player_sum >= 20:
        # For high player sums, initialize stand value higher
        q_data['Action 0 (Stand)'][i] = 0.5
        q_data['Action 1 (Hit)'][i] = -0.1
    elif player_sum < 12:
        # For low player sums, initialize hit value higher
        q_data['Action 0 (Stand)'][i] = -0.1
        q_data['Action 1 (Hit)'][i] = 0.5

Q = pl.DataFrame(q_data)

# Double Q-learning: Second Q-table for reducing bias
Q2 = Q.clone()

In [37]:
# Track state-action visit counts for adaptive learning rates
visit_counts = {}
for state in states:
    visit_counts[(state, 0)] = 0  # Stand
    visit_counts[(state, 1)] = 0  # Hit

### Hyperparameters

In [38]:
# Improved hyperparameters
initial_lr = 0.1             # Learning rate
lr_decay_rate = 0.00005      # Gentler decay rate
gamma = 0.95                 # Higher discount factor - long-term rewards matter more
n_episodes = 200000         # More training episodes
initial_epsilon = 1.0        # Start with 100% exploration
epsilon_min = 0.01           # Minimum exploration rate
epsilon_decay = 0.99995      # Much slower decay rate

### Helper functions Strategy Q-table

In [39]:

# Modified state representation - focusing on meaningful game states
def get_state_features(full_state):
    # Extract just player sum, dealer card, and usable ace
    player_sum = full_state[0]
    dealer_card = full_state[1]
    usable_ace = full_state[2]
    return (player_sum, dealer_card, usable_ace)


In [40]:
def get_adaptive_lr(state, action, base_lr):
    """Get state-action specific learning rate based on visit count"""
    key = (state, action)
    count = visit_counts.get(key, 0) + 1
    # Decay learning rate based on visit count, but maintain a minimum rate
    return max(base_lr / (1 + 0.005 * count), base_lr * 0.1)

In [41]:
def get_q_values(state_features, q_table=Q):
    """Get Q-values for a given state"""
    # Filter the DataFrame for the specific state
    state_row = q_table.filter(pl.col('State') == state_features)
    
    if len(state_row) == 0:
        # Return default values based on player sum
        player_sum = state_features[0]
        if player_sum < 12:
            return np.array([-0.1, 0.5])  # Default to hit for low sums
        elif player_sum >= 20:
            return np.array([0.5, -0.1])  # Default to stand for high sums
        else:
            return np.array([0.0, 0.0])  # Neutral for middle sums
            
    # Extract Q-values from the DataFrame
    stand_val = state_row.select('Action 0 (Stand)').item()
    hit_val = state_row.select('Action 1 (Hit)').item()
    return np.array([stand_val, hit_val])


In [42]:
def update_q_value(state_features, action, reward, next_state_features, lr, q_table=Q, q_table_target=Q2):
    """Update Q-value for state-action pair using Double Q-learning"""
    # Check if state exists in our table
    state_row = q_table.filter(pl.col('State') == state_features)
    if len(state_row) == 0:
        return # State not in our table
    
    # Determine which action column to update
    action_col = 'Action 1 (Hit)' if action == 1 else 'Action 0 (Stand)'
    
    # Current Q-value in the DataFrame
    current_q = state_row.select(action_col).item()
    
    # If next_state_features is None, this is a terminal state
    if next_state_features is None:
        # Terminal state - no future rewards
        new_q = current_q + lr * (reward - current_q)
    else:
        # Get the next state's best action from current Q-table
        next_q_values = get_q_values(next_state_features, q_table)
        best_next_action = np.argmax(next_q_values)
        
        # Get Q-value for best action from target Q-table
        next_q_values_target = get_q_values(next_state_features, q_table_target)
        max_next_q = next_q_values_target[best_next_action]
        
        # Q-learning update formula with future rewards
        new_q = current_q + lr * (reward + gamma * max_next_q - current_q)
    
    # Update the Q-table entry in the DataFrame
    # Create a temporary mask for the state we want to update
    mask = pl.col('State') == state_features
    
    # Use the when/then/otherwise pattern to update values
    q_table = q_table.with_columns(
        pl.when(mask)
        .then(pl.lit(new_q))
        .otherwise(pl.col(action_col))
        .alias(action_col)
    )
    
    # Track visit counts
    visit_counts[(state_features, action)] = visit_counts.get((state_features, action), 0) + 1
    
    return q_table

### Training Q-table

In [43]:
# Training loop with convergence check
print("Starting improved training...")
wins = 0
draws = 0
losses = 0
epsilon = initial_epsilon
lr = initial_lr
money_won = 0
money_lost = 0

# Parameters for convergence
n_episodes = 1000000  # Number of episodes for training
convergence_threshold = 0.001  # Lower threshold for better stability
convergence_check_interval = 10000  # Check for convergence every N episodes
convergence_required_count = 3  # Number of consecutive checks below threshold to confirm convergence
max_episodes = n_episodes  # Maximum episodes as a fallback

# Keep a copy of the previous Q-table for comparison
previous_q = Q.clone()
convergence_count = 0
converged = False
episode = 0
#first training phase only for the Q-table with fixed betting strategy
while episode < max_episodes and not converged:

    env.reset()
    bet_index = env.bet_space.sample()  # Sample bet index from the environment
    bet_amount = env.bets[bet_index]  # Sample bet amount from the environment
    # print(env.step(bet_index, action_type="bet"))
    state, reward, done = env.step(bet_index, action_type="bet")  # Place bet
    if done:
        if reward > 0:
            wins += 1
            money_won += reward * bet_amount
        elif reward == 0:
            draws += 1
        else:
            losses += 1
            money_lost += abs(reward) * bet_amount
    # print(bet_amount)
    state_features = get_state_features(state)

    # Training episode
    while not done:
        
        if state_features[0] < 12:
        # Always hit this state as it's not relevant for our training
            next_state, _, _ = env.step(1, action_type="move")
            next_state_features = get_state_features(next_state) if not done else None
            state = next_state
            state_features = next_state_features if next_state is not None else None
            continue
        
        # Epsilon-greedy action selection
        elif np.random.rand() < epsilon:
            action = env.move_space.sample()  # Random action
        else:
            q_values = get_q_values(state_features)
            action = np.argmax(q_values)  # Greedy action
        
        # Take action
        next_state, reward, done = env.step(action, action_type="move")
        next_state_features = get_state_features(next_state) if not done else None

        # Get adaptive learning rate for this state-action pair
        adaptive_lr = get_adaptive_lr(state_features, action, lr)

        # Randomly decide which Q-table to update (Double Q-learning)
        # print(f"State: {state_features}, Action: {action}, Done: {done}, Reward: {reward}, Next State: {next_state_features}")
        if np.random.rand() < 0.5:
            # print("Updating Q-table 1")
            Q = update_q_value(state_features, action, reward*bet_amount, next_state_features, adaptive_lr, Q, Q2)
        else:
            # print("Updating Q-table 2")
            Q2 = update_q_value(state_features, action, reward*bet_amount, next_state_features, adaptive_lr, Q2, Q)
            
        # Track outcomes
        if done:
            if reward > 0:
                wins += 1
                money_won += reward * bet_amount
            elif reward == 0:
                draws += 1
            else:
                losses += 1
                money_lost += abs(reward) * bet_amount
        
        state = next_state
        state_features = next_state_features if next_state is not None else None
        
        if state_features is None:
            # print(f"Entered break condition with done being {done}")
            break
    
    # Decay epsilon and learning rate
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    lr = initial_lr / (1 + lr_decay_rate * episode)
    
    # Check for convergence periodically
    if episode % convergence_check_interval == 0 and episode > 0:
        # Calculate the maximum absolute difference between current and previous Q-values
        diff_stand = (Q.select('Action 0 (Stand)').to_numpy() - 
                     previous_q.select('Action 0 (Stand)').to_numpy())
        diff_hit = (Q.select('Action 1 (Hit)').to_numpy() - 
                   previous_q.select('Action 1 (Hit)').to_numpy())
        
        max_diff_stand = np.max(np.abs(diff_stand))
        max_diff_hit = np.max(np.abs(diff_hit))
        max_diff = max(max_diff_stand, max_diff_hit)
        
        if max_diff < convergence_threshold:
            convergence_count += 1
            print(f"Episode {episode}, max Q-value change: {max_diff:.6f} (convergence count: {convergence_count}/{convergence_required_count})")
            if convergence_count >= convergence_required_count:
                print(f"Converged after {episode} episodes (max Q-value change: {max_diff:.6f})")
                converged = True
        else:
            convergence_count = 0
            print(f"Episode {episode}, max Q-value change: {max_diff:.6f}")
        
        # Store current Q-values for next comparison
        previous_q = Q.clone()
    
    episode += 1

# Final statistics
total_episodes = episode
print(f"Training complete after {total_episodes} episodes.")
print(f"Win rate: {wins/total_episodes:.4f}")
print(f"Draw rate: {draws/total_episodes:.4f}")
print(f"Loss rate: {losses/total_episodes:.4f}")

Starting improved training...
Episode 10000, max Q-value change: 0.821892
Episode 20000, max Q-value change: 0.517179
Episode 30000, max Q-value change: 0.273300
Episode 40000, max Q-value change: 0.207642
Episode 50000, max Q-value change: 0.219709
Episode 60000, max Q-value change: 0.132908
Episode 70000, max Q-value change: 0.096647
Episode 80000, max Q-value change: 0.104347
Episode 90000, max Q-value change: 0.074383
Episode 100000, max Q-value change: 0.092804
Episode 110000, max Q-value change: 0.061057
Episode 120000, max Q-value change: 0.042384
Episode 130000, max Q-value change: 0.041183
Episode 140000, max Q-value change: 0.047078
Episode 150000, max Q-value change: 0.048959
Episode 160000, max Q-value change: 0.036943
Episode 170000, max Q-value change: 0.031302
Episode 180000, max Q-value change: 0.036392
Episode 190000, max Q-value change: 0.028669
Episode 200000, max Q-value change: 0.024174
Episode 210000, max Q-value change: 0.019015
Episode 220000, max Q-value change

### Testing Q-table win rate

In [44]:

# Evaluate the final policy with more episodes
print("\nFinal policy evaluation...")
eval_wins = 0
eval_draws = 0
eval_loss = 0
money_won = 0
money_lost = 0
eval_episodes = 10000

for _ in range(eval_episodes):
    env.reset()
    bet_index = env.bet_space.sample()  # Sample bet index from the environment
    bet_amount = env.bets[bet_index]  # Sample bet amount from the environment
    # print(env.step(bet_index, action_type="bet"))
    state, reward, done = env.step(bet_index, action_type="bet")  # Place bet
    if done:
        if reward > 0:
            eval_wins += 1
            money_won += reward * bet_amount
        elif reward == 0:
            eval_draws += 1
        else:
            eval_loss += 1
            money_lost += abs(reward) * bet_amount
    # print(bet_amount)
    state_features = get_state_features(state)

    # Training episode
    while not done:
        # Always choose the best action according to average of both Q-tables
        q_values1 = get_q_values(state_features, Q)
        q_values2 = get_q_values(state_features, Q2)
        avg_q_values = (q_values1 + q_values2) / 2
        action = np.argmax(avg_q_values)
        
        next_state, reward, done = env.step(action, action_type="move")
        
        if done and reward > 0:
            eval_wins += 1
            money_won += reward * bet_amount
        elif done and reward == 0:
            eval_draws += 1
        elif done and reward < 0:
            eval_loss += 1
            money_lost += abs(reward) * bet_amount
        else:
            pass
        
        state = next_state
        state_features = get_state_features(state)

        if done:
            break

print(f"Final evaluation complete.")
print(f"Win rate: {eval_wins/eval_episodes:.4f}")
print(f"Draw rate: {eval_draws/eval_episodes:.4f}")
print(f"Loss rate: {eval_loss/eval_episodes:.4f}")
print(f"Money won: {money_won}")
print(f"Money lost: {money_lost}")


Final policy evaluation...
Final evaluation complete.
Win rate: 0.4375
Draw rate: 0.0821
Loss rate: 0.4804
Money won: 4595.0
Money lost: 4804


### Initialization for Bet table

In [45]:
counts = list(range(-5, 6))
bet_amounts = [1,2,5]
bet_data = {
    'True Count': counts,
    'Bet 1' : np.zeros(len(counts)),
    'Bet 2' : np.zeros(len(counts)),
    'Bet 5' : np.zeros(len(counts))
}

for i, count in enumerate(counts):
    for j, bet_amount in enumerate(bet_amounts):
        if count <= -2:
            # Conservative for negative counts
            initial_value = 1.0 if bet_amount == 1 else 0.2
        elif -1 <= count <= 1:
            # Neutral for counts near zero
            initial_value = 0.8 if bet_amount == 1 else (0.5 if bet_amount == 2 else 0.2)
        else:
            # Aggressive for positive counts
            initial_value = 0.2 if bet_amount == 1 else (0.5 if bet_amount == 2 else 1.0)
        bet_data[f'Bet {bet_amount}'][i] = initial_value

bet_Q = pl.DataFrame(bet_data)

### Helpers

In [46]:
def update_bet_Q(bet_Q, true_count, bet_ind, next_true_count, reward, alpha, gamma, bets):
    # Get the current row and bet column
    row = bet_Q.filter(pl.col('True Count') == true_count)
    bet_col = f'Bet {bets[bet_ind]}'
    
    # Get the current Q value
    current_q = row.select(bet_col).item(0, 0)
    
    # Get the next row
    next_row = bet_Q.filter(pl.col('True Count') == next_true_count)
    
    # Get all bet columns
    bet_cols = [f'Bet {bet}' for bet in bets]
    
    # Find the maximum Q value across all bet options for the next state
    next_q_values = next_row.select(bet_cols).to_numpy()[0]
    next_q = np.max(next_q_values)
    
    # print(f"Current Q-value: {current_q}, Next Q-value: {next_q}") if current_q != next_q else None
    
    # Calculate the new Q value
    new_q = current_q + alpha * (reward + gamma * next_q - current_q)
    
    # Update the DataFrame with the new Q-value
    mask = pl.col('True Count') == true_count
    bet_Q = bet_Q.with_columns(
        pl.when(mask)
        .then(pl.lit(new_q))
        .otherwise(pl.col(bet_col))
        .alias(bet_col)
    )
    
    return bet_Q

### Training for Bet table

In [47]:
# Q-learning parameters
n_episodes = 1000000
epsilon = 0.1
alpha = 0.1
gamma = 1.0
bet_sizes = [1, 2, 5]  # low and high bets

config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": bet_sizes,
    "actions": ["stand", "hit"],
    "num_players": 1
}
env = BlackjackEnv(config)
episode = 0
wins = 0
draws = 0
losses = 0
money_won = 0
money_lost = 0

while episode < n_episodes:
    obs = env.reset()
    true_count = obs[3]
    #discretize the true count to the range of the bet table
    true_count = int(np.clip(true_count, -5, 5))
    if np.random.rand() < epsilon:
        bet_ind = rnd.choice(range(env.bet_space.n))
    else:
        # Get the current row for the true count
        row_data = bet_Q.filter(pl.col('True Count') == true_count)
        # Get all bet columns
        bet_cols = [f'Bet {bet}' for bet in env.bets]
        # Get Q values for all bets at this count
        q_values = row_data.select(bet_cols).to_numpy()[0]
        # Choose the bet with the highest Q value
        bet_ind = np.argmax(q_values)
    bet = env.bets[bet_ind]
    state, reward, done = env.step(bet_ind, action_type="bet")
    next_true_count = state[3]
    #discretize the true count to the range of the bet table
    next_true_count = int(np.clip(next_true_count, -5, 5))
    state_features = get_state_features(state)
    if done:
        bet_Q = update_bet_Q(bet_Q, true_count, bet_ind, next_true_count, reward, alpha, gamma, env.bets)
        if reward > 0:
            wins += 1
            money_won += reward * bet
        elif reward == 0:
            draws += 1
        else:
            losses += 1
            money_lost += abs(reward) * bet
    while not done:
        if state_features[0] < 12:
            next_state, _, _ = env.step(1, action_type="move")
            next_state_features = get_state_features(next_state) if not done else None
            state = next_state
            state_features = next_state_features if next_state is not None else None
            continue
        
        q_values1 = get_q_values(state_features, Q)
        q_values2 = get_q_values(state_features, Q2)
        avg_q_values = (q_values1 + q_values2) / 2
        action = np.argmax(avg_q_values)
        next_state, reward, done = env.step(action, action_type="move")
        next_state_features = get_state_features(next_state) if not done else None
        if done:
            next_true_count = next_state[3]
            #discretize the true count to the range of the bet table
            next_true_count = int(np.clip(next_true_count, -5, 5))
            bet_Q = update_bet_Q(bet_Q, true_count, bet_ind, next_true_count, reward, alpha, gamma, env.bets)
            if reward > 0:
                wins += 1
                money_won += reward * bet
            elif reward == 0:
                draws += 1
            else:
                losses += 1
                money_lost += abs(reward) * bet
                
        state = next_state
        state_features = next_state_features if next_state is not None else None
    
    #make some prints to  see the progress of the training
    if episode % 10000 == 0:
        # print something about how the bet table is doing
        print(f"Episode {episode}, Bet Q-values: {bet_Q.to_numpy()}")
    
    episode += 1


print(f"Final evaluation complete.")
print(f"Win rate: {wins/n_episodes:.4f}")
print(f"Draw rate: {draws/n_episodes:.4f}")
print(f"Loss rate: {losses/n_episodes:.4f}")
print(f"Money won: {money_won}")
print(f"Money lost: {money_lost}")
print(f"Net profit: {money_won - money_lost}")

Episode 0, Bet Q-values: [[-5.   1.   0.2  0.2]
 [-4.   1.   0.2  0.2]
 [-3.   1.   0.2  0.2]
 [-2.   1.   0.2  0.2]
 [-1.   0.8  0.5  0.2]
 [ 0.   0.7  0.5  0.2]
 [ 1.   0.8  0.5  0.2]
 [ 2.   0.2  0.5  1. ]
 [ 3.   0.2  0.5  1. ]
 [ 4.   0.2  0.5  1. ]
 [ 5.   0.2  0.5  1. ]]
Episode 10000, Bet Q-values: [[-5.          1.          0.2         0.2       ]
 [-4.          1.          0.2         0.2       ]
 [-3.          0.9         0.2         0.2       ]
 [-2.          0.93468177  0.33057341  0.2       ]
 [-1.          0.65175916  0.86413177  1.63230018]
 [ 0.          2.11519182  1.98293817  2.35943286]
 [ 1.          1.79885741  2.24780836  1.66278287]
 [ 2.          0.89229428  0.63505757  1.81844587]
 [ 3.          0.14805132  0.89611576  1.3501385 ]
 [ 4.          0.2         0.5         0.85823662]
 [ 5.          0.2         0.5         0.91      ]]
Episode 20000, Bet Q-values: [[-5.          1.          0.2         0.2       ]
 [-4.          1.          0.2         0.2       ]

### Testing Q-table

In [91]:
# Test the final policy with the bet table
print("\nFinal policy evaluation with bet table...")
eval_wins = 0
eval_draws = 0
eval_loss = 0
money_won = 0
money_lost = 0
benchmark_money_won = 0
benchmark_money_lost = 0
eval_episodes = 10000
bet_sizes = [1, 2, 5]  # low and high bets
config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": bet_sizes,
    "actions": ["stand", "hit"],
    "num_players": 1
}
env = BlackjackEnv(config)
episode = 0
while episode < eval_episodes:
    obs = env.reset()
    true_count = obs[3]
    #discretize the true count to the range of the bet table
    true_count = int(np.clip(true_count, -5, 5))
    bet_ind = np.argmax(bet_Q.filter(pl.col('True Count') == true_count).select([f'Bet {bet}' for bet in env.bets]).to_numpy()[0])
    bet = env.bets[bet_ind]
    state, reward, done = env.step(bet_ind, action_type="bet")
    next_true_count = state[3]
    #discretize the true count to the range of the bet table
    next_true_count = int(np.clip(next_true_count, -5, 5))
    state_features = get_state_features(state)
    if done:
        if reward > 0:
            eval_wins += 1
            money_won += reward * bet
            benchmark_money_won += reward * 1
        elif reward == 0:
            eval_draws += 1
        else:
            eval_loss += 1
            money_lost += abs(reward) * bet
            benchmark_money_lost += abs(reward) * 1
    while not done:
        q_values1 = get_q_values(state_features, Q)
        q_values2 = get_q_values(state_features, Q2)
        avg_q_values = (q_values1 + q_values2) / 2
        action = np.argmax(avg_q_values)
        next_state, reward, done = env.step(action, action_type="move")
        next_state_features = get_state_features(next_state) if not done else None
        if done:
            next_true_count = next_state[3]
            #discretize the true count to the range of the bet table
            next_true_count = int(np.clip(next_true_count, -5, 5))
            if reward > 0:
                eval_wins += 1
                money_won += reward * bet
                benchmark_money_won += reward * 1
            elif reward == 0:
                eval_draws += 1
            else:
                eval_loss += 1
                money_lost += abs(reward) * bet
                benchmark_money_lost += abs(reward) * 1
        
        state = next_state
        state_features = next_state_features if next_state is not None else None
    
    episode += 1


print(f"Final evaluation complete.")
print(f"Win rate: {eval_wins/eval_episodes:.4f}")
print(f"Draw rate: {eval_draws/eval_episodes:.4f}")
print(f"Loss rate: {eval_loss/eval_episodes:.4f}")
print(f"Money won: {money_won}")
print(f"Money lost: {money_lost}")
print(f"Net profit: {money_won - money_lost}")
# print(f"Benchmark Money won: {benchmark_money_won}")
# print(f"Benchmark Money lost: {benchmark_money_lost}")
print(f"Benchmark Net profit: {benchmark_money_won - benchmark_money_lost}")


Final policy evaluation with bet table...
Final evaluation complete.
Win rate: 0.4400
Draw rate: 0.0822
Loss rate: 0.4778
Money won: 22318.0
Money lost: 23122
Net profit: -804.0
Benchmark Net profit: -160.5


In [49]:
benchmark_bet = {
    -5: {1 : 1.0, 2 : 0.2, 5 : 0.2},
    -4: {1 : 1.0, 2 : 0.2, 5 : 0.2},
    -3: {1 : 1.0, 2 : 0.2, 5 : 0.2},
    -2: {1 : 1.0, 2 : 0.5, 5 : 0.2},
    -1: {1 : 0.8, 2 : 0.5, 5 : 0.2},
     0: {1 : 0.8, 2 : 0.5, 5 : 0.2},
     1: {1 : 0.8, 2 : 0.5, 5 : 0.2},
     2: {1 : 0.5, 2 : 0.8, 5 : 0.2},
     3: {1 : 0.5, 2 : 0.8, 5 : 0.2},
     4: {1 : 0.2, 2 : 0.5, 5 : 1.0},
     5: {1 : 0.2, 2 : 0.8, 5 : 1.0}
}