In [20]:
import numpy as np
import pandas as pd
import polars as pl
import os
#from file env.py in directory src (you need to change directory)
os.chdir('..')
os.chdir('src')
from env import BlackjackEnv


In [21]:
config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": [1],
    "actions": ["stand", "hit"],
    "num_players": 1
}
# Create environment with 6 decks (standard casino configuration)
env = BlackjackEnv(config=config)


In [22]:
# Generate relevant states - focusing on decision points that matter
states = []
# For hard totals (no usable ace), only track 12-21
# Below 12, the optimal play is always hit
for player_sum in range(12, 22):
    for dealer_card in range(1, 11):
        states.append((player_sum, dealer_card, 0))  # Hard total

# For soft totals (with usable ace), track 12-21
# Soft totals below 12 are impossible (A+1 = 12)
for player_sum in range(12, 22):
    for dealer_card in range(1, 11):
        states.append((player_sum, dealer_card, 1))  # Soft total

In [49]:

# Initialize Q-table with strategic initial values
q_data = {
    'State': states,
    'Action 0 (Stand)': np.zeros(len(states)),
    'Action 1 (Hit)': np.zeros(len(states))
}

# Strategic initialization: Set high values for "stand" in 20-21, high values for "hit" in 4-11
for i, state in enumerate(states):
    player_sum, _, _ = state
    if player_sum >= 20:
        # For high player sums, initialize stand value higher
        q_data['Action 0 (Stand)'][i] = 0.5
        q_data['Action 1 (Hit)'][i] = -0.1
    elif player_sum < 12:
        # For low player sums, initialize hit value higher
        q_data['Action 0 (Stand)'][i] = -0.1
        q_data['Action 1 (Hit)'][i] = 0.5

Q = pl.DataFrame(q_data)

# Double Q-learning: Second Q-table for reducing bias
Q2 = Q.clone()

In [None]:
count_ranges = [[0], [1,2,3], [4]]

count_data = {
    'True Count' : count_ranges,
    'Low Bet (1)' : np.zeros(len(count_ranges)),
    'Mid Bet (2)' : np.zeros(len(count_ranges)),
    'High Bet (5)' : np.zeros(len(count_ranges))
}
# Initialize betting strategy for each count range, which are fixed in all matches
for i, count_range in enumerate(count_ranges):
    if count_range == [0]:
        count_data['Low Bet (1)'][i] = 0.5
        count_data['Mid Bet (2)'][i] = 0.1
        count_data['High Bet (5)'][i] = 0.1
    elif count_range == [1,2,3]:
        count_data['Low Bet (1)'][i] = 0.2
        count_data['Mid Bet (2)'][i] = 0.5
        count_data['High Bet (5)'][i] = 0.2
    else:
        count_data['Low Bet (1)'][i] = 0.2
        count_data['Mid Bet (2)'][i] = 0.3
        count_data['High Bet (5)'][i] = 0.5


count_df = pl.DataFrame(count_data)

# use a dictionary which associates each column name of the type of bet to the actual bet value
betting_strategy = {
    'Low Bet (1)': 1,
    'Mid Bet (2)': 2,
    'High Bet (5)': 5
}

# Track state-action visit counts for adaptive learning rates
visit_counts = {}
for state in states:
    visit_counts[(state, 0)] = 0  # Stand
    visit_counts[(state, 1)] = 0  # Hit

In [25]:

# Improved hyperparameters
initial_lr = 0.1             # Learning rate
lr_decay_rate = 0.00005      # Gentler decay rate
gamma = 0.95                 # Higher discount factor - long-term rewards matter more
n_episodes = 200000         # More training episodes
initial_epsilon = 1.0        # Start with 100% exploration
epsilon_min = 0.01           # Minimum exploration rate
epsilon_decay = 0.99995      # Much slower decay rate

In [26]:

# Modified state representation - focusing on meaningful game states
def get_state_features(full_state):
    # Extract just player sum, dealer card, and usable ace
    return (full_state[0], full_state[1], full_state[2])


In [27]:
def get_adaptive_lr(state, action, base_lr):
    """Get state-action specific learning rate based on visit count"""
    key = (state, action)
    count = visit_counts.get(key, 0) + 1
    # Decay learning rate based on visit count, but maintain a minimum rate
    return max(base_lr / (1 + 0.005 * count), base_lr * 0.1)

In [28]:
def get_q_values(state_features, q_table=Q):
    """Get Q-values for a given state"""
    # Filter the DataFrame for the specific state
    state_row = q_table.filter(pl.col('State') == state_features)
    
    if len(state_row) == 0:
        # Return default values based on player sum
        player_sum = state_features[0]
        if player_sum < 12:
            return np.array([-0.1, 0.5])  # Default to hit for low sums
        elif player_sum >= 20:
            return np.array([0.5, -0.1])  # Default to stand for high sums
        else:
            return np.array([0.0, 0.0])  # Neutral for middle sums
            
    # Extract Q-values from the DataFrame
    stand_val = state_row.select('Action 0 (Stand)').item()
    hit_val = state_row.select('Action 1 (Hit)').item()
    return np.array([stand_val, hit_val])


In [48]:
def update_q_value(state_features, action, reward, next_state_features, lr, q_table=Q, q_table_target=Q2):
    """Update Q-value for state-action pair using Double Q-learning"""
    # Check if state exists in our table
    state_row = q_table.filter(pl.col('State') == state_features)
    if len(state_row) == 0:
        return  # State not in our table
    
    # Determine which action column to update
    action_col = 'Action 1 (Hit)' if action == 1 else 'Action 0 (Stand)'
    
    # Current Q-value in the DataFrame
    current_q = state_row.select(action_col).item()
    
    # If next_state_features is None, this is a terminal state
    if next_state_features is None:
        # Terminal state - no future rewards
        new_q = current_q + lr * (reward - current_q)
    else:
        # Get the next state's best action from current Q-table
        next_q_values = get_q_values(next_state_features, q_table)
        best_next_action = np.argmax(next_q_values)
        
        # Get Q-value for best action from target Q-table
        next_q_values_target = get_q_values(next_state_features, q_table_target)
        max_next_q = next_q_values_target[best_next_action]
        
        # Q-learning update formula with future rewards
        new_q = current_q + lr * (reward + gamma * max_next_q - current_q)
    
    print(f"Updating Q-value for state {state_features}, action {action}, reward {reward}, new Q-value: {new_q}")

    # Update the Q-table entry in the DataFrame
    # Create a temporary mask for the state we want to update
    mask = pl.col('State') == state_features
    
    # Use the when/then/otherwise pattern to update values
    q_table = q_table.with_columns(
        pl.when(mask)
        .then(pl.lit(new_q))
        .otherwise(pl.col(action_col))
        .alias(action_col)
    )
    print(f"Updated Q-table for state {state_features}: {q_table.filter(mask)}")
    
    # Track visit counts
    visit_counts[(state_features, action)] = visit_counts.get((state_features, action), 0) + 1
    
    return q_table

In [30]:
def update_count_table(current_count, reward, count_df=count_df):
    """Update the value associated with the move of betting a certain amount based on the reward received"""
    # Check if reward is positive or negative
    if reward > 0:
        # Positive reward - increase the value of betting a higher amount and decrease the value of betting a lower amount for the row corresponding to the current count
        # Find the index of the current count range
        index = None
        for i, count_range in enumerate(count_ranges):
            if current_count in count_range:
                index = i
                break
        # update the row corresponding to the current count
        if index is None:
            raise ValueError("Current count not found in count ranges")
        # Update the values in the count_df DataFrame
        count_df = count_df.with_columns(
            pl.when(pl.col('True Count') == count_ranges[index])
            .then(pl.lit(count_df['Low Bet (1)'][index] * 0.9))
            .otherwise(pl.col('Low Bet (1)'))
            .alias('Low Bet (1)')
        )
        count_df = count_df.with_columns(
            pl.when(pl.col('True Count') == count_ranges[index])
            .then(pl.lit(count_df['Mid Bet (2)'][index] * 0.95))
            .otherwise(pl.col('Mid Bet (2)'))
            .alias('Mid Bet (2)')
        )
        count_df = count_df.with_columns(
            pl.when(pl.col('True Count') == count_ranges[index])
            .then(pl.lit(count_df['High Bet (5)'][index] * 1.05))
            .otherwise(pl.col('High Bet (5)'))
            .alias('High Bet (5)')
        )
    elif reward < 0:
        # Negative reward - decrease the value of betting a higher amount and increase the value of betting a lower amount for the row corresponding to the current count
        # Find the index of the current count range
        index = None
        for i, count_range in enumerate(count_ranges):
            if current_count in count_range:
                index = i
                break
        # update the row corresponding to the current count
        if index is None:
            raise ValueError("Current count not found in count ranges")
        # Update the values in the count_df DataFrame
        count_df = count_df.with_columns(
            pl.when(pl.col('True Count') == count_ranges[index])
            .then(pl.lit(count_df['Low Bet (1)'][index] * 1.05))
            .otherwise(pl.col('Low Bet (1)'))
            .alias('Low Bet (1)')
        )
        count_df = count_df.with_columns(
            pl.when(pl.col('True Count') == count_ranges[index])
            .then(pl.lit(count_df['Mid Bet (2)'][index] * 0.95))
            .otherwise(pl.col('Mid Bet (2)'))
            .alias('Mid Bet (2)')
        )
        count_df = count_df.with_columns(
            pl.when(pl.col('True Count') == count_ranges[index])
            .then(pl.lit(count_df['High Bet (5)'][index] * 0.9))
            .otherwise(pl.col('High Bet (5)'))
            .alias('High Bet (5)')
        )
    else:
        #do nothing
        #ALTERNATIVE: update something
        pass
    

    
    return count_df

In [50]:
# Training loop with convergence check
print("Starting improved training...")
wins = 0
draws = 0
losses = 0
epsilon = initial_epsilon
lr = initial_lr
money_won = 0
money_lost = 0

# Parameters for convergence
n_episodes = 1000  # Number of episodes for training
convergence_threshold = 0.001  # Lower threshold for better stability
convergence_check_interval = 10000  # Check for convergence every N episodes
convergence_required_count = 3  # Number of consecutive checks below threshold to confirm convergence
max_episodes = n_episodes  # Maximum episodes as a fallback

# Keep a copy of the previous Q-table for comparison
previous_q = Q.clone()
convergence_count = 0
converged = False
episode = 0
#first training phase only for the Q-table with fixed betting strategy
while episode < max_episodes and not converged:

    env.reset()
    bet_index = env.bet_space.sample()  # Sample bet index from the environment
    bet_amount = env.bets[bet_index]  # Sample bet amount from the environment
    state, reward, done, _ = env.step(bet_index, action_type="bet")  # Place bet
    # print(bet_amount)
    state_features = get_state_features(state)

    # Training episode
    while not done:
        
        if state_features[0] < 12:
        # Always hit this state as it's not relevant for our training
            next_state, _, _, _ = env.step(1, action_type="move")
            next_state_features = get_state_features(next_state) if not done else None
            state = next_state
            state_features = next_state_features if next_state is not None else None
            continue
        
        # Epsilon-greedy action selection
        elif np.random.rand() < epsilon:
            action = env.move_space.sample()  # Random action
        else:
            q_values = get_q_values(state_features)
            action = np.argmax(q_values)  # Greedy action
        
        # Take action
        next_state, reward, done, _ = env.step(action, action_type="move")
        next_state_features = get_state_features(next_state) if not done else None

        # Get adaptive learning rate for this state-action pair
        adaptive_lr = get_adaptive_lr(state_features, action, lr)

        # Randomly decide which Q-table to update (Double Q-learning)
        # print(f"State: {state_features}, Action: {action}, Done: {done}, Reward: {reward}, Next State: {next_state_features}")
        if np.random.rand() < 0.5:
            print("Updating Q-table 1")
            Q = update_q_value(state_features, action, reward*bet_amount, next_state_features, adaptive_lr, Q, Q2)
        else:
            print("Updating Q-table 2")
            Q2 = update_q_value(state_features, action, reward*bet_amount, next_state_features, adaptive_lr, Q2, Q)
            
        # Track outcomes
        if done:
            if reward > 0:
                wins += 1
                money_won += reward * bet_amount
            elif reward == 0:
                draws += 1
            else:
                losses += 1
                money_lost += abs(reward) * bet_amount
        
        state = next_state
        state_features = next_state_features if next_state is not None else None
        
        if state_features is None:
            break
    
    # Decay epsilon and learning rate
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    lr = initial_lr / (1 + lr_decay_rate * episode)
    
    # Check for convergence periodically
    if episode % convergence_check_interval == 0 and episode > 0:
        # Calculate the maximum absolute difference between current and previous Q-values
        diff_stand = (Q.select('Action 0 (Stand)').to_numpy() - 
                     previous_q.select('Action 0 (Stand)').to_numpy())
        diff_hit = (Q.select('Action 1 (Hit)').to_numpy() - 
                   previous_q.select('Action 1 (Hit)').to_numpy())
        
        max_diff_stand = np.max(np.abs(diff_stand))
        max_diff_hit = np.max(np.abs(diff_hit))
        max_diff = max(max_diff_stand, max_diff_hit)
        
        if max_diff < convergence_threshold:
            convergence_count += 1
            print(f"Episode {episode}, max Q-value change: {max_diff:.6f} (convergence count: {convergence_count}/{convergence_required_count})")
            if convergence_count >= convergence_required_count:
                print(f"Converged after {episode} episodes (max Q-value change: {max_diff:.6f})")
                converged = True
        else:
            convergence_count = 0
            print(f"Episode {episode}, max Q-value change: {max_diff:.6f}")
        
        # Store current Q-values for next comparison
        previous_q = Q.clone()
    
    episode += 1

# Final statistics
total_episodes = episode
print(f"Training complete after {total_episodes} episodes.")
print(f"Win rate: {wins/total_episodes:.4f}")
print(f"Draw rate: {draws/total_episodes:.4f}")
print(f"Loss rate: {losses/total_episodes:.4f}")

Starting improved training...
Updating Q-table 2
Updating Q-value for state (20, 10, 0), action 1, reward -1, new Q-value: -0.1886699507389163
Updated Q-table for state (20, 10, 0): shape: (1, 3)
┌─────────────┬──────────────────┬────────────────┐
│ State       ┆ Action 0 (Stand) ┆ Action 1 (Hit) │
│ ---         ┆ ---              ┆ ---            │
│ list[i64]   ┆ f64              ┆ f64            │
╞═════════════╪══════════════════╪════════════════╡
│ [20, 10, 0] ┆ 0.5              ┆ -0.18867       │
└─────────────┴──────────────────┴────────────────┘
Updating Q-table 2
Updating Q-value for state (13, 10, 0), action 0, reward -1, new Q-value: -0.09852216748768475
Updated Q-table for state (13, 10, 0): shape: (1, 3)
┌─────────────┬──────────────────┬────────────────┐
│ State       ┆ Action 0 (Stand) ┆ Action 1 (Hit) │
│ ---         ┆ ---              ┆ ---            │
│ list[i64]   ┆ f64              ┆ f64            │
╞═════════════╪══════════════════╪════════════════╡
│ [13, 10, 0]

AttributeError: 'NoneType' object has no attribute 'filter'

In [None]:

# Evaluate the final policy with more episodes
print("\nFinal policy evaluation...")
eval_wins = 0
eval_draws = 0
eval_episodes = 100000

for _ in range(eval_episodes):
    state, _ = env.reset()
    state_features = get_state_features(state)
    done = False
    
    while not done:
        # Always choose the best action according to average of both Q-tables
        q_values1 = get_q_values(state_features, Q)
        q_values2 = get_q_values(state_features, Q2)
        avg_q_values = (q_values1 + q_values2) / 2
        action = np.argmax(avg_q_values)
        
        next_state, reward, done, _, _ = env.step(action)
        
        if done and reward > 0:
            eval_wins += 1
        elif done and reward == 0:
            eval_draws += 1
        
        state = next_state
        state_features = get_state_features(state)

print(f"Final evaluation complete.")
print(f"Win rate: {eval_wins/eval_episodes:.4f}")
print(f"Draw rate: {eval_draws/eval_episodes:.4f}")



Final policy evaluation...
Final evaluation complete.
Win rate: 0.4271
Draw rate: 0.0941


In [None]:

# Display policy for critical decision points
print("\nLearned Policy (Player Sum vs Dealer Card):")
print("Player Sum | Dealer's Card | Usable Ace | Best Action | Q(stand) | Q(hit)")
print("-" * 75)

for player_sum in [12, 13, 14, 15, 16, 17, 18, 19, 20]:
    for dealer_card in [1, 6, 10]:  # Dealer showing Ace, 6, 10
        for usable_ace in [0, 1]:    # Hard and soft totals
            state = (player_sum, dealer_card, usable_ace)
            q_values1 = get_q_values(state, Q)
            q_values2 = get_q_values(state, Q2)
            avg_q_values = (q_values1 + q_values2) / 2
            best_action = "Hit" if np.argmax(avg_q_values) == 1 else "Stand"
            print(f"{player_sum:10d} | {dealer_card:12d} | {usable_ace:10d} | {best_action:10s} | {avg_q_values[0]:7.4f} | {avg_q_values[1]:7.4f}")



Learned Policy (Player Sum vs Dealer Card):
Player Sum | Dealer's Card | Usable Ace | Best Action | Q(stand) | Q(hit)
---------------------------------------------------------------------------
        12 |            1 |          0 | Hit        | -0.6659 | -0.4954
        12 |            1 |          1 | Hit        | -0.1234 | -0.1141
        12 |            6 |          0 | Stand      | -0.1501 | -0.2494
        12 |            6 |          1 | Hit        | -0.0015 |  0.0942
        12 |           10 |          0 | Hit        | -0.5323 | -0.4184
        12 |           10 |          1 | Hit        | -0.1748 | -0.1282
        13 |            1 |          0 | Hit        | -0.7091 | -0.5380
        13 |            1 |          1 | Stand      | -0.1076 | -0.1474
        13 |            6 |          0 | Stand      | -0.1514 | -0.3095
        13 |            6 |          1 | Hit        |  0.0183 |  0.1244
        13 |           10 |          0 | Hit        | -0.6062 | -0.4541
        13 | 

In [None]:
# Calculate average Q-values (ensemble approach)
avg_Q = Q.copy()
avg_Q['Action 0 (Stand)'] = (Q['Action 0 (Stand)'] + Q2['Action 0 (Stand)']) / 2
avg_Q['Action 1 (Hit)'] = (Q['Action 1 (Hit)'] + Q2['Action 1 (Hit)']) / 2
avg_Q['Best Action'] = avg_Q.apply(
    lambda row: "Stand" if row['Action 0 (Stand)'] > row['Action 1 (Hit)'] else "Hit", 
    axis=1
)

In [None]:
Q.to_csv('blackjack_q_table1_cpu.csv', index=False)
Q2.to_csv('blackjack_q_table2_cpu.csv', index=False)
avg_Q.to_csv('blackjack_avg_q_table_cpu.csv', index=False)

In [None]:

# Close environment
env.close()
