In [2]:
import numpy as np
import pandas as pd
from multi_deck_env import BlackjackMultiDeckEnv


In [3]:

# Create environment with 6 decks (standard casino configuration).
# This one `env` instance is shared by the training loop and by every
# evaluation pass further down in the notebook.
env = BlackjackMultiDeckEnv(num_decks=6)


In [4]:

# Modified state representation - focusing on meaningful game states
def get_state_features(full_state):
    """Reduce an environment observation to the key used by the Q-tables.

    Args:
        full_state: Indexable observation with at least three entries.

    Returns:
        A 3-tuple of the first three entries, which this notebook treats as
        (player sum, dealer card, usable ace). Any further entries are dropped.
    """
    player_sum, dealer_card, usable_ace = full_state[0], full_state[1], full_state[2]
    return player_sum, dealer_card, usable_ace


In [5]:

# Enumerate every (player_sum, dealer_card, usable_ace) key the Q-tables track.
# The order of the three groups below fixes the row order of any table built
# from this list, so it must not be rearranged.
states = []

# Hard totals 12-21 (no usable ace) against each dealer card 1-10.
states.extend(
    (total, upcard, 0)
    for total in range(12, 22)
    for upcard in range(1, 11)
)

# Soft totals 12-21 (usable ace). A soft hand cannot total less than 12,
# so there is nothing to enumerate below that.
states.extend(
    (total, upcard, 1)
    for total in range(12, 22)
    for upcard in range(1, 11)
)

# Hard totals 4-11, included for completeness (hitting cannot bust here).
# Only the hard variant exists for these sums, so no usable-ace flag is iterated.
states.extend(
    (total, upcard, 0)
    for total in range(4, 12)
    for upcard in range(1, 11)
)


In [6]:

# Build the Q-table with a prior instead of all zeros:
#   player sum >= 20 -> Stand starts at 0.5, Hit at -0.1
#   player sum <  12 -> Hit starts at 0.5, Stand at -0.1
#   everything else  -> both actions start at 0.0
player_sums = np.array([state_key[0] for state_key in states])
is_high_sum = player_sums >= 20
is_low_sum = player_sums < 12

q_data = {
    'State': states,
    'Action 0 (Stand)': np.where(is_high_sum, 0.5, np.where(is_low_sum, -0.1, 0.0)),
    'Action 1 (Hit)': np.where(is_high_sum, -0.1, np.where(is_low_sum, 0.5, 0.0)),
}

Q = pd.DataFrame(q_data)

# Double Q-learning keeps a second, independently updated table; it starts
# as a copy of the first so both share the same prior.
Q2 = Q.copy()


In [7]:

# Training hyperparameters
initial_lr = 0.1            # base learning rate before any decay
lr_decay_rate = 5e-05       # per-episode decay coefficient for the base learning rate
gamma = 0.95                # discount factor applied to the bootstrapped next-state value
n_episodes = 1_000_000      # upper bound on training episodes
initial_epsilon = 1.0       # exploration starts fully random
epsilon_min = 0.01          # floor for the exploration rate
epsilon_decay = 0.99995     # multiplicative epsilon decay applied once per episode

# Per (state, action) visit counters, used to adapt the learning rate.
# Action 0 is Stand and action 1 is Hit; every tracked state starts at zero.
visit_counts = {
    (state_key, action_id): 0
    for state_key in states
    for action_id in (0, 1)
}


In [14]:

def get_adaptive_lr(state, action, base_lr):
    """Return the learning rate to use for one (state, action) update.

    The rate shrinks as the pair is visited more often, following
    ``base_lr / (1 + 0.005 * (visits + 1))``, but never drops below 10% of
    ``base_lr``. Pairs missing from ``visit_counts`` count as never visited.
    """
    visits_so_far = visit_counts.get((state, action), 0)
    decayed_lr = base_lr / (1 + 0.005 * (visits_so_far + 1))
    lr_floor = base_lr * 0.1
    return max(decayed_lr, lr_floor)

def get_q_values(state_features, q_table=None):
    """Look up the [stand, hit] Q-values for a state.

    Args:
        state_features: State key tuple as stored in the table's 'State'
            column; its first entry is the player sum.
        q_table: DataFrame with 'State', 'Action 0 (Stand)' and
            'Action 1 (Hit)' columns. When omitted, the module-level ``Q`` is
            looked up at call time, so re-running the cell that rebuilds ``Q``
            is picked up without redefining this function.

    Returns:
        A length-2 NumPy array ``[Q(stand), Q(hit)]``. For a state that is not
        in the table a prior is returned instead: favour Hit below 12, favour
        Stand at 20 or more, neutral otherwise. The fallback is an array as
        well, so callers can safely do arithmetic such as ``(q1 + q2) / 2``.
    """
    if q_table is None:
        # Late binding: a `q_table=Q` default would freeze whichever object
        # `Q` referred to when this function was defined.
        q_table = Q

    matches = q_table.loc[
        q_table['State'] == state_features,
        ['Action 0 (Stand)', 'Action 1 (Hit)'],
    ]
    if matches.empty:
        # Unknown state: fall back to the same prior used to seed the table.
        player_sum = state_features[0]
        if player_sum < 12:
            return np.array([-0.1, 0.5])  # Default to hit for low sums
        if player_sum >= 20:
            return np.array([0.5, -0.1])  # Default to stand for high sums
        return np.array([0.0, 0.0])  # Neutral for middle sums
    return matches.to_numpy()[0]

def update_q_value(state_features, action, reward, next_state_features, lr, q_table=None, q_table_target=None):
    """Apply one Double Q-learning update to ``q_table`` in place.

    Args:
        state_features: State key of the row to update.
        action: 1 updates the 'Action 1 (Hit)' column; any other value
            updates 'Action 0 (Stand)'.
        reward: Reward observed for taking ``action`` in ``state_features``.
        next_state_features: Key of the successor state, or ``None`` when the
            transition ended the episode (no bootstrapping is done then).
        lr: Learning rate for this update.
        q_table: Table that is updated and that selects the best next action.
            Defaults to the module-level ``Q``, resolved at call time.
        q_table_target: Table that evaluates the selected next action.
            Defaults to the module-level ``Q2``, resolved at call time.

    Returns:
        None. If ``state_features`` is not a row of ``q_table`` nothing is
        changed and the visit counter is left untouched.
    """
    # Late binding: `q_table=Q` / `q_table_target=Q2` defaults would freeze the
    # objects that existed when this function was defined, so re-running the
    # cell that rebuilds the tables would leave the defaults pointing at
    # orphaned frames.
    if q_table is None:
        q_table = Q
    if q_table_target is None:
        q_table_target = Q2

    # Find the row for the current state
    state_idx = q_table.index[q_table['State'] == state_features].tolist()
    if not state_idx:
        return  # State not in our table
    row_label = state_idx[0]

    # Determine which action column to update
    action_col = 'Action 1 (Hit)' if action == 1 else 'Action 0 (Stand)'

    # Current Q-value
    current_q = q_table.loc[row_label, action_col]

    if next_state_features is None:
        # Terminal transition - the target is just the reward
        new_q = current_q + lr * (reward - current_q)
    else:
        # Double Q-learning: the table being updated picks the next action...
        next_q_values = get_q_values(next_state_features, q_table)
        best_next_action = np.argmax(next_q_values)

        # ...and the other table supplies that action's value
        next_q_values_target = get_q_values(next_state_features, q_table_target)
        max_next_q = next_q_values_target[best_next_action]

        new_q = current_q + lr * (reward + gamma * max_next_q - current_q)

    # Update the Q-table
    q_table.loc[row_label, action_col] = new_q

    # Track visit counts (one shared counter, whichever table was updated)
    visit_counts[(state_features, action)] = visit_counts.get((state_features, action), 0) + 1


In [15]:

# Training loop with convergence check.
#
# Each iteration runs one epsilon-greedy training episode and updates one of the
# two Double-Q tables (chosen at random per step). Every `eval_intervals`
# episodes a greedy evaluation is run on the same `env`; every
# `convergence_check_interval` episodes the largest change in Q is compared with
# `convergence_threshold`.
print("Starting improved training...")
wins = 0
draws = 0
losses = 0
epsilon = initial_epsilon
lr = initial_lr

# Parameters for convergence
convergence_threshold = 0.0005  # Lower threshold for better stability
convergence_check_interval = 10000  # Check for convergence every N episodes
convergence_required_count = 3  # Number of consecutive checks below threshold to confirm convergence
max_episodes = n_episodes  # Maximum episodes as a fallback

# Keep a copy of the previous Q-table for comparison
previous_q = Q.copy()
convergence_count = 0
converged = False
episode = 0

# Add simple evaluation every 50k episodes
eval_intervals = 50000
last_win_rate = 0

while episode < max_episodes and not converged:
    # Progress reporting
    if episode % 10000 == 0:
        print(f"Episode {episode}, epsilon: {epsilon:.4f}, lr: {lr:.6f}, wins: {wins}, draws: {draws}, losses: {losses}")

    # Run quick evaluation every eval_intervals episodes.
    # This must happen BEFORE the training episode is reset: the evaluation
    # plays its games on the same `env`, so resetting first would leave the
    # training episode stepping a finished evaluation game with a stale state.
    if episode % eval_intervals == 0 and episode > 0:
        eval_wins = 0
        eval_draws = 0
        eval_episodes = 10000

        for _ in range(eval_episodes):
            eval_state, _ = env.reset()
            eval_state_features = get_state_features(eval_state)
            eval_done = False

            while not eval_done:
                # Always choose the best action
                q_values = get_q_values(eval_state_features)
                eval_action = np.argmax(q_values)
                eval_next_state, eval_reward, eval_done, _, _ = env.step(eval_action)

                if eval_done and eval_reward > 0:
                    eval_wins += 1
                elif eval_done and eval_reward == 0:
                    eval_draws += 1

                eval_state = eval_next_state
                eval_state_features = get_state_features(eval_state)

        win_rate = eval_wins / eval_episodes
        draw_rate = eval_draws / eval_episodes
        print(f"EVALUATION: Win rate: {win_rate:.4f}, Draw rate: {draw_rate:.4f}")

        # Check if win rate improvement has plateaued
        if abs(win_rate - last_win_rate) < 0.001 and episode > 300000:
            print(f"Win rate improvement has plateaued at {win_rate:.4f}")
            # Optionally break if no improvement
            # converged = True

        last_win_rate = win_rate

    # Start the training episode only now, so `state_features` matches the
    # game the environment is actually holding.
    state, _ = env.reset()
    state_features = get_state_features(state)
    done = False

    # Training episode
    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Random action
        else:
            q_values = get_q_values(state_features)
            action = np.argmax(q_values)  # Greedy action

        # Take action; a finished episode has no successor state to bootstrap from
        next_state, reward, done, _, _ = env.step(action)
        next_state_features = get_state_features(next_state) if not done else None

        # Get adaptive learning rate for this state-action pair
        adaptive_lr = get_adaptive_lr(state_features, action, lr)

        # Randomly decide which Q-table to update (Double Q-learning)
        if np.random.rand() < 0.5:
            update_q_value(state_features, action, reward, next_state_features, adaptive_lr, Q, Q2)
        else:
            update_q_value(state_features, action, reward, next_state_features, adaptive_lr, Q2, Q)

        # Track outcomes
        if done:
            if reward > 0:
                wins += 1
            elif reward == 0:
                draws += 1
            else:
                losses += 1

        state = next_state
        state_features = next_state_features

    # Decay epsilon and learning rate
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    lr = initial_lr / (1 + lr_decay_rate * episode)

    # Check for convergence periodically (only Q is compared, not Q2)
    if episode % convergence_check_interval == 0 and episode > 0:
        # Calculate the maximum absolute difference between current and previous Q-values
        max_diff_stand = np.max(np.abs(Q['Action 0 (Stand)'] - previous_q['Action 0 (Stand)']))
        max_diff_hit = np.max(np.abs(Q['Action 1 (Hit)'] - previous_q['Action 1 (Hit)']))
        max_diff = max(max_diff_stand, max_diff_hit)

        if max_diff < convergence_threshold:
            convergence_count += 1
            print(f"Episode {episode}, max Q-value change: {max_diff:.6f} (convergence count: {convergence_count}/{convergence_required_count})")
            if convergence_count >= convergence_required_count:
                print(f"Converged after {episode} episodes (max Q-value change: {max_diff:.6f})")
                converged = True
        else:
            convergence_count = 0
            print(f"Episode {episode}, max Q-value change: {max_diff:.6f}")

        # Store current Q-values for next comparison
        previous_q = Q.copy()

    episode += 1

# Final statistics (rates over all training episodes, exploration included)
total_episodes = episode
print(f"Training complete after {total_episodes} episodes.")
print(f"Win rate: {wins/total_episodes:.4f}")
print(f"Draw rate: {draws/total_episodes:.4f}")
print(f"Loss rate: {losses/total_episodes:.4f}")


Starting improved training...
Episode 0, epsilon: 1.0000, lr: 0.100000, wins: 0, draws: 0, losses: 0
Episode 10000, epsilon: 0.6065, lr: 0.066669, wins: 3061, draws: 482, losses: 6457
Episode 10000, max Q-value change: 0.832371
Episode 20000, epsilon: 0.3679, lr: 0.050001, wins: 6584, draws: 1139, losses: 12277
Episode 20000, max Q-value change: 0.463592
Episode 30000, epsilon: 0.2231, lr: 0.040001, wins: 10350, draws: 1876, losses: 17774
Episode 30000, max Q-value change: 0.276282
Episode 40000, epsilon: 0.1353, lr: 0.033334, wins: 14239, draws: 2731, losses: 23030
Episode 40000, max Q-value change: 0.219286
Episode 50000, epsilon: 0.0821, lr: 0.028572, wins: 18337, draws: 3627, losses: 28036
EVALUATION: Win rate: 0.4267, Draw rate: 0.0969
Episode 50000, max Q-value change: 0.211416
Episode 60000, epsilon: 0.0498, lr: 0.025000, wins: 22496, draws: 4523, losses: 32981
Episode 60000, max Q-value change: 0.193939
Episode 70000, epsilon: 0.0302, lr: 0.022222, wins: 26623, draws: 5433, los

In [16]:

# Evaluate the final policy with more episodes.
# Actions are chosen greedily from the average of the two Double-Q tables;
# no exploration and no learning happens here.
print("\nFinal policy evaluation...")
eval_wins = 0
eval_draws = 0
eval_episodes = 100000

for _ in range(eval_episodes):
    state, _ = env.reset()
    state_features = get_state_features(state)
    done = False

    while not done:
        # Coerce to arrays before averaging: get_q_values may hand back plain
        # lists for a state missing from the table, and list + list would
        # concatenate (then `/ 2` raises TypeError) instead of adding.
        q_values1 = np.asarray(get_q_values(state_features, Q), dtype=float)
        q_values2 = np.asarray(get_q_values(state_features, Q2), dtype=float)
        avg_q_values = (q_values1 + q_values2) / 2
        action = np.argmax(avg_q_values)

        next_state, reward, done, _, _ = env.step(action)

        # Only the terminal reward decides the outcome; losses are the remainder
        if done and reward > 0:
            eval_wins += 1
        elif done and reward == 0:
            eval_draws += 1

        state = next_state
        state_features = get_state_features(state)

print("Final evaluation complete.")
print(f"Win rate: {eval_wins/eval_episodes:.4f}")
print(f"Draw rate: {eval_draws/eval_episodes:.4f}")



Final policy evaluation...
Final evaluation complete.
Win rate: 0.4271
Draw rate: 0.0941


In [17]:

# Print the greedy action and the averaged Q-values for a sample of states:
# player sums 12-20, dealer showing Ace (1), 6 or 10, hard and soft hands.
print("\nLearned Policy (Player Sum vs Dealer Card):")
print("Player Sum | Dealer's Card | Usable Ace | Best Action | Q(stand) | Q(hit)")
print("-" * 75)

policy_rows = [
    (total, upcard, ace_flag)
    for total in range(12, 21)
    for upcard in (1, 6, 10)
    for ace_flag in (0, 1)
]

for state in policy_rows:
    player_sum, dealer_card, usable_ace = state
    q_values1 = get_q_values(state, Q)
    q_values2 = get_q_values(state, Q2)
    # Ensemble of the two Double-Q tables
    avg_q_values = (q_values1 + q_values2) / 2
    best_action = "Stand" if np.argmax(avg_q_values) != 1 else "Hit"
    print(f"{player_sum:10d} | {dealer_card:12d} | {usable_ace:10d} | {best_action:10s} | {avg_q_values[0]:7.4f} | {avg_q_values[1]:7.4f}")



Learned Policy (Player Sum vs Dealer Card):
Player Sum | Dealer's Card | Usable Ace | Best Action | Q(stand) | Q(hit)
---------------------------------------------------------------------------
        12 |            1 |          0 | Hit        | -0.6659 | -0.4954
        12 |            1 |          1 | Hit        | -0.1234 | -0.1141
        12 |            6 |          0 | Stand      | -0.1501 | -0.2494
        12 |            6 |          1 | Hit        | -0.0015 |  0.0942
        12 |           10 |          0 | Hit        | -0.5323 | -0.4184
        12 |           10 |          1 | Hit        | -0.1748 | -0.1282
        13 |            1 |          0 | Hit        | -0.7091 | -0.5380
        13 |            1 |          1 | Stand      | -0.1076 | -0.1474
        13 |            6 |          0 | Stand      | -0.1514 | -0.3095
        13 |            6 |          1 | Hit        |  0.0183 |  0.1244
        13 |           10 |          0 | Hit        | -0.6062 | -0.4541
        13 | 

In [20]:
# Ensemble the two Double-Q tables: average each action column row by row and
# label every state with the action whose averaged value is larger.
# Stand must be strictly greater to win, so ties are labelled "Hit".
avg_Q = Q.copy()
for action_column in ('Action 0 (Stand)', 'Action 1 (Hit)'):
    avg_Q[action_column] = (Q[action_column] + Q2[action_column]) / 2

avg_Q['Best Action'] = [
    "Stand" if stand_value > hit_value else "Hit"
    for stand_value, hit_value in zip(avg_Q['Action 0 (Stand)'], avg_Q['Action 1 (Hit)'])
]

In [21]:
# Write both Double-Q tables and their average to CSV, without the row index.
for table, csv_path in (
    (Q, 'blackjack_q_table1_cpu.csv'),
    (Q2, 'blackjack_q_table2_cpu.csv'),
    (avg_Q, 'blackjack_avg_q_table_cpu.csv'),
):
    table.to_csv(csv_path, index=False)

In [None]:

# Close the environment now that training, evaluation and export are done.
# Run this last: the cells above all step the same `env` instance.
env.close()
