In [None]:
import numpy as np
import pandas as pd
import polars as pl
import os
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import random as rnd
from collections import defaultdict
os.chdir('..')
os.chdir('src')
from env import BlackjackEnv


TODO: try these bet-amount sets: 1,2,4,8
1,2,3,4
1,2,4,6

#### Heuristics about betting and true count
The most common strategy for betting based on the true count is to scale your base bet (say 1) by a multiplier that depends on the count.
A reasonable choice for a simple betting strategy is the following:
- Non-positive true count (0 or below): 1x base bet
- Positive mid true count (1 to 5): 4x base bet (could be split into 2x and 4x)
- Positive high true count (6 or above): 8x base bet \
Our goal: make the agent learn this betting strategy, which should yield a higher expected reward than simply betting 1 \
Assumption: all games are played according to the basic strategy

### Load playing strategies

In [103]:
def parse_strategy_csv(file_path):
    """Load a playing-strategy CSV into a nested dictionary.

    Each row must have a ``State`` column formatted like ``'(12, 10, 0)'``
    (player sum, dealer card, usable ace) plus the columns
    ``'Action 0 (Stand)'`` and ``'Action 1 (Hit)'``. The column
    ``'Action 2 (Double)'`` is optional; when it is missing the double
    value is stored as ``None``.

    Returns:
        dict mapping ``(player_sum, dealer_card, usable_ace)`` to
        ``{0: stand_value, 1: hit_value, 2: double_value}``. If anything
        goes wrong while reading or parsing, the error is printed and an
        empty dict is returned (partial results are discarded).
    """
    try:
        table = pl.read_csv(file_path)
        parsed = {}

        for record in table.iter_rows(named=True):
            # 'State' is the textual form of a tuple: drop the parentheses
            # and split on ', ' to get the three integer components.
            fields = record['State'].strip('()').split(', ')
            key = (int(fields[0]), int(fields[1]), int(fields[2]))

            stand_value = record['Action 0 (Stand)']
            hit_value = record['Action 1 (Hit)']
            # The double column only exists in strategies that allow doubling.
            has_double = 'Action 2 (Double)' in record
            double_value = record['Action 2 (Double)'] if has_double else None

            parsed[key] = {0: stand_value, 1: hit_value, 2: double_value}

        return parsed
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty table.
        print(f"Error reading {file_path}: {e}")
        return {}

In [104]:
# Walk from the current working directory (src/, set by the import cell)
# up one level and into the sibling strategies/ directory.
for hop in ('..', 'strategies'):
    os.chdir(hop)

# Load both playing strategies with the same parser.
q_table_strat, basic_strat = map(
    parse_strategy_csv,
    ('dq_h_s_dd_strat.csv', 'basic_strat_doub.csv'),
)

### Q table initialization for betting

In [105]:
# Settings dictionary handed to BlackjackEnv.
config = {
    "num_decks": 6,
    # NOTE(review): presumably the cut-card position as a fraction of the shoe — confirm against BlackjackEnv.
    "red_card_position": 0.2,
    # Allowed bet amounts; the betting Q-tables below are keyed by these values.
    "bet_size": [1, 4, 8],
    "actions": ["stand", "hit", "double"],
    "num_players": 1
}

env = BlackjackEnv(config)

In [106]:
# Two independent betting Q-tables, keyed first by the discretized
# true-count bucket (0, 1, 2) and then by the bet amount.
q_table_bet_1 = {}
q_table_bet_2 = {}

for i in range(3):
    # Prior values encoding betting intuition:
    #   bucket 0      -> minimum bet favoured (+0.5), larger bets penalised (-0.5)
    #   buckets 1, 2  -> bets above 1 get a bonus growing with bet size and bucket
    # Every other combination starts at 0.0.
    priors = {}
    for bet in config["bet_size"]:
        if i >= 1 and bet > 1:
            priors[bet] = 0.1 * bet * i / 5
        elif i == 0 and bet == 1:
            priors[bet] = 0.5
        elif i == 0 and bet > 1:
            priors[bet] = -0.5
        else:
            priors[bet] = 0.0
    # Each table gets its own copy so later updates stay independent.
    q_table_bet_1[i] = dict(priors)
    q_table_bet_2[i] = dict(priors)

# Per-bucket visit counter consumed by smart_exploration.
true_count_visits = dict.fromkeys(range(3), 0)

In [107]:
# Hyperparameters for the betting Q-learning loop
initial_alpha = 0.1  # Starting learning rate
min_alpha = 0.001  # Floor for the learning rate
decay_rate = 0.999995  # Learning-rate decay base (raised to the episode number in get_adaptive_lr)
gamma = 1.0  # Discount factor applied to the next-state value in update_q_value
starting_epsilon = 1.0  # Initial exploration rate
epsilon_decay = 0.99999  # Multiplicative epsilon decay applied once per episode
min_epsilon = 0.01  # Floor for the exploration rate

### Training functions

In [108]:
def get_true_count(observation):
    """Return the ``"true_count"`` entry of an environment observation."""
    return observation["true_count"]

In [109]:
def get_state_features(observation):
    """Build the strategy-table key from an observation.

    Returns the tuple ``(player_score, dealer_upcard, soft_hand)`` read from
    the observation, i.e. player sum, dealer card and usable-ace flag.
    """
    wanted = ("player_score", "dealer_upcard", "soft_hand")
    return tuple(observation[field] for field in wanted)

In [110]:
def get_q_values(state_features, q_table=None):
    """Return the action values for a state as a 3-element NumPy array.

    Args:
        state_features: ``(player_sum, dealer_card, usable_ace)`` tuple.
        q_table: strategy dictionary as produced by ``parse_strategy_csv``.
            When omitted, the module-level ``q_table_strat`` is used. It is
            looked up at call time, so re-running the loading cell takes
            effect without redefining this function.

    Returns:
        ``np.array([stand, hit, double])``. An action whose stored value is
        ``None`` (e.g. a strategy file without a double column) is reported
        as ``-inf`` so that ``np.argmax`` never selects it and never has to
        compare ``None`` with a number. For states missing from the table a
        default based on the player sum is returned.
    """
    if q_table is None:
        q_table = q_table_strat

    if state_features in q_table:
        action_values = q_table[state_features]
        return np.array([
            -np.inf if action_values[action] is None else action_values[action]
            for action in (0, 1, 2)
        ])

    # Unknown state: fall back to a default keyed on the player sum.
    player_sum = state_features[0]
    if player_sum < 12:
        return np.array([-0.1, 0.5, 0.0])  # Default to hit for low sums
    elif player_sum >= 20:
        return np.array([0.5, -0.1, 0.0])  # Default to stand for high sums
    else:
        return np.array([0.0, 0.0, 0.0])  # Neutral for middle sums

In [111]:
def update_q_value(true_count, bet_amount, reward, next_true_count, lr, terminal=False, q_table=None, target_q_table=None, clip_reward=True, discount=None):
    """Update one (true_count, bet_amount) entry using Double Q-learning.

    Args:
        true_count: discretized true-count bucket of the decision.
        bet_amount: bet chosen in that bucket (key into the inner dict).
        reward: reward observed for the hand.
        next_true_count: bucket after the hand, or ``None``.
        lr: learning rate for this update.
        terminal: when true (or when ``next_true_count`` is ``None``) no
            bootstrapped future value is added.
        q_table: table being updated; it is modified in place.
        target_q_table: the other table, used only to evaluate the greedy
            next bet selected from ``q_table``.
        clip_reward: when true (default, the historical behaviour) the
            reward is reduced to its sign (+1 / -1 / unchanged 0), so the
            learned value does not depend on the payout size. Pass
            ``False`` to learn from the raw reward.
        discount: discount factor for the bootstrapped term; defaults to
            the module-level ``gamma``.

    Returns:
        The same ``q_table`` object, after the in-place update.

    Raises:
        TypeError: if ``q_table`` is missing, or ``target_q_table`` is
            missing for a non-terminal update.
    """
    if q_table is None:
        raise TypeError("update_q_value() requires q_table")

    current_q = q_table[true_count][bet_amount]

    if clip_reward:
        if reward > 0:
            reward = 1.0
        elif reward < 0:
            reward = -1.0

    if terminal or next_true_count is None:
        # Terminal transition: the target is just the (possibly clipped) reward.
        td_target = reward
    else:
        if target_q_table is None:
            raise TypeError("update_q_value() requires target_q_table for non-terminal updates")
        if discount is None:
            discount = gamma
        # Double Q-learning: pick the greedy next bet with the table being
        # updated, but read its value from the other table.
        next_values = q_table[next_true_count]
        best_next_bet = max(next_values, key=next_values.get)
        td_target = reward + discount * target_q_table[next_true_count][best_next_bet]

    q_table[true_count][bet_amount] = current_q + lr * (td_target - current_q)

    return q_table

In [112]:
def smart_exploration(true_count, epsilon, q_table_1, q_table_2):
    """Choose a bet for ``true_count`` with visit-aware epsilon-greedy.

    Reads the module-level ``config["bet_size"]`` and reads and increments
    the module-level ``true_count_visits`` counter for ``true_count``.

    Returns:
        The selected bet amount (an element of ``config["bet_size"]``).
    """
    # Buckets seen less often get a larger bonus; the sum is capped at 0.9.
    times_seen = true_count_visits[true_count]
    explore_prob = min(0.9, epsilon + 1.0 / (1.0 + times_seen / 1000))

    if np.random.rand() < explore_prob:
        # Exploration: sample a bet, tilting the distribution by bucket.
        n_bets = len(config["bet_size"])
        if true_count >= 2:
            # High bucket: weight rises with bet size.
            weights = np.linspace(0.1, 0.9, n_bets)
        elif true_count == 0:
            # Lowest bucket: weight falls with bet size.
            weights = np.linspace(0.9, 0.1, n_bets)
        else:
            # Middle bucket: uniform.
            weights = np.full(n_bets, 1.0 / n_bets)

        weights /= weights.sum()  # make the weights a probability vector
        bet_amount = np.random.choice(config["bet_size"], p=weights)
    else:
        # Exploitation: greedy with respect to the mean of both tables.
        def mean_q(bet):
            return (q_table_1[true_count][bet] + q_table_2[true_count][bet]) / 2

        bet_amount = max(config["bet_size"], key=mean_q)

    # Record the visit only after a bet has been chosen.
    true_count_visits[true_count] += 1

    return bet_amount

In [113]:
def get_adaptive_lr(initial_lr, min_lr, decay_rate, episode, visits=None):
    """Return the learning rate, never below ``min_lr``.

    With more than 100 ``visits`` the rate is
    ``initial_lr * 0.999 ** (visits // 100)``; otherwise (including
    ``visits`` of ``None`` or 0) it is ``initial_lr * decay_rate ** episode``.
    """
    visit_driven = bool(visits) and visits > 100
    if visit_driven:
        factor = 0.999 ** (visits // 100)
    else:
        factor = decay_rate ** episode
    return max(min_lr, initial_lr * factor)

### Training process

In [114]:
def discretize_true_count(tc):
    """Map a true count to a bucket: 0 for ``tc <= 1``, 1 for ``1 < tc < 5``, 2 otherwise."""
    if tc <= 1:
        return 0
    if tc < 5:
        return 1
    return 2

In [None]:
# Re-create the configuration and environment for training (same values as the earlier config cell).
config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": [1, 4, 8],
    "actions": ["stand", "hit", "double"],
    "num_players": 1
}
env = BlackjackEnv(config)

# Implement the learning process for betting
# NOTE(review): q_table_bet_1 / q_table_bet_2 and true_count_visits come from earlier cells and are
# mutated here, so re-running only this cell continues from the previous run's state.
print("\nStarting betting learning process...")

episodes = 4000000
epsilon = starting_epsilon
alpha = initial_alpha
episode = 0

# Running totals over all training episodes (exploratory bets included).
stats = {
    'wins': 0,
    'draws': 0,
    'losses': 0,
    'money won': 0,
    'money lost': 0,
}

# One iteration = one hand: choose a bet, play the hand, update one of the two betting Q-tables.
while episode < episodes:
    # Reset the environment and get the true count
    observation = env.reset()
    true_count = get_true_count(observation)
    # true_count = int(true_count)  # Convert true count to integer for indexing
    true_count = discretize_true_count(true_count)

    bet_amount = smart_exploration(true_count, epsilon, q_table_bet_1, q_table_bet_2)

    # Perform the betting action
    # The environment is given the index of the bet inside config["bet_size"], not the amount.
    bet_index = config["bet_size"].index(bet_amount)
    state, reward, done, _ = env.step(bet_index, action_type="bet")

    if done:
        # The hand ended on the bet step itself.
        # Update Q-value using the reward
        next_true_count = discretize_true_count(state["true_count"])
        # Double Q-learning: a coin flip decides which table is updated; the other is passed as target.
        if np.random.rand() < 0.5: #epsilon
            q_table_bet_1 = update_q_value(true_count, bet_amount, reward, next_true_count, alpha, terminal=True, q_table=q_table_bet_1, target_q_table=q_table_bet_2)
        else:
            q_table_bet_2 = update_q_value(true_count, bet_amount, reward, next_true_count, alpha, terminal=True, q_table=q_table_bet_2, target_q_table=q_table_bet_1)
        # Money totals scale the environment reward by the chosen bet.
        if reward > 0:
            stats['wins'] += 1
            stats['money won'] += reward * bet_amount
        elif reward == 0:
            stats['draws'] += 1
        else:
            stats['losses'] += 1
            stats['money lost'] += abs(reward) * bet_amount
    else:
        # Continue playing the game according to the strategy in basic_strat
        state_features = get_state_features(state)
        # NOTE(review): double_down is set below but never read afterwards.
        double_down = False
        while not done:

            if state_features[0] < 9:
            # Always hit this state as it's not relevant for our training
                # NOTE(review): the reward and done flag of this step are discarded, so `done` is
                # still False on the next line and the `else None` branches are never taken.
                next_state, _, _, _ = env.step(1, action_type="move")
                next_state_features = get_state_features(next_state) if not done else None
                state = next_state
                state_features = next_state_features if next_state is not None else None
                continue

            # Choose the best action based on basic_strat
            q_values = get_q_values(state_features, basic_strat)
            action = np.argmax(q_values)

            if action == 2:
                # Double down action
                double_down = True

            # Perform the action
            next_state, reward, done, _ = env.step(action, action_type="move")
            state = next_state
            state_features = get_state_features(state)
        
        next_true_count = discretize_true_count(state["true_count"])

        # `done` is always True here (the loop above only exits when it is), so this is a terminal update.
        # NOTE(review): update_q_value reduces the reward to its sign, so the learned values do not
        # depend on the bet size — confirm this is intended for a bet-sizing policy.
        if np.random.rand() < 0.5:
            q_table_bet_1 = update_q_value(true_count, bet_amount, reward, next_true_count, alpha, terminal=done, q_table=q_table_bet_1, target_q_table=q_table_bet_2)
        else:
            q_table_bet_2 = update_q_value(true_count, bet_amount, reward, next_true_count, alpha, terminal=done, q_table=q_table_bet_2, target_q_table=q_table_bet_1)


        # Update metrics
        if reward > 0:
            stats['wins'] += 1
            stats['money won'] += reward * bet_amount
        elif reward == 0:
            stats['draws'] += 1
        else:
            stats['losses'] += 1
            stats['money lost'] += abs(reward) * bet_amount

    # Decay epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
    # NOTE(review): alpha is derived from the visit count of the bucket just played, but it is
    # applied to the NEXT episode's update, which may belong to a different bucket.
    alpha = get_adaptive_lr(
        initial_alpha, 
        min_alpha, 
        decay_rate, 
        episode,
        true_count_visits[true_count]
    )
    episode += 1
    # Print progress every 10000 episodes
    # Only table 1 and only the bucket of the most recent episode are shown.
    if episode % 10000 == 0:
        print(f"Current Q-values for True Count {true_count}: {q_table_bet_1[true_count]}")
        print(f"Episode {episode}/{episodes} - Epsilon: {epsilon:.4f}, Alpha: {alpha:.4f},")

print("Betting learning process complete.")
# Print statistics
print(f"Total Wins: {stats['wins']}")
print(f"Total Draws: {stats['draws']}")
print(f"Total Losses: {stats['losses']}")
print(f"Total Money Won: {stats['money won']}")
print(f"Total Money Lost: {stats['money lost']}")
# Net money per training episode.
print(f"Average reward: ({(stats['money won'] - stats['money lost']) / episodes:.4f})")


Starting betting learning process...
Current Q-values for True Count 0: {1: 0.12025102848155456, 4: 0.22804644394162227, 8: -0.0410032195979468}
Episode 10000/4000000 - Epsilon: 0.9048, Alpha: 0.0930,
Current Q-values for True Count 0: {1: -0.4345273277585509, 4: 0.37702755894434387, 8: 0.05389634952205917}
Episode 20000/4000000 - Epsilon: 0.8187, Alpha: 0.0863,
Current Q-values for True Count 0: {1: -0.4777702695585443, 4: -0.2377821292427275, 8: 0.11129213263565475}
Episode 30000/4000000 - Epsilon: 0.7408, Alpha: 0.0802,
Current Q-values for True Count 0: {1: 0.3782135820409602, 4: -0.05202993597149645, 8: -0.25232520434180394}
Episode 40000/4000000 - Epsilon: 0.6703, Alpha: 0.0745,
Current Q-values for True Count 0: {1: -0.04044020617767699, 4: 0.14848006017386178, 8: 0.08225873616568828}
Episode 50000/4000000 - Epsilon: 0.6065, Alpha: 0.0695,
Current Q-values for True Count 0: {1: -0.1211053747565313, 4: 0.022174764992069506, 8: -0.3089480065406853}
Episode 60000/4000000 - Epsilon

In [116]:
# How many times smart_exploration was called for each discretized true-count bucket.
print(true_count_visits)

{0: 2877286, 1: 980279, 2: 142435}


In [117]:
# Combine the two Double-Q tables into one by averaging them entry-wise,
# per true-count bucket and per bet amount.
q_table_bet_avg = {
    bucket: {
        amount: (q_table_bet_1[bucket][amount] + q_table_bet_2[bucket][amount]) / 2
        for amount in config["bet_size"]
    }
    for bucket in range(3)
}
print(q_table_bet_avg)

{0: {1: -0.008869879950610103, 4: -0.12214480400429666, 8: -0.17273838941958247}, 1: {1: -0.18371143432997283, 4: 0.017849028270614443, 8: -0.17030255077424164}, 2: {1: -0.25912117076492214, 4: -0.2301279196208907, 8: 0.017703018387241497}}


In [118]:
# Greedy policy: for every true-count bucket keep the bet with the highest averaged value.
best_bet = {
    bucket: max(q_table_bet_avg[bucket], key=q_table_bet_avg[bucket].get)
    for bucket in range(3)
}
print(best_bet)

{0: 1, 1: 4, 2: 8}


### Evaluation of betting strategy + basic strategy

In [119]:
# Evaluate the learned betting policy (greedy on q_table_bet_avg) combined
# with the basic playing strategy. Nothing is learned in this cell.
print("\nStarting betting testing process...")

eval_episodes = 10000000
episode = 0

config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": [1, 4, 8],
    "actions": ["stand", "hit", "double"],
    "num_players": 1
}
# Fresh environment built from the configuration above
env = BlackjackEnv(config=config)

eval_stats = dict.fromkeys(('wins', 'draws', 'losses', 'money won', 'money lost'), 0)

while episode < eval_episodes:
    # New hand: read the true count and map it to its bucket
    observation = env.reset()
    true_count = discretize_true_count(get_true_count(observation))

    # Greedy bet for this bucket
    bucket_values = q_table_bet_avg[true_count]
    bet_amount = max(bucket_values, key=bucket_values.get)

    # The environment expects the index of the bet, not the amount
    bet_index = config["bet_size"].index(bet_amount)
    state, reward, done, _ = env.step(bet_index, action_type="bet")

    # Play the hand out unless it already ended on the bet step
    if not done:
        state_features = get_state_features(state)
    while not done:
        if state_features[0] < 9:
            # Sums below 9 are always hit; this step's reward/done are ignored
            next_state, _, _, _ = env.step(1, action_type="move")
        else:
            # Otherwise follow the best action of the basic strategy
            q_values = get_q_values(state_features, basic_strat)
            action = np.argmax(q_values)
            next_state, reward, done, _ = env.step(action, action_type="move")
        state = next_state
        state_features = get_state_features(state)

    # Bookkeeping: money totals scale the environment reward by the bet
    if reward > 0:
        eval_stats['wins'] += 1
        eval_stats['money won'] += reward * bet_amount
    elif reward == 0:
        eval_stats['draws'] += 1
    else:
        eval_stats['losses'] += 1
        eval_stats['money lost'] += abs(reward) * bet_amount

    episode += 1

# Summary
net_profit = eval_stats['money won'] - eval_stats['money lost']
print(f"Total Wins: {eval_stats['wins']}")
print(f"Total Draws: {eval_stats['draws']}")
print(f"Total Losses: {eval_stats['losses']}")
print(f"Total Money Won: {eval_stats['money won']}")
print(f"Total Money Lost: {eval_stats['money lost']}")
print(f"Net profit: {net_profit}")
print(f"Average reward: {net_profit / eval_episodes}")



Starting betting testing process...
Total Wins: 4327310
Total Draws: 855944
Total Losses: 4816746
Total Money Won: 10209632.0
Total Money Lost: 10225693
Net profit: -16061.0
Average reward: -0.0016061


- Total Wins: 4326200
- Total Draws: 857745
- Total Losses: 4816055
- Total Money Won: 8419082.0
- Total Money Lost: 8431736
- Net profit: -12654.0
- Average reward: -0.0012654

### Benchmark with bet size 1 and basic strategy

In [120]:
# Baseline: always bet 1 and play the basic strategy.
config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": [1],
    "actions": ["stand", "hit", "double"],
    "num_players": 1
}
# Fresh environment built from the configuration above
env = BlackjackEnv(config=config)

eval_episodes = 1000000
episode = 0

eval_stats = dict.fromkeys(('wins', 'draws', 'losses', 'money won', 'money lost'), 0)

while episode < eval_episodes:
    # New hand; the bucket is computed as in the other loops but does not affect the bet
    observation = env.reset()
    true_count = discretize_true_count(get_true_count(observation))

    bet_amount = 1

    # The environment expects the index of the bet, not the amount
    bet_index = config["bet_size"].index(bet_amount)
    state, reward, done, _ = env.step(bet_index, action_type="bet")

    # Play the hand out unless it already ended on the bet step
    if not done:
        state_features = get_state_features(state)
    while not done:
        if state_features[0] < 9:
            # Sums below 9 are always hit; this step's reward/done are ignored
            next_state, _, _, _ = env.step(1, action_type="move")
        else:
            # Otherwise follow the best action of the basic strategy
            q_values = get_q_values(state_features, basic_strat)
            action = np.argmax(q_values)
            next_state, reward, done, _ = env.step(action, action_type="move")
        state = next_state
        state_features = get_state_features(state)

    # Bookkeeping: money totals scale the environment reward by the bet
    if reward > 0:
        eval_stats['wins'] += 1
        eval_stats['money won'] += reward * bet_amount
    elif reward == 0:
        eval_stats['draws'] += 1
    else:
        eval_stats['losses'] += 1
        eval_stats['money lost'] += abs(reward) * bet_amount

    episode += 1

# Summary
net_profit = eval_stats['money won'] - eval_stats['money lost']
print(f"Total Wins: {eval_stats['wins']}")
print(f"Total Draws: {eval_stats['draws']}")
print(f"Total Losses: {eval_stats['losses']}")
print(f"Total Money Won: {eval_stats['money won']}")
print(f"Total Money Lost: {eval_stats['money lost']}")
print(f"Net profit: {net_profit}")
print(f"Average reward: {net_profit / eval_episodes}")


Total Wins: 433076
Total Draws: 85662
Total Losses: 481262
Total Money Won: 515040.0
Total Money Lost: 522244
Net profit: -7204.0
Average reward: -0.007204


Average reward: -0.008368

### Alternative betting strategy (no learning)

In [121]:
def discretize_true_count(tc):
    """Map a true count to a bucket: 0 for ``tc <= 1``, 1 for ``1 < tc < 5``, 2 otherwise.

    Same mapping as the definition used for training; this redefinition
    shadows it with identical behaviour.
    """
    if tc <= 1:
        return 0
    if tc < 5:
        return 1
    return 2

# Fixed (non-learned) bet ramp: bucket -> bet amount.
bet_dict = dict(zip(range(3), (1, 4, 8)))

In [122]:
# Evaluate the hand-written bet ramp (bet_dict) combined with the basic
# playing strategy. Nothing is learned in this cell.
config = {
    "num_decks": 6,
    "red_card_position": 0.2,
    "bet_size": [1, 4, 8],
    "actions": ["stand", "hit", "double"],
    "num_players": 1
}
# Fresh environment built from the configuration above
env = BlackjackEnv(config=config)

eval_episodes = 10000000
episode = 0

eval_stats = dict.fromkeys(('wins', 'draws', 'losses', 'money won', 'money lost'), 0)
# NOTE(review): this rebinds the module-level true_count_visits that
# smart_exploration reads; training re-run after this cell would see these
# integer-keyed tallies instead of the per-bucket counter.
true_count_visits = defaultdict(int)
bet_visits = defaultdict(int)

while episode < eval_episodes:
    # New hand: tally the integer-truncated true count, then bucket it
    observation = env.reset()
    raw_true_count = get_true_count(observation)
    true_count_visits[int(raw_true_count)] += 1
    true_count = discretize_true_count(raw_true_count)

    # Bet prescribed by the fixed ramp, with a tally per bet amount
    bet_amount = bet_dict[true_count]
    bet_visits[bet_amount] += 1

    # The environment expects the index of the bet, not the amount
    bet_index = config["bet_size"].index(bet_amount)
    state, reward, done, _ = env.step(bet_index, action_type="bet")

    # Play the hand out unless it already ended on the bet step
    if not done:
        state_features = get_state_features(state)
    while not done:
        if state_features[0] < 9:
            # Sums below 9 are always hit; this step's reward/done are ignored
            next_state, _, _, _ = env.step(1, action_type="move")
        else:
            # Otherwise follow the best action of the basic strategy
            q_values = get_q_values(state_features, basic_strat)
            action = np.argmax(q_values)
            next_state, reward, done, _ = env.step(action, action_type="move")
        state = next_state
        state_features = get_state_features(state)

    # Bookkeeping: money totals scale the environment reward by the bet
    if reward > 0:
        eval_stats['wins'] += 1
        eval_stats['money won'] += reward * bet_amount
    elif reward == 0:
        eval_stats['draws'] += 1
    else:
        eval_stats['losses'] += 1
        eval_stats['money lost'] += abs(reward) * bet_amount

    episode += 1

# Summary
net_profit = eval_stats['money won'] - eval_stats['money lost']
print(f"Total Wins: {eval_stats['wins']}")
print(f"Total Draws: {eval_stats['draws']}")
print(f"Total Losses: {eval_stats['losses']}")
print(f"Total Money Won: {eval_stats['money won']}")
print(f"Total Money Lost: {eval_stats['money lost']}")
print(f"Net profit: {net_profit}")
print(f"Average reward: {net_profit / eval_episodes}")

Total Wins: 4325748
Total Draws: 858903
Total Losses: 4815349
Total Money Won: 10169702.0
Total Money Lost: 10185383
Net profit: -15681.0
Average reward: -0.0015681


- Total Wins: 4325748
- Total Draws: 858903
- Total Losses: 4815349
- Total Money Won: 10169702.0
- Total Money Lost: 10185383
- Net profit: -15681.0
- Average reward: -0.0015681

In [123]:
# Tallies from the fixed-ramp evaluation: hands per integer-truncated true count, and hands per bet amount.
print(true_count_visits)
print(bet_visits)

defaultdict(<class 'int'>, {0: 4322606, -1: 1188920, -2: 687743, -3: 400897, -4: 248084, -5: 144348, 1: 1147962, 2: 667465, 4: 240154, 3: 387510, -6: 87801, 5: 141512, 6: 86593, 7: 50318, 8: 31904, 10: 10588, -8: 32019, -7: 51359, -9: 18021, -10: 10763, -12: 3220, -13: 1938, -11: 5679, 9: 18375, 11: 5434, 12: 3190, 14: 942, 13: 1897, -14: 964, -15: 451, -16: 250, 15: 470, 16: 243, 18: 59, 17: 117, 19: 11, -18: 52, -17: 112, -20: 6, -19: 19, -21: 3, 20: 1})
defaultdict(<class 'int'>, {1: 7212725, 4: 2435621, 8: 351654})


In [124]:
# os.chdir('..')
# os.chdir('strategies')
# # save the Q-table to a CSV file
# q_table_bet_avg_df = pd.DataFrame.from_dict(q_table_bet_avg, orient='index')
# q_table_bet_avg_df.to_csv('q_table_bet_avg.csv', index=True, header=True)