In [None]:
import random
import pandas as pd
from enum import IntEnum

# -----------------------------
# 1. Define Actions
# -----------------------------
class Action(IntEnum):
    STAND = 0
    HIT = 1
    # You can add later:
    # DOUBLE = 2
    # SPLIT = 3
    # INSURANCE = 4

# -----------------------------
# 2. Helper functions
# -----------------------------
def card_value(card):
    """
    card: int from 1–13 where 1 = Ace, 11 = J, 12 = Q, 13 = K
    We map:
      2–10 -> face value
      J, Q, K -> 10
      A -> 11 (handled with soft logic)
    """
    if card == 1:
        return 11
    if card >= 10:
        return 10
    return card

def hand_value(cards):
    """
    Returns (total, is_soft)
    total: best total <= 21 if possible by adjusting Aces
    is_soft: True if hand contains at least one Ace counted as 11
    """
    total = 0
    aces = 0
    for c in cards:
        v = card_value(c)
        if v == 11:
            aces += 1
        total += v

    is_soft = False
    # Adjust for Aces if bust
    while total > 21 and aces > 0:
        total -= 10  # convert one Ace from 11 -> 1
        aces -= 1

    if aces > 0:
        is_soft = True

    return total, is_soft

# -----------------------------
# 3. Blackjack Environment
# -----------------------------
class BlackjackEnv:
    def __init__(self, n_decks=1):
        self.n_decks = n_decks
        self.deck = []
        self.player_cards = []
        self.dealer_cards = []
        self.done = False
        self.reward = 0

    def _init_deck(self):
        # 1–13 = standard ranks, 4 suits each
        self.deck = self.n_decks * [c for c in range(1, 14)] * 4
        random.shuffle(self.deck)

    def _deal_card(self):
        if not self.deck:
            self._init_deck()
        return self.deck.pop()

    def reset(self):
        """
        Start a new round.
        Returns state = (player_total, is_soft, dealer_upcard_value)
        """
        self.done = False
        self.reward = 0
        self._init_deck()

        self.player_cards = [self._deal_card(), self._deal_card()]
        self.dealer_cards = [self._deal_card(), self._deal_card()]

        player_total, is_soft = hand_value(self.player_cards)
        dealer_upcard_val = card_value(self.dealer_cards[0])

        return (player_total, int(is_soft), dealer_upcard_val)

    def step(self, action: Action):
        """
        action: Action.STAND or Action.HIT
        Returns: next_state, reward, done, info
        """
        if self.done:
            raise ValueError("Round already finished. Call reset().")

        if action == Action.HIT:
            self.player_cards.append(self._deal_card())
            player_total, is_soft = hand_value(self.player_cards)

            # Player busts
            if player_total > 21:
                self.done = True
                self.reward = -1  # lose
            else:
                self.reward = 0  # not decided yet

        elif action == Action.STAND:
            # Dealer plays
            self._play_dealer()
            self.done = True
            self.reward = self._compare_hands()
        else:
            raise ValueError("Invalid action")

        player_total, is_soft = hand_value(self.player_cards)
        dealer_upcard_val = card_value(self.dealer_cards[0])

        next_state = (player_total, int(is_soft), dealer_upcard_val)
        info = {}
        return next_state, self.reward, self.done, info

    def _play_dealer(self):
        """
        Simple dealer rule: hit until total >= 17
        (You can adapt to hit/stand soft 17 variations later)
        """
        total, is_soft = hand_value(self.dealer_cards)
        while total < 17:
            self.dealer_cards.append(self._deal_card())
            total, is_soft = hand_value(self.dealer_cards)

    def _compare_hands(self):
        """
        Returns:
          +1 for player win
           0 for push
          -1 for dealer win
        """
        player_total, _ = hand_value(self.player_cards)
        dealer_total, _ = hand_value(self.dealer_cards)

        if player_total > 21:
            return -1  # already bust

        if dealer_total > 21:
            return +1

        if player_total > dealer_total:
            return +1
        elif player_total < dealer_total:
            return -1
        else:
            return 0


In [3]:
def simple_policy(state):
    """
    state = (player_total, is_soft, dealer_upcard_val)
    returns Action.STAND or Action.HIT
    """
    player_total, is_soft, dealer_upcard = state

    # Very rough baseline strategy
    if player_total <= 11:
        return Action.HIT
    elif 12 <= player_total <= 16:
        if dealer_upcard >= 7:
            return Action.HIT
        else:
            return Action.STAND
    else:
        return Action.STAND


In [4]:
def generate_dataset(n_rounds=50000, seed=0):
    random.seed(seed)
    env = BlackjackEnv(n_decks=4)

    records = []

    for _ in range(n_rounds):
        state = env.reset()
        done = False

        while not done:
            action = simple_policy(state)
            next_state, reward, done, info = env.step(action)

            # Log current decision and final outcome (reward will be final when done=True)
            records.append({
                "player_total": state[0],
                "is_soft": state[1],
                "dealer_upcard": state[2],
                "action": int(action),
                "final_reward": reward if done else None  # we'll fill later
            })

            state = next_state

        # Optionally, you could update final_reward retrospectively for all steps of the round

    df = pd.DataFrame(records)
    # Drop rows with None reward if you want only final states
    df = df.dropna(subset=["final_reward"])
    df["final_reward"] = df["final_reward"].astype(int)
    return df

# Example usage:
if __name__ == "__main__":
    df = generate_dataset(n_rounds=50000)
    print(df.head())
    print(df["action"].value_counts())


   player_total  is_soft  dealer_upcard  action  final_reward
0            18        0              4       0             1
1            14        1              2       0             1
2            20        0             10       0             1
3            19        1              6       0             1
4            20        0             10       0             1
action
0    41283
1     8717
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Generate data
df = generate_dataset(n_rounds=50000, seed=42)

X = df[["player_total", "is_soft", "dealer_upcard"]]
y = df["action"]

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# 3. Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred_lr = log_reg.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_lr))

# 4. Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=0,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))


Logistic Regression Accuracy: 0.9499
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      8243
           1       0.89      0.81      0.85      1757

    accuracy                           0.95     10000
   macro avg       0.93      0.90      0.91     10000
weighted avg       0.95      0.95      0.95     10000

Random Forest Accuracy: 1.0
Random Forest Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8243
           1       1.00      1.00      1.00      1757

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

