## Imports

In [1]:
# Set the working directory
import os
# Python script
# current_dir = os.path.dirname(os.path.abspath(__file__))
# cwd = os.path.dirname(current_dir)
# os.chdir(cwd)
# Notebook
os.chdir(os.path.join(os.path.dirname(os.getcwd()), "src"))

import numpy as np
import pandas as pd
import random
from env import BlackjackEnv
from visualize import plot_policy
import torch
import torch.nn as nn
import torch.optim as optim


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


## Architecture

In [6]:
class Dqn(nn.Module):
    def __init__(self, state_size, action_size):
        super(Dqn, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 128)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 64)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(64, action_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        y = self.fc4(x)
        return y

In [7]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def __len__(self):
        return len(self.buffer)

    def push(self, transition):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position+1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

## Agent

In [8]:
class DqnAgent:
    def __init__(self, agent_config):

        self.actions = agent_config["actions"]
        self.action_size = len(self.actions)
        self.batch_size = agent_config["batch_size"]
        self.gamma = agent_config["gamma"]
        self.epsilon_max = agent_config["epsilon_max"]
        self.current_epsilon = self.epsilon_max
        self.epsilon_min = agent_config["epsilon_min"]
        self.epsilon_decay = agent_config["epsilon_decay"]
        self.memory_size = agent_config["memory_size"]
        self.lr = agent_config["lr"]
        self.state_size = agent_config["state_size"]

        self.policy_net = Dqn(state_size=self.state_size, action_size=self.action_size).to(DEVICE)
        self.target_net = Dqn(state_size=self.state_size, action_size=self.action_size).to(DEVICE)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        # self.criterion = nn.HuberLoss()
        self.criterion = nn.MSELoss()

        self.optimizer = optim.SGD(
            self.policy_net.parameters(),
            lr=self.lr,
            momentum=0.9,
            weight_decay=1e-4
        )
        self.scheduler = optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=10000,
            gamma=0.95)

        self.replay_buffer = ReplayBuffer(self.memory_size)
    

    def preprocess_state(self, state):
        if state[0]:
            state = np.array([
                state[0] / 21.0,
                state[1] / 10.0,
                state[2],
                state[3]
            ], dtype=np.float32)
        
        return state
    

    def select_action(self, env, state):
        if random.random() < self.current_epsilon:
            return env.move_space.sample()
        else:
            state = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                q_values = self.policy_net(state)
            return q_values.argmax().item()
    

    def update_epsilon(self, episode):
        self.current_epsilon = max(self.epsilon_min, self.epsilon_max * (self.epsilon_decay**episode))
    

    def train_step(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(states).to(DEVICE)
        actions = torch.LongTensor(actions).to(DEVICE)
        rewards = torch.FloatTensor(rewards).to(DEVICE)
        next_states = torch.FloatTensor(next_states).to(DEVICE)
        dones = torch.LongTensor(dones).to(DEVICE)

        # Compute the target Q-values
        Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Use the Bellman equation to update the Q-values
        # Q(s,a) = r + γ * max_a'(Q(s',a'))
        max_a_Q_prime = self.target_net(next_states).max(1)[0].detach()
        new_Q = rewards + (1-dones) * self.gamma * max_a_Q_prime

        # Compute the loss and update the network
        loss = self.criterion(Q, new_Q)
        self.optimizer.zero_grad()
        loss.backward()
        # nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()
        self.scheduler.step()

        return loss.item()

## Components initialization

In [9]:
# Initialize the environment
ENV_CONFIG = {
    "num_decks"         : 6,
    "red_card_position" : 0.2,
    "bet_size"          : [1],
    "actions"           : ["stand", "hit"],
    "num_players"       : 1
}

env = BlackjackEnv(config=ENV_CONFIG)
env.__dict__

{'num_decks': 6,
 'cut_card_position': 0.2,
 'bets': [1],
 'actions': ['stand', 'hit'],
 'num_players': 1,
 'bet_space': Discrete(1),
 'move_space': Discrete(2),
 'observation_space': Tuple(Discrete(32), Discrete(11), Discrete(2), Box(0.0, 1.0, (11,), float64)),
 'table': <blackjack.Table at 0x188a12a6600>}

In [10]:
# Initialize the agent
AGENT_CONFIG = {
    "actions"      : ["stand", "hit"],
    "batch_size"   : 512,
    "gamma"        : 0.999,
    "epsilon_max"  : 1.0,
    "epsilon_min"  : 0.01,
    "epsilon_decay": 0.9995,
    "memory_size"  : 20000,
    "lr"           : 0.1,
    "state_size"   : 4
}

agent = DqnAgent(agent_config=AGENT_CONFIG)
agent.__dict__

{'actions': ['stand', 'hit'],
 'action_size': 2,
 'batch_size': 512,
 'gamma': 0.999,
 'epsilon_max': 1.0,
 'current_epsilon': 1.0,
 'epsilon_min': 0.01,
 'epsilon_decay': 0.9995,
 'memory_size': 20000,
 'lr': 0.1,
 'state_size': 4,
 'policy_net': Dqn(
   (fc1): Linear(in_features=4, out_features=64, bias=True)
   (relu1): ReLU()
   (fc2): Linear(in_features=64, out_features=128, bias=True)
   (relu2): ReLU()
   (fc3): Linear(in_features=128, out_features=64, bias=True)
   (relu3): ReLU()
   (fc4): Linear(in_features=64, out_features=2, bias=True)
 ),
 'target_net': Dqn(
   (fc1): Linear(in_features=4, out_features=64, bias=True)
   (relu1): ReLU()
   (fc2): Linear(in_features=64, out_features=128, bias=True)
   (relu2): ReLU()
   (fc3): Linear(in_features=128, out_features=64, bias=True)
   (relu3): ReLU()
   (fc4): Linear(in_features=64, out_features=2, bias=True)
 ),
 'criterion': MSELoss(),
 'optimizer': SGD (
 Parameter Group 0
     dampening: 0
     differentiable: False
     for

## Train

In [12]:
PARAMS = {
    "architecture"  : "",
    "num_episodes"  : 20000,
    "verbose"       : False,
}

In [13]:
verbose = PARAMS["verbose"]
rewards_money = []
losses_money = []
wins = 0
draws = 0
losses = 0

for episode in range(PARAMS["num_episodes"]):
    if verbose:
        print("-------------------- Starting episode", episode+1)
    state = env.reset()
    state = agent.preprocess_state(state)
    done = False
    episode_loss = []

    if verbose:
        print(f"True count: {state[-1]:.4f}")

    # Place a random bet
    # TODO: introduce another network to predict the bet size
    bet_action = env.bet_space.sample()
    state, reward, done = env.step(bet_action, action_type="bet")
    if verbose:
        print(f"----- Bet: {env.bets[bet_action]}")
    state = agent.preprocess_state(state)

    if verbose:
        print(env.table.players[0])
        print(env.table.dealer)
    if not done:
        if verbose:
            print("----- Making moves...")
        
        # Continue until the episode is done
        while not done:
            action = agent.select_action(env, state)
            next_state, reward, done = env.step(action, action_type="move")
            next_state = agent.preprocess_state(next_state)

            agent.replay_buffer.push((state, action, reward, next_state, done))
            state = next_state
            loss = agent.train_step()
            episode_loss.append(loss if loss is not None else 0)

        if verbose:
            print(env.table.players[0])
            print(env.table.dealer)
    if verbose:
        print(f"----- Reward: {reward}")
    if reward > 0:
        wins += 1
    elif reward == -1:
        losses += 1
    else:
        draws += 1

    if episode % 10 == 0:
        # Update the target network
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
        agent.update_epsilon(episode)
    
    # print(episode_loss)
    rewards_money.append(reward)
    losses_money.append(np.mean(episode_loss).item() if episode_loss else 0)
    if verbose:
        print("-------------------- Terminated")

    if episode > 0 and episode % 1000 == 0:
        avg_reward = np.mean(rewards_money[-1000:])
        avg_loss = np.mean(losses_money[-1000:])
        curr_lr = float(agent.optimizer.param_groups[0]["lr"])
        print(f"Episode {episode}\tAvg reward: {avg_reward:.4f},\tWin rate: {wins/episode:.4f},\tDraw rate: {draws/episode:.4f},\tLoss rate: {losses/episode:.4f}")
        print(f"\t\tLoss: {avg_loss:.4f},\t\tLr: {curr_lr:.4f},\t\tEpsilon: {agent.current_epsilon:.4f}")


  states = torch.FloatTensor(states).to(DEVICE)


Episode 1000	Avg reward: -0.3325,	Win rate: 0.3010,	Draw rate: 0.0440,	Loss rate: 0.6560
		Loss: 0.2437,		Lr: 0.1000,		Epsilon: 0.6065
Episode 2000	Avg reward: -0.1815,	Win rate: 0.3385,	Draw rate: 0.0460,	Loss rate: 0.6160
		Loss: 0.4361,		Lr: 0.1000,		Epsilon: 0.3678
Episode 3000	Avg reward: -0.1440,	Win rate: 0.3547,	Draw rate: 0.0507,	Loss rate: 0.5950
		Loss: 0.4639,		Lr: 0.1000,		Epsilon: 0.2230
Episode 4000	Avg reward: -0.0685,	Win rate: 0.3733,	Draw rate: 0.0498,	Loss rate: 0.5773
		Loss: 0.4711,		Lr: 0.1000,		Epsilon: 0.1353
Episode 5000	Avg reward: -0.0460,	Win rate: 0.3840,	Draw rate: 0.0550,	Loss rate: 0.5612
		Loss: 0.4812,		Lr: 0.1000,		Epsilon: 0.0820
Episode 6000	Avg reward: -0.1055,	Win rate: 0.3877,	Draw rate: 0.0568,	Loss rate: 0.5557
		Loss: 0.4881,		Lr: 0.1000,		Epsilon: 0.0497
Episode 7000	Avg reward: -0.0250,	Win rate: 0.3949,	Draw rate: 0.0593,	Loss rate: 0.5460
		Loss: 0.4843,		Lr: 0.1000,		Epsilon: 0.0302
Episode 8000	Avg reward: -0.0700,	Win rate: 0.3980,	Dra

In [14]:
# Evaluate the agent
env = BlackjackEnv(config=ENV_CONFIG)
total_rewards = []
wins = 0
draws = 0
losses = 0

for _ in range(10000):
    state = env.reset()
    state = agent.preprocess_state(state)
    done = False

    bet_action = env.bet_space.sample()
    state, reward, done = env.step(bet_action, action_type="bet")
    state = agent.preprocess_state(state)

    if not done:
        while not done:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(DEVICE)
                action = agent.policy_net(state_tensor).argmax().item()
                
            next_state, reward, done = env.step(action, action_type="move")
            next_state = agent.preprocess_state(next_state)
    
    if reward > 0:
        wins += 1
    elif reward == -1:
        losses += 1
    else:
        draws += 1
    total_rewards.append(reward)

avg_reward = np.mean(total_rewards)
print(f"Evaluation Results:")
print(f"Average Reward: {avg_reward:.4f}")
print(f"Win Rate: {wins/10000:.4f}")
print(f"Draw Rate: {draws/10000:.4f}")
print(f"Loss Rate: {losses/10000:.4f}")


Evaluation Results:
Average Reward: -0.3758
Win Rate: 0.2784
Draw Rate: 0.0456
Loss Rate: 0.6760


In [18]:
def save_strategy(agent, file_path):

    strategy = pd.DataFrame(columns=["State", "Action"])
    
    for player_hand in range(4, 22):
        for dealer_hand in range(2, 12):
            for soft_hand in range(2):
                for true_count in range(-4, 5):
                    state = (player_hand, dealer_hand, soft_hand, true_count)
                    actions = []

                    with torch.no_grad():
                        state = torch.FloatTensor([player_hand, dealer_hand, soft_hand, true_count]).unsqueeze(0).to(DEVICE)
                        q_values = agent.policy_net(state).argmax().item()
                        actions.append(q_values)

                state = f"({player_hand}, {dealer_hand}, {soft_hand})"
                action = np.mean(actions).item()
                strategy = strategy.append({"State": state, "Action": action}, ignore_index=True)
    
    strategy.to_csv(file_path, index=False)

In [None]:
save_strategy(agent, "test.csv")