In [14]:
from environment.blackjack import Blackjack

In [15]:
# Single environment instance shared by the action-selection, training and evaluation cells below.
env = Blackjack()

In [16]:
import torch
import torch.nn as nn

In [17]:
device = torch.accelerator.current_accelerator() if torch.accelerator.is_available else "cpu"
device = "cpu"
print(f"device: {device}")

device: cpu


In [18]:
class NeuralNetwork(nn.Module):
    """Fully connected network: 3 input features -> 64 -> 64 -> 4 outputs.

    The rest of this notebook treats the 4 outputs as one Q-value per action index.
    """

    def __init__(self):
        super().__init__()
        hidden = 64
        # Layer order is significant: it fixes both the seeded initialisation
        # and the parameter names stored in the state_dict.
        self.layers = nn.Sequential(
            nn.Linear(3, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 4),
        )

    def forward(self, x):
        """Rescale the first two feature columns and return the raw network outputs.

        Args:
            x: 2-D tensor whose rows are states; column 0 is divided by 21 and
                column 1 by 11 (presumably player total and dealer card — confirm
                against the environment). Remaining columns pass through unchanged.

        Returns:
            Tensor with 4 values per input row.
        """
        # Work on a copy so the caller's tensor is never modified.
        scaled = x.clone()
        for column, divisor in ((0, 21.0), (1, 11.0)):
            scaled[:, column] = x[:, column] / divisor
        return self.layers(scaled)

In [19]:
import random

# Seed every RNG this notebook draws from directly. torch's generator controls the
# weight initialisation; Python's `random` drives the epsilon-greedy check and the
# replay-buffer sampling, so leaving it unseeded made runs irreproducible.
# NOTE(review): if the Blackjack environment uses another RNG (e.g. numpy), seed it too.
SEED = 11
torch.manual_seed(SEED)
random.seed(SEED)

model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=4, bias=True)
  )
)


In [20]:
import random
from collections import deque
class ReplayBuffer:
    """Bounded first-in-first-out store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most `capacity` transitions."""
        self.capacity = capacity
        # A deque with maxlen discards its oldest element once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition as a 5-tuple, evicting the oldest one if full."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return up to `batch_size` distinct stored transitions chosen at random.

        Fewer items are returned when the buffer holds fewer than `batch_size`.
        """
        # Clamp the request: random.sample raises if asked for more than it has.
        draw_count = min(batch_size, len(self.buffer))
        return random.sample(self.buffer, draw_count)

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [21]:
# Replay memory used by the training cells; once 100000 transitions are stored,
# each new push evicts the oldest one.
buffer = ReplayBuffer(100000)

In [22]:
def get_action(state, epsilon, double_possible, splittable):
    """Choose an action epsilon-greedily using the notebook-level `model` and `env`.

    Args:
        state: 2-D tensor holding the state; only its first three columns are fed
            to the network. The greedy pick takes the argmax over the whole
            output, so a single-row state is assumed.
        epsilon: probability of delegating to `env.action_random` instead of the network.
        double_possible: falsy when action index 2 must not be chosen.
        splittable: falsy when action index 3 must not be chosen.

    Returns:
        Whatever `env.action_random` returns when exploring, otherwise the index
        of the highest remaining Q-value as a Python int.
    """
    exploring = random.random() < epsilon
    if exploring:
        return env.action_random(double_possible, splittable)

    with torch.no_grad():
        q_values = model(state[:, :3])
        # Push disallowed actions far below any real Q-value so argmax skips them.
        for allowed, column in ((double_possible, 2), (splittable, 3)):
            if not allowed:
                q_values[:, column] = -99.0

    return q_values.argmax().item()

In [23]:
import torch.nn.functional as F
def training(target_model):
    """Run one optimisation step on a minibatch drawn from the replay buffer.

    Reads the notebook-level `buffer`, `batch_size`, `model`, `optimizer`,
    `gamma` and `device`.

    Args:
        target_model: network evaluated without gradients on the next states to
            build the bootstrap target.

    Returns:
        The scalar smooth-L1 loss tensor of this minibatch.
    """
    transitions = buffer.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*transitions)

    # Stored states have a leading singleton dimension; stacking gives (B, 1, F),
    # and squeeze(1) turns that into (B, F).
    states = torch.stack(states).to(device).squeeze(1)
    next_states = torch.stack(next_states).to(device).squeeze(1)
    # Scalars become column vectors so they line up with gather / broadcasting.
    actions = torch.tensor(actions).long().unsqueeze(1).to(device)
    rewards = torch.tensor(rewards).float().unsqueeze(1).to(device)
    dones = torch.tensor(dones).float().unsqueeze(1).to(device)

    # Q-value the online network assigns to the action that was actually taken.
    q_taken = model(states[:, :3]).gather(dim=1, index=actions)

    with torch.no_grad():
        next_q = target_model(next_states[:, :3])

        # Rows whose action was 3 get their next-state values doubled (factor 2.0,
        # all others 1.0) — presumably because a split continues as two hands; confirm.
        split_scale = (actions == 3) + 1.0
        next_q *= split_scale

        # The last two state columns are flags; 0 means the move is not allowed.
        # Masking happens after the scaling so the sentinel stays at -99.
        next_q[next_states[:, -2] == 0, 2] = -99.0
        next_q[next_states[:, -1] == 0, 3] = -99.0

        best_next = next_q.amax(dim=1, keepdim=True)
        # (1 - dones) removes the bootstrap term for finished hands.
        td_target = rewards + (1 - dones) * gamma * best_next

    loss = F.smooth_l1_loss(q_taken, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [54]:
import copy
# Hyperparameters and training state. `optimizer`, `batch_size` and `gamma` are
# read as globals by training(), so their names must stay as they are.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
epochs = 1_000_000
batch_size = 128
gamma = 1.0
epsilon = 1.0
target_model = copy.deepcopy(model)
model.train()
reward_sum = 0

for epoch in range(epochs):

    # One hand per epoch; states are kept as (1, F) float tensors.
    state = torch.tensor(env.next_hand(), device=device).float().unsqueeze(0)

    # Refresh the target network from the online network every 300 epochs.
    if epoch % 300 == 0:
        target_model.load_state_dict(model.state_dict())

    done = False
    while not done:
        double_possible, splittable = state[:, -2], state[:, -1]

        # Act purely at random until the buffer holds 10 batches of transitions.
        if buffer.size() < 10 * batch_size:
            action = env.action_random(double_possible, splittable)
        else:
            action = get_action(state, epsilon, double_possible, splittable)

        next_state, reward, hand_over = env.step(action)
        # A 2-element result is reduced to its first element — presumably the two
        # hands produced by a split; confirm against Blackjack.step.
        if len(next_state) == 2:
            next_state = next_state[0]
        next_state = torch.tensor(next_state, device=device).float().unsqueeze(0)

        # The stored flag is `hand_over`, not `done`: action 3 ends this loop,
        # but its transition still bootstraps from next_state in training().
        buffer.push(state, action, reward, next_state, hand_over)

        done = hand_over or action == 3
        state = next_state

    # Only the reward of the final step of the hand is accumulated.
    reward_sum += reward

    # No learning, logging or epsilon decay happens during the warm-up phase.
    if buffer.size() >= 10 * batch_size:
        loss = training(target_model)

        if epoch % 100000 == 0:
            print(f"epoch: {epoch} | reward net: {round(reward_sum)}")
            reward_sum = 0

        # Linear decay over the first 75% of the epochs, fully greedy afterwards.
        epsilon = epsilon - 1/(0.75*epochs) if epoch < 0.75 * epochs else 0


epoch: 0 | reward net: 1
epoch: 100000 | reward net: -50305
epoch: 200000 | reward net: -44267
epoch: 300000 | reward net: -38855
epoch: 400000 | reward net: -32441
epoch: 500000 | reward net: -25179
epoch: 600000 | reward net: -18555
epoch: 700000 | reward net: -11357
epoch: 800000 | reward net: -5662
epoch: 900000 | reward net: -5165


In [86]:
# Sanity probe: inspect the Q-values for one hand-picked 3-feature state.
# The probe no longer calls env.next_hand(): its result was overwritten straight
# away, so the call only advanced the environment as a hidden side effect.
state = (8, 5, 0)
print(state)
state = torch.tensor(state, device=device).float().unsqueeze(0)
# Inference only: no_grad avoids building an autograd graph for a forward pass
# that is never back-propagated.
with torch.no_grad():
    output = model(state)
print(output)
# Index of the highest Q-value, i.e. the greedy action for this state (no masking applied).
action = torch.argmax(output).item()

(8, 5, 0)
tensor([[-2.4521,  0.1884,  0.6103, -0.7789]], grad_fn=<AddmmBackward0>)


In [58]:
# Greedy evaluation of the trained network over a fixed number of hands.
games = 100000
model.eval()
wins = draws = losses = 0
net_score = 0
for game in range(games):
    state = torch.tensor(env.next_hand(), device=device).float().unsqueeze(0)
    done = False

    while not done:

        with torch.no_grad():
            q_values = model(state[:, :3])
            # The last two state columns are flags. Index -2 / -1 addresses the
            # flag in `state` and, because the network has 4 outputs, action
            # 2 / 3 in `q_values`; a flag of 0 masks that action.
            for flag_col in (-2, -1):
                if state[0, flag_col] == 0:
                    q_values[0, flag_col] = -99

            action = torch.argmax(q_values).item()

        if action != 3:
            next_state, reward, terminated = env.step(action)
        else:
            # After action 3 the step result is discarded and the next state is
            # fetched via next_hand(); the hand is treated as still running.
            _ = env.step(action)
            next_state = env.next_hand()
            terminated = False

        next_state = torch.tensor(next_state, device=device).float().unsqueeze(0)

        state = next_state
        done = terminated

    # Classify the hand by the sign of the reward from its final step.
    net_score += reward
    if reward > 0:
        wins += 1
    elif reward == 0:
        draws += 1
    else:
        losses += 1

print(f"Wins: {wins} | Draws: {draws} | Losses: {losses}")
print(f"Win Accuracy: {(wins/games)*100:.2f}% | Draw Accuracy: {(draws/games)*100:.2f}% | Loss Accuracy: {(losses/games)*100:.2f}%")
print(f"Net Score: {net_score}")

Wins: 41692 | Draws: 9024 | Losses: 49284
Win Accuracy: 41.69% | Draw Accuracy: 9.02% | Loss Accuracy: 49.28%
Net Score: -4804.0


In [87]:
# Persist only the learned parameters (the state_dict), not the module object itself.
# The file is written relative to the current working directory.
torch.save(model.state_dict(), "blackjack_model_weights.pth")