In [121]:
import gymnasium as gym

In [122]:
import torch
env = gym.make("Blackjack-v1", sab=True)
env.reset()
env.step(1)

((21, 3, 0), 0.0, False, False, {})

In [123]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [124]:
device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else "cpu"
print(device)

mps


In [125]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(3, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        logits = self.layers(x)
        return logits

In [126]:
torch.manual_seed(11)
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)


In [127]:
import random
class ReplayBuffer():
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def size(self):
        return len(self.buffer)

In [128]:
buffer = ReplayBuffer(10000)

In [129]:
def training(target_model):
    batch = zip(*buffer.sample(batch_size))

    state, action, reward, next_state, done = batch

    state = torch.stack(state).float()
    action = torch.tensor(action).int().unsqueeze(1)
    reward = torch.tensor(reward).float()
    next_state = torch.stack(next_state).float()
    done = torch.tensor(done).float()

    output = model(state).cpu()
    label = output.gather(dim=1, index=action)

    with torch.no_grad():
        future_move = torch.amax(target_model(next_state).cpu(), dim=1)
        prediction = reward + (1 - done) * gamma * future_move
        prediction.unsqueeze_(1)

    loss = F.mse_loss(label, prediction)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss


In [130]:
import copy
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
epochs = 100000
batch_size = 128
gamma = 0.99
epsilon = 0.99
target_model = copy.deepcopy(model)
game_won = 0

for epoch in range(epochs):
    #setup and first hand
    model.train()
    state, _ = env.reset()
    state = torch.tensor(state, device=device).float()
    done = False

    if epoch % 1000 == 0:
        target_model.load_state_dict(model.state_dict())

    #loop runs as long as the hand is not over
    while not done:

        #the model will make random decisions until there is enough data to start training
        if buffer.size() < batch_size:
            action = env.action_space.sample()
        else:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = torch.argmax(model(state)).item()


        #After doing the decided action, I will get the next state, reward and whether the game is done
        next_state, reward, terminated, truncated, info = env.step(action)
        next_state = torch.tensor(next_state, device=device).float()

        done = terminated or truncated

        #the game state gets stored for training
        buffer.push(state, action, reward, next_state, done)

        state = next_state

    #if I have enough data the training begins
    if buffer.size() < batch_size:
        continue

    loss = training(target_model)

    if reward == 1.0:
        game_won += 1

    if epoch % 1000 == 0:
        accuracy = game_won / 1000
        print(f"epoch: {epoch}, Accuracy: {accuracy}")
        game_won = 0

    if epsilon > 0.1:
        epsilon *= 0.9999

epoch: 1000, Accuracy: 0.265
epoch: 2000, Accuracy: 0.302
epoch: 3000, Accuracy: 0.311
epoch: 4000, Accuracy: 0.33
epoch: 5000, Accuracy: 0.342
epoch: 6000, Accuracy: 0.371
epoch: 7000, Accuracy: 0.365
epoch: 8000, Accuracy: 0.365
epoch: 9000, Accuracy: 0.359
epoch: 10000, Accuracy: 0.362
epoch: 11000, Accuracy: 0.398
epoch: 12000, Accuracy: 0.366
epoch: 13000, Accuracy: 0.391
epoch: 14000, Accuracy: 0.388
epoch: 15000, Accuracy: 0.418
epoch: 16000, Accuracy: 0.394
epoch: 17000, Accuracy: 0.393
epoch: 18000, Accuracy: 0.399
epoch: 19000, Accuracy: 0.383
epoch: 20000, Accuracy: 0.426
epoch: 21000, Accuracy: 0.398
epoch: 22000, Accuracy: 0.415
epoch: 23000, Accuracy: 0.409
epoch: 24000, Accuracy: 0.411
epoch: 25000, Accuracy: 0.409
epoch: 26000, Accuracy: 0.411
epoch: 27000, Accuracy: 0.405
epoch: 28000, Accuracy: 0.408
epoch: 29000, Accuracy: 0.42
epoch: 30000, Accuracy: 0.425
epoch: 31000, Accuracy: 0.408
epoch: 32000, Accuracy: 0.371
epoch: 33000, Accuracy: 0.438
epoch: 34000, Accurac

In [194]:
state,_ = env.reset()
print(state)
state = torch.tensor(state, device=device).float()
output = model(state)
print(output)
action = torch.argmax(output).item()
print("hit" if action == 1 else "stay")

(17, 4, 0)
tensor([-0.0979, -0.6960], device='mps:0', grad_fn=<LinearBackward0>)
stay


In [209]:
games = 1000
model.eval()
wins = 0
draws = 0
losses = 0
net_score = 0
for game in range(games):
    state, _ = env.reset()
    state = torch.tensor(state, device=device).float()
    done = False

    while not done:

        with torch.no_grad():
            q_values = model(state)
            action = torch.argmax(q_values).item()

        next_state, reward, terminated, truncated, info = env.step(action)
        next_state = torch.tensor(next_state, device=device).float()

        done = terminated or truncated

        state = next_state

    if reward > 0:
        wins += 1
    elif reward == 0:
        draws += 1
    else:
        losses += 1

    net_score += reward

print(f"Wins: {wins}, Draws: {draws}, Losses: {losses}")
print(f"Win Accuracy: {(wins/games)*100:.2f}%, Draw Accuracy: {(draws/games)*100:.2f}%, Loss Accuracy: {(losses/games)*100:.2f}%")
print(f"Net Score: {net_score}")

Wins: 445, Draws: 72, Losses: 483
Win Accuracy: 44.50%, Draw Accuracy: 7.20%, Loss Accuracy: 48.30%
Net Score: -38.0


In [211]:
print(1)
print(2)
print(3)
ösajdfklödsf

1
2
