In [26]:
import json
import pickle
import re
import numpy as np
from pathlib import Path

In [35]:
# Read the whole raw PGN file and keep it as a list of lines (newlines removed).
raw_pgn_path = Path("data/raw/simple_data.pgn")
lines = raw_pgn_path.read_text(encoding="utf-8").splitlines()

#### Tokenise

1. Each distinct move string that appears in the dataset becomes one token
2. Move numbers (`1.`, `2.`, ...) are filtered out
3. Game results (`1-0`, `0-1`, `1/2-1/2`, `*`) are filtered out

In [36]:
# Collect the movetext lines: every non-blank line that is not a PGN tag
# line (tag lines start with "[") is treated as one complete game.
# TODO: this is a naive approach - make this better
# (it assumes a game's movetext is never wrapped across several lines).
games = [
    text
    for text in map(str.strip, lines)
    if text != "" and text[0] != "["
]

In [37]:
# Turn each game's movetext into a token sequence wrapped in <bos>/<eos>.
# Move numbers and game results are dropped so that only moves become tokens.
skip_strings = {"1-0", "0-1", "1/2-1/2", "*"}
move_token_pattern = re.compile(r"\d+\.+$")  # bare move numbers: "1.", "2.", "3..." etc
# Move number written directly in front of the move ("1.e4", "3...Nf6"), as
# some PGN exporters do. Only the numeric prefix is removed; the move is kept.
move_number_prefix = re.compile(r"^\d+\.+")
token_sequences = []
for game in games:
    tokens = []
    # split() without an argument splits on any run of whitespace (spaces or
    # tabs) and never yields empty strings, so no separate empty-token check
    # is needed.
    for token in game.split():
        if token in skip_strings or move_token_pattern.match(token):
            continue
        # Bare move numbers were skipped above, so stripping the prefix here
        # can never leave an empty token.
        tokens.append(move_number_prefix.sub("", token, count=1))
    if tokens:  # ignore lines that held nothing but move numbers / results
        token_sequences.append(["<bos>", *tokens, "<eos>"])

In [38]:
# Vocabulary: every distinct token (including <bos>/<eos>) in sorted order,
# so a token's id is simply its position in `vocab`.
vocab = sorted(set().union(*token_sequences))
token_to_id = dict(zip(vocab, range(len(vocab))))
# Inverse lookup, id -> token.
id_to_token = dict(enumerate(vocab))

In [39]:
# Encode every game: replace each token by its integer id from `token_to_id`.
id_sequences = [
    list(map(token_to_id.__getitem__, seq))
    for seq in token_sequences
]

#### Create Training & Validation datasets

In [40]:
# Arbitrary (untuned) estimate of the maximum number of moves in a chess game.
# NOTE(review): each token is one player's move, so 512 tokens is roughly 256
# full moves including <bos>/<eos> - confirm this is the intended unit.
context_length = 512

In [41]:
# The first 90% of the games (in file order, no shuffling) form the training
# set; the remaining games form the validation set.
split_index = int(0.9 * len(id_sequences))
train_sequences, val_sequences = id_sequences[:split_index], id_sequences[split_index:]
(len(train_sequences), len(val_sequences))

(2538, 283)

In [42]:
# Concatenate the per-game id sequences of each split into one flat token
# stream; game boundaries remain visible through the <bos>/<eos> ids.
train_tokens = [token_id for game_ids in train_sequences for token_id in game_ids]
val_tokens = [token_id for game_ids in val_sequences for token_id in game_ids]
(len(train_tokens), len(val_tokens))


(185693, 18546)

In [44]:
# Persist the encoded splits as raw uint16 binaries plus a pickled metadata
# dict (vocabulary, lookup table and context length) under data/processed.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Token ids are written as uint16, so every id must fit in 16 bits. Check this
# explicitly: depending on the NumPy version, an out-of-range id would either
# wrap around silently or raise an error with no hint about the cause.
if len(vocab) > np.iinfo(np.uint16).max + 1:
    raise ValueError(
        f"vocab size {len(vocab)} does not fit in uint16; "
        "use a wider dtype for train.bin / val.bin"
    )

np.array(train_tokens, dtype=np.uint16).tofile(out_dir / "train.bin")
np.array(val_tokens, dtype=np.uint16).tofile(out_dir / "val.bin")

meta = {
    "vocab_size": len(vocab),
    "itos": vocab,  # id -> token: the list index is the token id
    "stoi": {token: token_to_id[token] for token in vocab},  # token -> id
    "context_length": context_length
}
# meta.pkl is a pickle file: only load it back from trusted locations.
with (out_dir / "meta.pkl").open("wb") as f:
    pickle.dump(meta, f)