In [None]:
import os
import chess
import json
import random
from tqdm import tqdm
from dotenv import load_dotenv
from datasets import load_dataset, Dataset

In [None]:
# Load variables from a .env file (if one is found) into the process
# environment so HF_TOKEN can be read below.
load_dotenv()

from huggingface_hub import login

# Authenticate against the HuggingFace Hub; a missing HF_TOKEN raises KeyError.
login(token=os.environ['HF_TOKEN'])

In [None]:
# --- Configuration ---------------------------------------------------------
# Source dataset on the HuggingFace Hub and the local output directory.
HF_DATASET_NAME = "austindavis/lichess-elite-uci"
OUTPUT_PATH = "chess-sft-dataset-large"

# Number of games taken from the start of the split.
MAX_GAMES = 60_000
# Emit one example every MOVE_INTERVAL entries of the move list.
MOVE_INTERVAL = 1
# Number of leading entries of the move list that produce no example.
SKIP_FIRST_N_MOVES = 4

# Prompt wordings; one is picked at random per example. Each must contain the
# ``{fen}`` placeholder, which is filled in with ``str.format``.
PROMPT_TEMPLATES = [
    (
        "Current chess position (FEN): {fen}\n"
        "What is the best move in this position? "
        "Respond with just the move in standard algebraic notation."
    ),
    (
        "Given this position (FEN): {fen}, what is the best move? "
        "Reply only with the move in algebraic notation."
    ),
    "Position: {fen}\nBest move?",
    "Here is a chess board (FEN): {fen}\nYour move (SAN only):",
    "Given the following board (in FEN): {fen}, suggest the best move (use SAN).",
    (
        "Current FEN: {fen}\n"
        "Reply with only the move in standard algebraic notation (SAN):"
    ),
]

def extract_prompt_move_pairs(uci_string):
    """Turn one game's UCI move list into prompt/response training examples.

    The game is replayed from the standard starting position. For each
    sampled entry of the move list, the position *before* the move is
    rendered as FEN into a randomly chosen entry of ``PROMPT_TEMPLATES`` and
    paired with the move that was actually played, written in SAN.

    Args:
        uci_string: Whitespace-separated UCI moves (e.g. ``"e2e4 e7e5"``).

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts. Replay stops at
        the first token that is malformed or not legal in the current
        position; examples collected before that point are still returned.
    """
    board = chess.Board()
    prompts = []

    # ``ply`` counts individual entries of the move list (one side's move
    # each), which is the unit SKIP_FIRST_N_MOVES and MOVE_INTERVAL apply to.
    for ply, move_uci in enumerate(uci_string.split()):
        try:
            move = chess.Move.from_uci(move_uci)
        except ValueError:
            # A malformed token is handled like an illegal move: stop here
            # and keep the valid prefix rather than discarding the game.
            break
        if move not in board.legal_moves:
            break

        if ply >= SKIP_FIRST_N_MOVES and (ply - SKIP_FIRST_N_MOVES) % MOVE_INTERVAL == 0:
            prompt_template = random.choice(PROMPT_TEMPLATES)
            prompt = prompt_template.format(fen=board.fen())
            # SAN depends on the position, so compute it before pushing.
            best_move_san = board.san(move)
            prompts.append({"prompt": prompt, "response": best_move_san})

        board.push(move)

    return prompts

# Load the dataset (fully materialised, not streamed, so len() and select()
# are available).
print("Loading from HuggingFace...")
ds = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
# Clamp to the split size so select() cannot index past the last row when the
# dataset holds fewer than MAX_GAMES games.
ds = ds.select(range(min(MAX_GAMES, len(ds))))

# Process games
print("Processing games...")
all_examples = []
skipped_games = 0
for row in tqdm(ds):
    try:
        examples = extract_prompt_move_pairs(row["text"])
    except Exception:
        # Best-effort: one bad row must not abort the whole run, but count
        # it so silently dropped games are visible in the summary below.
        skipped_games += 1
        continue
    all_examples.extend(examples)

if skipped_games:
    print(f"Skipped {skipped_games} game(s) that raised during processing.")

# Convert to HuggingFace Dataset directly
hf_dataset = Dataset.from_list(all_examples)
hf_dataset.save_to_disk(OUTPUT_PATH)

print("Done.")

In [None]:
# Sanity check: read the dataset back from OUTPUT_PATH.
hf_dataset = Dataset.load_from_disk(OUTPUT_PATH)

# Print rows 0 through 4 to eyeball the prompt/response pairs.
print(hf_dataset[0:5])

In [None]:
# Bare expression: in a notebook cell this displays the dataset's repr.
hf_dataset