In [None]:
import random
from tqdm import tqdm
from datasets import load_dataset, Dataset
import chess
from attack_threat_prompt import build_llm_prompt
import os
from dotenv import load_dotenv
from datasets import load_dataset, Dataset

In [None]:
# Populate os.environ from a local .env file before reading the token.
load_dotenv()

from huggingface_hub import login

# Fail with an actionable message instead of a bare KeyError when the token
# is absent (e.g. the .env file was not found or does not define it).
if not os.environ.get("HF_TOKEN"):
    raise RuntimeError(
        "HF_TOKEN is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )

login(os.environ["HF_TOKEN"])

In [None]:
# Configuration
# The Attack-Threat Map only works from White's perspective for now, so
# positions are sampled every 2 plies (MOVE_INTERVAL = 2) to stay on White's
# turn. NOTE: this only holds while SKIP_FIRST_N_MOVES is even, because ply
# index 0 of each game is White's first move.

# Dataset identifier passed to load_dataset().
HF_DATASET_NAME = "austindavis/lichess-elite-uci"
# Destination passed to Dataset.save_to_disk().
OUTPUT_PATH = "chess-attack-threat-dataset"
# Number of leading rows (games) of the train split that are processed.
MAX_GAMES = 30000
# Sample one position every MOVE_INTERVAL plies once sampling has started.
MOVE_INTERVAL = 2
# Number of leading plies (individual UCI moves, not full move pairs) of each
# game that are never sampled.
SKIP_FIRST_N_MOVES = 4


def extract_prompt_move_pairs(uci_string):
    """Turn one game's UCI move list into prompt/response training examples.

    The game is replayed from the standard starting position. Starting at ply
    index ``SKIP_FIRST_N_MOVES`` and then every ``MOVE_INTERVAL`` plies, the
    position *before* the move is rendered with ``build_llm_prompt`` and paired
    with the move that was actually played, written in SAN.

    Replay stops at the first token that is either not parseable as a UCI move
    or not legal in the current position; examples collected up to that point
    are still returned.

    Args:
        uci_string: Whitespace-separated UCI moves for a single game.

    Returns:
        A list of dicts with keys ``"prompt"`` and ``"response"``, where the
        response has the form ``"MOVE: <san>"``. Empty if no position was
        sampled.
    """
    board = chess.Board()
    prompts = []

    for i, move_uci in enumerate(uci_string.split()):
        try:
            move = chess.Move.from_uci(move_uci)
        except ValueError:
            # Malformed token: treat it like an illegal move and keep the
            # examples gathered so far, rather than letting the exception
            # discard the whole game in the caller.
            break
        if move not in board.legal_moves:
            break

        is_sampled_ply = (
            i >= SKIP_FIRST_N_MOVES
            and (i - SKIP_FIRST_N_MOVES) % MOVE_INTERVAL == 0
        )
        if is_sampled_ply:
            # Both the prompt and the SAN must be computed before the move is
            # pushed: they describe the position the player was facing.
            prompt = build_llm_prompt(board)
            best_move_san = board.san(move)
            prompts.append({"prompt": prompt, "response": f"MOVE: {best_move_san}"})

        board.push(move)

    return prompts

# Load the dataset
print("Loading from HuggingFace...")
ds = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
# Clamp to the split size so select() cannot index past the end when the
# dataset holds fewer than MAX_GAMES rows.
ds = ds.select(range(min(MAX_GAMES, len(ds))))

# Process games
print("Processing games...")
all_examples = []
failed_games = 0
first_error = None
for row in tqdm(ds):
    try:
        all_examples.extend(extract_prompt_move_pairs(row["text"]))
    except Exception as e:
        # Best-effort: one bad game must not abort the run, but keep a count
        # and the first error so failures are visible instead of silent.
        failed_games += 1
        if first_error is None:
            first_error = repr(e)

print(
    f"Collected {len(all_examples)} examples from {len(ds)} games "
    f"({failed_games} games failed)."
)
if first_error is not None:
    print(f"First error: {first_error}")

# Save to disk
hf_dataset = Dataset.from_list(all_examples)
hf_dataset.save_to_disk(OUTPUT_PATH)

print("Done.")


In [None]:
# Bare expression so the notebook displays the dataset's repr as a final sanity check.
hf_dataset