In [None]:
%pip install python-chess

In [6]:
from datasets import load_dataset
from itertools import islice
from google.colab import userdata
api = userdata.get('HF_TOKEN')
ds = load_dataset(
    "Lichess/standard-chess-games",
    split="train",
    streaming=True,
    token=api
)



Resolving data files:   0%|          | 0/26138 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26138 [00:00<?, ?it/s]

In [7]:
ds

IterableDataset({
    features: ['Event', 'Site', 'White', 'Black', 'Result', 'WhiteTitle', 'BlackTitle', 'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff', 'UTCDate', 'UTCTime', 'ECO', 'Opening', 'Termination', 'TimeControl', 'movetext'],
    num_shards: 26138
})

In [8]:
ds = ds.select_columns(['movetext',
    "WhiteElo",
    "BlackElo",
    "WhiteRatingDiff",
    "BlackRatingDiff",
    "Opening",
    "ECO"])

In [9]:
ds

IterableDataset({
    features: ['movetext', 'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff', 'Opening', 'ECO'],
    num_shards: 26138
})

In [38]:
len(small_ds)

2000

In [12]:
ds = ds.rename_columns({
    "WhiteElo": "white_elo",
    "BlackElo": "black_elo",
    "WhiteRatingDiff": "white_diff",
    "BlackRatingDiff": "black_diff",
    "Opening": "opening",
    "ECO": "eco"
})


In [14]:
ds.features

{'movetext': Value('string'),
 'white_elo': Value('int16'),
 'black_elo': Value('int16'),
 'white_diff': Value('int16'),
 'black_diff': Value('int16'),
 'opening': Value('string'),
 'eco': Value('string')}

In [16]:
import re

def clean_movetext(movetext: str):
    # remove result like 1-0, 0-1, 1/2-1/2
    movetext = re.sub(r"1-0|0-1|1/2-1/2", "", movetext)

    # remove move numbers like "1.", "23..."
    movetext = re.sub(r"\d+\.", "", movetext)

    # split into tokens
    moves = movetext.strip().split()

    return moves


In [26]:
import chess

def generate_samples(movetext):
    """
    Returns list of (board_fen, move_uci)
    """
    board = chess.Board()
    samples = []

    moves = clean_movetext(movetext)
    print(moves)
    for san in moves:
        try:
            move = board.parse_san(san)
        except Exception:
            # invalid or corrupted move → skip game
            return []

        # save state BEFORE move
        board_fen = board.fen()
        move_uci = move.uci()

        samples.append((board_fen, move_uci))

        board.push(move)

    return samples


In [27]:
sample = next(iter(ds))

samples = generate_samples(sample["movetext"])

print("Number of move-samples:", len(samples))
print("First sample:")
print("Board FEN:", samples[0][0])
print("Next move (UCI):", samples[0][1])


['e4', 'e6', 'd4', 'b6', 'a3', 'Bb7', 'Nc3', 'Nh6', 'Bxh6', 'gxh6', 'Be2', 'Qg5', 'Bg4', 'h5', 'Nf3', 'Qg6', 'Nh4', 'Qg5', 'Bxh5', 'Qxh4', 'Qf3', 'Kd8', 'Qxf7', 'Nc6', 'Qe8#']
Number of move-samples: 25
First sample:
Board FEN: rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1
Next move (UCI): e2e4


In [None]:
all_samples = []

for i, row in enumerate(ds):
    game_samples = generate_samples(row["movetext"])
    all_samples.extend(game_samples)

    if i == 5:   # only first 5 games for testing
        break

print("Total samples collected:", len(all_samples))


In [None]:
all_samples.sample(5)

In [20]:
import chess
import numpy as np

PIECE_TO_CHANNEL = {
    chess.PAWN:   0,
    chess.KNIGHT: 1,
    chess.BISHOP: 2,
    chess.ROOK:   3,
    chess.QUEEN:  4,
    chess.KING:   5
}

def board_to_tensor(fen: str):
    board = chess.Board(fen)
    tensor = np.zeros((12, 8, 8), dtype=np.float32)

    for square in chess.SQUARES:
        piece = board.piece_at(square)
        if piece is None:
            continue

        row = 7 - (square // 8)
        col = square % 8

        channel = PIECE_TO_CHANNEL[piece.piece_type]
        if piece.color == chess.BLACK:
            channel += 6

        tensor[channel, row, col] = 1.0

    return tensor


In [21]:
fen = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"

tensor = board_to_tensor(fen)

print(tensor.shape)          # (12, 8, 8)
print(tensor.sum())          # 32 pieces


(12, 8, 8)
32.0


In [50]:
import pandas as pd
small_ds = list(islice(ds, 4000))
df = pd.DataFrame(small_ds)
len(df)

4000

In [None]:
df.to_csv("small_dataset.csv", index=False)