In [1]:
import chess
import h5py
import numpy as np
import pandas as pd

# Map each file letter ('a'..'h') to its zero-based column in the board matrix.
board_positions = {
    file_letter: column for column, file_letter in enumerate("abcdefgh")
}

# Translate a chess square into (row, column) indices of the 8x8 board matrix.
def square_to_index(square):
    """Return the ``(row, column)`` matrix position of ``square``.

    The square's name (as produced by ``chess.square_name``) is read as a
    file letter followed by a rank digit. The row is ``8 - rank`` so that
    rank 8 lands on row 0, and the column is looked up in ``board_positions``.
    """
    name = chess.square_name(square)
    file_letter, rank_digit = name[0], name[1]
    return 8 - int(rank_digit), board_positions[file_letter]

# Function to convert board to matrix
def board_to_matrix(board):
    """Encode ``board`` as a ``(14, 8, 8)`` ``int8`` array of 0/1 planes.

    Plane layout:
      * ``piece - 1`` for each piece type of White (one plane per type),
      * ``piece + 5`` for each piece type of Black,
      * 12: destination squares of the moves that are legal with White to move,
      * 13: destination squares of the moves that are legal with Black to move.

    Within a plane, row 0 holds the highest rank and column 0 holds file 'a'
    (square index ``s`` maps to row ``7 - s // 8``, column ``s % 8``).

    The board's ``turn`` is overridden temporarily to generate moves for both
    sides; it is always restored before returning, even if move generation
    raises.

    Args:
        board: a ``chess.Board``-like object exposing ``pieces``, ``turn`` and
            ``legal_moves``.

    Returns:
        numpy.ndarray of shape ``(14, 8, 8)`` and dtype ``int8``.
    """
    board_3d = np.zeros((14, 8, 8), dtype=np.int8)

    # Piece-placement planes: the same loop serves both colours, only the
    # plane offset differs.
    for colour, plane_offset in ((chess.WHITE, -1), (chess.BLACK, 5)):
        for piece in chess.PIECE_TYPES:
            for square in board.pieces(piece, colour):
                rank, file = divmod(square, 8)
                board_3d[piece + plane_offset, 7 - rank, file] = 1

    # Move-destination planes. The side to move is forced to each colour in
    # turn; try/finally guarantees the caller's board gets its original turn
    # back even when an exception interrupts move generation.
    # NOTE(review): only ``turn`` is flipped; other position state (e.g. an
    # en-passant square) is left as-is -- confirm that is the intended input
    # for the "other side" plane.
    original_turn = board.turn
    try:
        for colour, plane in ((chess.WHITE, 12), (chess.BLACK, 13)):
            board.turn = colour
            for move in board.legal_moves:
                i, j = square_to_index(move.to_square)
                board_3d[plane, i, j] = 1
    finally:
        board.turn = original_turn

    return board_3d

# Right-pad a list with a filler value until it reaches the requested length.
def pad_list(lst, length=5, pad_value=None):
    """Return a new list: ``lst`` followed by enough ``pad_value`` items to reach ``length``.

    A list that is already ``length`` items or longer comes back as an
    unchanged copy (nothing is truncated).
    """
    missing = length - len(lst)
    filler = [pad_value for _ in range(missing)]
    return lst + filler

# Helper: split a stringified list of strings into its items.
def _parse_str_list(cell):
    """Parse a cell such as ``"['e2e4', 'd2d4']"`` into ``['e2e4', 'd2d4']``.

    An empty stringified list (``"[]"``) yields ``[]``.
    """
    inner = cell.strip('[]')
    if not inner.strip():
        return []
    return inner.replace("'", "").split(", ")


# Helper: split a stringified list of integers into its items.
def _parse_int_list(cell):
    """Parse a cell such as ``"[35, None, -12]"`` into ``[35, 0, -12]``.

    ``None`` entries become 0. An empty stringified list (``"[]"``) yields
    ``[]`` instead of failing on ``int('')``.
    """
    inner = cell.strip('[]')
    if not inner.strip():
        return []
    return [0 if token == 'None' else int(token) for token in inner.split(', ')]


# Function to process and save boards
def process_and_save_boards(csv_file, output_file, batch_size=1000):
    """Convert every position listed in a CSV into an HDF5 dataset file.

    The CSV must provide the columns ``FEN``, ``GameID``, ``MoveID``, ``Move``,
    ``TopMoves``, ``BestMove``, ``Centipawns``, ``Mates``, ``MoveSequence`` and
    ``Player``. The list-valued columns are stored as stringified lists and
    are padded to five entries (``""`` for strings, ``0`` for integers); a
    list with more than five entries does not fit the datasets and raises.

    Rows are written by position in the file (0 .. n-1), independent of the
    DataFrame's index labels. Each batch is assembled in memory and written
    to every dataset with a single slice assignment.

    Args:
        csv_file: path of the input CSV.
        output_file: path of the HDF5 file to create (overwritten if present).
        batch_size: number of rows encoded and written per batch; must be >= 1.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the original loop
            would never terminate), or if a list cell cannot be parsed.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df = pd.read_csv(csv_file)
    total_boards = len(df)
    str_dtype = h5py.string_dtype(encoding='utf-8')

    with h5py.File(output_file, "w") as hf_out:
        # Keep the dataset handles so the write loop does not have to look
        # them up by name again.
        ds_board = hf_out.create_dataset("board_matrix", (total_boards, 14, 8, 8), dtype=np.int8)
        ds_game_id = hf_out.create_dataset("GameID", (total_boards,), dtype=np.int32)
        ds_move_id = hf_out.create_dataset("MoveID", (total_boards,), dtype=np.int32)
        ds_player_move = hf_out.create_dataset("PlayerMove", (total_boards,), dtype=str_dtype)
        ds_top_moves = hf_out.create_dataset("TopMoves", (total_boards, 5), dtype=str_dtype)
        ds_best_move = hf_out.create_dataset("BestMove", (total_boards,), dtype=str_dtype)
        ds_centipawns = hf_out.create_dataset("Centipawns", (total_boards, 5), dtype=np.int32)
        ds_mates = hf_out.create_dataset("Mates", (total_boards, 5), dtype=np.int32)
        ds_move_sequence = hf_out.create_dataset("MoveSequence", (total_boards, 5), dtype=str_dtype)
        ds_player = hf_out.create_dataset("Player", (total_boards,), dtype=str_dtype)

        for start in range(0, total_boards, batch_size):
            # iloc slices by position, so ``start``/``stop`` are valid HDF5
            # row numbers whatever the DataFrame index looks like.
            batch = df.iloc[start:start + batch_size]
            stop = start + len(batch)

            boards = np.zeros((len(batch), 14, 8, 8), dtype=np.int8)
            for offset, fen in enumerate(batch["FEN"]):
                boards[offset] = board_to_matrix(chess.Board(fen))

            top_moves = [pad_list(_parse_str_list(cell), 5, "") for cell in batch["TopMoves"]]
            move_sequences = [pad_list(_parse_str_list(cell), 5, "") for cell in batch["MoveSequence"]]
            # Convert Centipawns and Mates from string to list of integers, pad if necessary
            centipawns = [pad_list(_parse_int_list(cell), 5, 0) for cell in batch["Centipawns"]]
            mates = [pad_list(_parse_int_list(cell), 5, 0) for cell in batch["Mates"]]

            ds_board[start:stop] = boards
            ds_game_id[start:stop] = batch["GameID"].to_numpy()
            ds_move_id[start:stop] = batch["MoveID"].to_numpy()
            ds_player_move[start:stop] = batch["Move"].to_numpy(dtype=object)
            ds_top_moves[start:stop] = np.array(top_moves, dtype=object)
            ds_best_move[start:stop] = batch["BestMove"].to_numpy(dtype=object)
            ds_centipawns[start:stop] = np.array(centipawns, dtype=np.int32)
            ds_mates[start:stop] = np.array(mates, dtype=np.int32)
            ds_move_sequence[start:stop] = np.array(move_sequences, dtype=object)
            ds_player[start:stop] = batch["Player"].to_numpy(dtype=object)

            print(f"Processed {stop} / {total_boards} boards.")

        print("Processing complete.")

if __name__ == "__main__":
    # Input CSV handed to process_and_save_boards -- point this at your own file.
    csv_file = "./enhanced_magnus_moves_dataset_v3.csv"
    # HDF5 file that will be created -- change to the desired destination.
    output_file = "magnus_datasetv3.h5"
    process_and_save_boards(csv_file=csv_file, output_file=output_file)


Processed 1000 / 223622 boards.
Processed 2000 / 223622 boards.
Processed 3000 / 223622 boards.
Processed 4000 / 223622 boards.
Processed 5000 / 223622 boards.
Processed 6000 / 223622 boards.
Processed 7000 / 223622 boards.
Processed 8000 / 223622 boards.
Processed 9000 / 223622 boards.
Processed 10000 / 223622 boards.
Processed 11000 / 223622 boards.
Processed 12000 / 223622 boards.
Processed 13000 / 223622 boards.
Processed 14000 / 223622 boards.
Processed 15000 / 223622 boards.
Processed 16000 / 223622 boards.
Processed 17000 / 223622 boards.
Processed 18000 / 223622 boards.
Processed 19000 / 223622 boards.
Processed 20000 / 223622 boards.
Processed 21000 / 223622 boards.
Processed 22000 / 223622 boards.
Processed 23000 / 223622 boards.
Processed 24000 / 223622 boards.
Processed 25000 / 223622 boards.
Processed 26000 / 223622 boards.
Processed 27000 / 223622 boards.
Processed 28000 / 223622 boards.
Processed 29000 / 223622 boards.
Processed 30000 / 223622 boards.
Processed 31000 / 2