In [1]:
import os
import h5py
import numpy as np
from tqdm import tqdm
import utils.data_utils as dpu

# Re-pack the per-game board/score datasets into fixed-size HDF5 shards.
#
# Input : boards.h5 and scores.h5, each holding one dataset per game key.
# Output: files "evalHDF0", "evalHDF1", ... in `out_path`, each with a
#         "boards" and a "scores" dataset of `batch_size` rows (the last shard
#         holds whatever is left over), followed by an inventory written via
#         dpu.write_inventory (format defined by that helper, not shown here).

source = "lc0"

# Directory that holds the input files and receives the output shards.
out_path = f"/root/chess-hackathon-3/data/{source}_board_evals"
boards_hdf_path = os.path.join(out_path, "boards.h5")
scores_hdf_path = os.path.join(out_path, "scores.h5")

# Number of positions per output shard.
batch_size = 2_000_000
if batch_size <= 0:
    raise ValueError(f"batch_size must be positive, got {batch_size}")

# Sizes and names of the shards written so far, in write order.
hdf_sizes = []
hdf_names = []


def _write_shard(board_chunks, score_chunks):
    """Concatenate the pending chunks and write them out as the next shard.

    The shard is named after the number of shards already written, and its
    name and row count are appended to `hdf_names` / `hdf_sizes`.

    Args:
        board_chunks: non-empty list of integer arrays, joined along axis 0.
        score_chunks: non-empty list of integer arrays with matching lengths.
    """
    hdf_name = f"evalHDF{len(hdf_names)}"
    out_hdf_path = os.path.join(out_path, hdf_name)

    # A single concatenation per shard replaces the per-game np.append, which
    # copied the whole (up to batch_size rows) buffer on every iteration.
    shard_boards = np.concatenate(board_chunks, axis=0)
    shard_scores = np.concatenate(score_chunks, axis=0)

    print(f"SAVING {len(shard_scores)} to {hdf_name}")
    with h5py.File(out_hdf_path, "w") as out_hdf:
        out_hdf.create_dataset("boards", data=shard_boards)
        out_hdf.create_dataset("scores", data=shard_scores)

    hdf_sizes.append(len(shard_scores))
    hdf_names.append(hdf_name)


# Pending rows for the shard currently being filled.
board_chunks = []
score_chunks = []
buffered = 0

# The context manager closes both input files even if the loop raises.
with h5py.File(boards_hdf_path, 'r') as boards_hdf, \
        h5py.File(scores_hdf_path, 'r') as scores_hdf:

    for key in tqdm(scores_hdf.keys()):

        try:
            game_boards = np.array(boards_hdf[key][:], dtype=int)
            game_scores = np.array(scores_hdf[key][:], dtype=int)
        except Exception as err:
            # Best-effort: skip unreadable games, but let KeyboardInterrupt /
            # SystemExit propagate and report why the game was skipped.
            print(f"FAILED: {key} ({err!r})")
            continue

        if len(game_boards) != len(game_scores):
            # Appending unequal lengths would misalign every later sample.
            print(
                f"FAILED: {key} (length mismatch: "
                f"{len(game_boards)} boards vs {len(game_scores)} scores)"
            )
            continue

        # Feed the game into the current shard; a game may straddle a shard
        # boundary (or, in principle, span several shards), hence the loop.
        while len(game_scores) > 0:
            take = min(batch_size - buffered, len(game_scores))
            board_chunks.append(game_boards[:take])
            score_chunks.append(game_scores[:take])
            buffered += take
            game_boards = game_boards[take:]
            game_scores = game_scores[take:]

            if buffered == batch_size:
                _write_shard(board_chunks, score_chunks)
                board_chunks, score_chunks, buffered = [], [], 0

# Flush the final, partially filled shard.
if buffered > 0:
    _write_shard(board_chunks, score_chunks)
    board_chunks, score_chunks, buffered = [], [], 0

# Total number of shard files written.
num_out_files = len(hdf_names)

print("Writing inventory.")
dpu.write_inventory(out_path, hdf_sizes, hdf_names)
print("Finished.")

  7%|▋         | 929/13486 [01:03<28:02,  7.46it/s]

SAVING 2000000 to evalHDF0


 14%|█▍        | 1904/13486 [02:12<25:21,  7.61it/s] 

SAVING 2000000 to evalHDF1


 21%|██        | 2857/13486 [03:18<23:15,  7.61it/s]  

SAVING 2000000 to evalHDF2


 28%|██▊       | 3816/13486 [04:22<20:54,  7.71it/s]  

SAVING 2000000 to evalHDF3


 35%|███▌      | 4740/13486 [05:26<18:45,  7.77it/s]  

SAVING 2000000 to evalHDF4


 42%|████▏     | 5639/13486 [06:27<16:44,  7.81it/s]  

SAVING 2000000 to evalHDF5


 49%|████▊     | 6541/13486 [07:29<14:55,  7.75it/s]

SAVING 2000000 to evalHDF6


 55%|█████▌    | 7440/13486 [08:31<13:07,  7.67it/s]

SAVING 2000000 to evalHDF7


 62%|██████▏   | 8328/13486 [09:32<11:08,  7.71it/s]

SAVING 2000000 to evalHDF8


 69%|██████▉   | 9291/13486 [10:38<09:01,  7.75it/s]

SAVING 2000000 to evalHDF9


 76%|███████▌  | 10245/13486 [11:42<07:03,  7.66it/s]

SAVING 2000000 to evalHDF10


 83%|████████▎ | 11208/13486 [12:48<04:58,  7.64it/s]

SAVING 2000000 to evalHDF11


 90%|█████████ | 12147/13486 [13:49<02:55,  7.65it/s]

SAVING 2000000 to evalHDF12


 97%|█████████▋| 13083/13486 [14:52<00:51,  7.87it/s]

SAVING 2000000 to evalHDF13


100%|██████████| 13486/13486 [15:04<00:00, 14.91it/s]


SAVING 813978 to evalHDF14
Writing inventory.
Finished.


In [1]:
from utils.datasets import EVAL_HDF_Dataset

# Smoke test: open the shard directory through EVAL_HDF_Dataset, print the
# total number of samples, then print the first (board, evaluation) pair.
# NOTE(review): presumably the dataset locates the shards through the inventory
# written into this directory — confirm against utils.datasets.
out_path = "/root/chess-hackathon-3/data/lc0_board_evals"
ds = EVAL_HDF_Dataset(out_path)
print(len(ds))

# Named `score` rather than `eval` so the builtin eval() is not shadowed in
# the kernel's global namespace.
board, score = ds[0]
print(board)
print(score)

28813978
tensor([[10,  8,  9, 11, 12,  9,  8, 10],
        [ 7,  7,  7,  7,  7,  7,  7,  7],
        [ 6,  6,  6,  6,  6,  6,  6,  6],
        [ 6,  6,  6,  6,  6,  6,  6,  6],
        [ 6,  6,  6,  6,  6,  6,  6,  6],
        [ 6,  6,  6,  6,  6,  6,  6,  4],
        [ 5,  5,  5,  5,  5,  5,  5,  5],
        [ 2,  4,  3,  1,  0,  3,  6,  2]])
tensor(-64)
