In [5]:
import lmdb
import os
from tqdm import tqdm  

# Split one large LMDB database into fixed-size shard databases.
#
# Records are looked up by index: record ``i`` is expected under the key
# ``i.to_bytes(8, "big")``. Indices with no stored value are counted and
# reported at the end instead of being dropped silently.
input_lmdb = "chess_dataset.lmdb"
output_dir = "chess_shards"
shard_size = 2_000_000  # number of record indices covered by each shard
shard_map_size_gb = 10  # LMDB map size reserved per shard, in GiB
commit_interval = 100_000  # record indices processed between intermediate commits

os.makedirs(output_dir, exist_ok=True)

shard_idx = 0
missing_records = 0
env_out = None
txn_out = None

# Open input LMDB
env_in = lmdb.open(input_lmdb, readonly=True, lock=False)
try:
    # A single read transaction serves both the record count and the copy loop.
    with env_in.begin() as txn_in:
        total_records = txn_in.stat()['entries']
        print(f"Total records: {total_records}")

        for i in tqdm(range(total_records), desc="Sharding LMDB"):
            if i % shard_size == 0:
                # Close previous shard
                if txn_out is not None:
                    txn_out.commit()
                    txn_out = None
                    env_out.close()
                    env_out = None
                # Open new shard
                shard_path = os.path.join(
                    output_dir, f"chess_dataset_shard_{shard_idx}.lmdb"
                )
                env_out = lmdb.open(
                    shard_path, map_size=shard_map_size_gb * (1024**3)
                )
                txn_out = env_out.begin(write=True)
                shard_idx += 1
                print(f"Starting shard {shard_idx} at record {i}")

            key = i.to_bytes(8, "big")
            val = txn_in.get(key)
            # Compare against None explicitly: an empty value (b"") is still a
            # stored record and must be copied, only absent keys are skipped.
            if val is None:
                missing_records += 1
            else:
                txn_out.put(key, val)

            # Commit periodically so a failure loses at most one interval of
            # writes for the current shard.
            if (i + 1) % commit_interval == 0:
                txn_out.commit()
                txn_out = env_out.begin(write=True)

    # Commit last shard
    if txn_out is not None:
        txn_out.commit()
        txn_out = None
finally:
    # Reached with a pending write transaction only when an error interrupted
    # the loop: discard the uncommitted tail, then release both environments.
    if txn_out is not None:
        txn_out.abort()
    if env_out is not None:
        env_out.close()
    env_in.close()

if missing_records:
    print(
        f"Warning: {missing_records} of {total_records} expected keys "
        "were not found in the input and were skipped"
    )

print(f"Done! Created {shard_idx} shards in {output_dir}")

Total records: 21235397


NameError: name 'tqdm' is not defined