In [6]:
import lmdb
import os
from tqdm import tqdm  

# --- Configuration -----------------------------------------------------------
input_lmdb = "chess_dataset.lmdb"   # source LMDB environment to split
output_dir = "chess_shards"         # directory that receives the shard environments
shard_size = 2_000_000              # number of source records covered by each shard
shard_map_size_gb = 10              # map_size given to every shard, in GiB
commit_interval = 100_000           # records between intermediate commits within a shard

os.makedirs(output_dir, exist_ok=True)

# --- Sharding state ----------------------------------------------------------
shard_idx = 0          # number of shards opened so far (also the next shard's file index)
key_counter = 0        # number of source keys processed so far
missing_records = 0    # keys in [0, total_records) that were absent from the input
env_out = None
txn_out = None

# Open input LMDB read-only. Everything below runs inside try/finally so that
# the input environment, the current shard environment, and any in-flight
# write transaction are released even if the copy fails part-way through.
env_in = lmdb.open(input_lmdb, readonly=True, lock=False)
try:
    # Entry count of the default (unnamed) database.
    total_records = env_in.stat()['entries']
    print(f"Total records: {total_records}")

    with env_in.begin() as txn_in:
        # Keys are looked up as 8-byte big-endian integers 0..total_records-1.
        for i in tqdm(range(total_records), desc="Sharding LMDB"):
            if key_counter % shard_size == 0:
                # Finish the previous shard before starting the next one.
                if txn_out is not None:
                    txn_out.commit()
                    txn_out = None
                    env_out.close()
                    env_out = None
                # Open new shard
                shard_path = os.path.join(output_dir, f"chess_dataset_shard_{shard_idx}.lmdb")
                env_out = lmdb.open(shard_path, map_size=shard_map_size_gb * (1024**3))
                txn_out = env_out.begin(write=True)
                shard_idx += 1
                print(f"Starting shard {shard_idx} at record {i}")

            key = i.to_bytes(8, "big")
            val = txn_in.get(key)
            if val is not None:
                # `is not None` (rather than truthiness) keeps records whose
                # stored value is the empty byte string.
                txn_out.put(key, val)
            else:
                # Do not drop gaps silently; they are reported after the run.
                missing_records += 1

            # Commit periodically so a single write transaction never grows
            # unbounded. Skipped on the first record of a shard, where the
            # transaction was opened a moment ago and holds one record.
            if key_counter % commit_interval == 0 and key_counter % shard_size != 0:
                txn_out.commit()
                txn_out = None  # never leave a handle to a finished transaction
                txn_out = env_out.begin(write=True)

            key_counter += 1

    # Commit last shard
    if txn_out is not None:
        txn_out.commit()
        txn_out = None
finally:
    # A transaction still referenced here means the loop failed: discard it.
    if txn_out is not None:
        txn_out.abort()
    if env_out is not None:
        env_out.close()
    env_in.close()

if missing_records:
    print(f"Warning: {missing_records} of {total_records} expected keys were not found in {input_lmdb}")
print(f"Done! Created {shard_idx} shards in {output_dir}")

Total records: 21235397


Sharding LMDB:   0%|                                                                           | 21019/21235397 [00:00<01:41, 209141.58it/s]

Starting shard 1 at record 0


Sharding LMDB:  10%|███████                                                                   | 2028929/21235397 [00:12<03:37, 88199.06it/s]

Starting shard 2 at record 2000000


Sharding LMDB:  19%|██████████████                                                            | 4026300/21235397 [00:27<03:46, 75845.23it/s]

Starting shard 3 at record 4000000


Sharding LMDB:  28%|████████████████████▉                                                     | 6023734/21235397 [00:42<03:34, 71072.92it/s]

Starting shard 4 at record 6000000


Sharding LMDB:  38%|███████████████████████████▋                                             | 8055066/21235397 [00:54<01:14, 177408.43it/s]

Starting shard 5 at record 8000000


Sharding LMDB:  47%|█████████████████████████████████▉                                      | 10024058/21235397 [01:04<01:08, 163067.53it/s]

Starting shard 6 at record 10000000


Sharding LMDB:  57%|████████████████████████████████████████▊                               | 12027262/21235397 [01:15<00:48, 188569.82it/s]

Starting shard 7 at record 12000000


Sharding LMDB:  66%|███████████████████████████████████████████████▌                        | 14045205/21235397 [01:26<00:34, 205958.13it/s]

Starting shard 8 at record 14000000


Sharding LMDB:  76%|██████████████████████████████████████████████████████▍                 | 16037858/21235397 [01:36<00:28, 184523.75it/s]

Starting shard 9 at record 16000000


Sharding LMDB:  85%|█████████████████████████████████████████████████████████████▏          | 18033083/21235397 [01:46<00:17, 185708.71it/s]

Starting shard 10 at record 18000000


Sharding LMDB:  94%|███████████████████████████████████████████████████████████████████▉    | 20032579/21235397 [01:55<00:06, 181966.05it/s]

Starting shard 11 at record 20000000


Sharding LMDB: 100%|████████████████████████████████████████████████████████████████████████| 21235397/21235397 [02:01<00:00, 174340.70it/s]

Done! Created 11 shards in chess_shards





In [None]:
# Colab-only cell: `google.colab` is importable only inside a Google Colab runtime.
from google.colab import drive
# Mount the user's Google Drive at /content/drive (triggers Colab's auth prompt).
drive.mount('/content/drive')

# Directory name matching the sharding cell's `output_dir` value.
# NOTE(review): presumably the folder the shards are read from for upload — confirm.
commit_dir = "chess_shards"

# Ask which shard to upload; int() raises ValueError on non-numeric input.
# NOTE(review): no range check is done here, and shard files are numbered from 0
# while the "Starting shard N" log lines count from 1 — confirm which one the
# user is expected to type.
idx = int(input("Which shard would you like to upload?"))

