In [1]:
import os
import pickle
import numpy as np
from glob import glob
from tqdm import tqdm

# ======================================
# SETTINGS
# ======================================
# Directory that build_vocab() and encode() scan for "*.abc" files.
ABC_DIR = "/kaggle/input/abc-notation-cleaned/ABC-4_clean"
# Destination of the flat uint16 token stream written by encode().
OUTPUT_BIN = "/kaggle/working/dataset.bin"
# Destination of the pickled vocabulary metadata (vocab_size, itos, stoi).
META_FILE = "/kaggle/working/meta.pkl"

# ======================================
# 1. FIRST PASS — Build vocab (streaming)
# ======================================
def build_vocab(folder):
    """Scan every ``*.abc`` file in ``folder`` and derive a character vocabulary.

    Files are read line by line (UTF-8, undecodable bytes dropped), and lines
    beginning with ``T:`` are left out of the vocabulary.

    Args:
        folder: Directory containing the ``*.abc`` files.

    Returns:
        A ``(stoi, itos, vocab_size)`` tuple, where ``stoi`` maps character to
        integer id, ``itos`` is the inverse mapping, and ``vocab_size`` is the
        number of distinct characters. Ids follow the sorted character order.
    """
    abc_paths = glob(os.path.join(folder, "*.abc"))
    print("Total ABC files found:", len(abc_paths))

    seen = set()
    for path in tqdm(abc_paths, desc="Building vocab"):
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            # Stream the file; "T:" lines contribute nothing to the vocabulary.
            seen.update(
                ch
                for row in handle
                if not row.startswith("T:")
                for ch in row
            )

    unique_chars = sorted(seen)
    vocab_size = len(unique_chars)

    print("\n===== FINAL VOCAB =====")
    print("Vocab size:", vocab_size)
    print(unique_chars)
    print("=======================\n")

    # Build the id -> char table first, then invert it for char -> id.
    itos = dict(enumerate(unique_chars))
    stoi = {ch: idx for idx, ch in itos.items()}

    return stoi, itos, vocab_size


# ======================================
# 2. SECOND PASS — Encode and write to .bin (streaming)
# ======================================
def encode(folder, stoi, output_path):
    """Encode every ``*.abc`` file in ``folder`` into one flat uint16 token file.

    Each file is read as UTF-8 (undecodable bytes dropped), lines starting
    with ``T:`` are skipped, and the remaining characters are mapped through
    ``stoi`` and appended to ``output_path`` as raw ``uint16`` values.

    Args:
        folder: Directory containing the ``*.abc`` files.
        stoi: Mapping from character to integer token id.
        output_path: Path of the binary file to create (overwritten if present).

    Returns:
        Total number of tokens written.

    Raises:
        ValueError: If any id in ``stoi`` does not fit in ``uint16``.
        KeyError: If a file contains a character that is missing from ``stoi``.
    """
    # glob() yields files in arbitrary order; sort so the token stream (and any
    # contiguous train/val/test split taken from it later) is reproducible.
    files = sorted(glob(os.path.join(folder, "*.abc")))

    # Fail loudly instead of letting out-of-range ids wrap or overflow on cast.
    uint16_max = np.iinfo(np.uint16).max
    if any(idx < 0 or idx > uint16_max for idx in stoi.values()):
        raise ValueError("Token ids in stoi must fit in uint16 (0..65535).")

    newline_id = stoi.get("\n")
    total_tokens = 0

    with open(output_path, "wb") as out:
        for f in tqdm(files, desc="Encoding to BIN"):
            with open(f, "r", encoding="utf-8", errors="ignore") as fp:
                # Skip titles
                text = "".join(line for line in fp if not line.startswith("T:"))

            if not text:
                continue

            try:
                token_ids = [stoi[c] for c in text]
            except KeyError as exc:
                raise KeyError(
                    f"Character {exc.args[0]!r} in {f} is not in the vocabulary"
                ) from None

            # Keep files separated: without a trailing newline the last line of
            # this file would be fused with the first line of the next one.
            if newline_id is not None and not text.endswith("\n"):
                token_ids.append(newline_id)

            # One array and one write per file instead of one per line.
            arr = np.array(token_ids, dtype=np.uint16)
            arr.tofile(out)
            total_tokens += int(arr.size)

    return total_tokens


# ======================================
# MAIN
# ======================================
if __name__ == "__main__":

    # Pass 1: scan the corpus once to derive the character vocabulary.
    print("\nSTEP 1 — BUILD VOCAB")
    stoi, itos, vocab_size = build_vocab(ABC_DIR)

    # Pass 2: re-read the corpus and stream the token ids to OUTPUT_BIN.
    print("\nSTEP 2 — ENCODE DATASET")
    total_tokens = encode(ABC_DIR, stoi, OUTPUT_BIN)

    print("\nSaved dataset.bin →", OUTPUT_BIN)
    print("Total tokens:", total_tokens)

    # Persist the vocabulary next to the token file so it can be decoded later.
    meta = {"vocab_size": vocab_size, "itos": itos, "stoi": stoi}
    with open(META_FILE, "wb") as meta_fp:
        pickle.dump(meta, meta_fp)

    print("\nSaved meta.pkl →", META_FILE)
    print("\nTokenization COMPLETED SUCCESSFULLY")



STEP 1 — BUILD VOCAB
Total ABC files found: 175609


Building vocab: 100%|██████████| 175609/175609 [02:42<00:00, 1082.67it/s]



===== FINAL VOCAB =====
Vocab size: 99
['\t', '\n', '\x0b', '\x0c', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']


STEP 2 — ENCODE DATASET


Encoding to BIN: 100%|██████████| 175609/175609 [47:34<00:00, 61.52it/s] 


Saved dataset.bin → /kaggle/working/dataset.bin
Total tokens: 3844030892

Saved meta.pkl → /kaggle/working/meta.pkl

Tokenization COMPLETED SUCCESSFULLY





In [2]:
import numpy as np
import os
import pickle

# ======================================
# INPUT / OUTPUT PATHS
# ======================================

# Token stream to split; it is memory-mapped below as a flat uint16 array.
INPUT_BIN = "/kaggle/working/dataset.bin"    # your big token file
# Pickled metadata; this cell reads its "vocab_size" entry.
META_FILE = "/kaggle/working/meta.pkl"

# Output files for the train / validation / test splits.
TRAIN_BIN = "/kaggle/working/train.bin"
VAL_BIN   = "/kaggle/working/val.bin"
TEST_BIN  = "/kaggle/working/test.bin"

# ======================================
# 1. LOAD VOCAB + META
# ======================================

# Restore the metadata dictionary stored in META_FILE.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(META_FILE, "rb") as meta_fp:
    meta = pickle.load(meta_fp)

vocab_size = meta["vocab_size"]
print("Vocab size =", vocab_size)

# ======================================
# 2. MEMORY MAP DATASET (DO NOT LOAD)
# ======================================
print("Memory-mapping dataset...")

# Read-only mapping: the token file is paged in on demand, not read up front.
data = np.memmap(INPUT_BIN, mode='r', dtype=np.uint16)
N = data.shape[0]

print("Total tokens found:", N)

# Size check: enforced on the training split once the split sizes are known.

# ======================================
# 3. DEFINE SPLIT SIZES
# ======================================
# 98% train, 1% validation, and whatever is left over (about 1%) for test.
val_size   = int(0.01 * N)
train_size = int(0.98 * N)
test_size  = N - (train_size + val_size)

print("\nSplit sizes:")
print("Train:", train_size)
print("Val:  ", val_size)
print("Test: ", test_size)

assert train_size >= 100_000_000, "Training split must have 100M+ tokens."

# ======================================
# 4. WRITE SPLITS USING MEMMAP
# ======================================

def save_split(out_path, start, end, source=None, chunk_tokens=1 << 24):
    """Write tokens ``[start, end)`` of the dataset to ``out_path`` as raw uint16.

    The range is copied in bounded chunks through an ordinary file handle, so
    memory use stays small for multi-gigabyte splits, the data is flushed when
    the file is closed, and a zero-length range simply produces an empty file.

    Args:
        out_path: Destination file (created or overwritten).
        start: Index of the first token to copy (inclusive).
        end: Index one past the last token to copy (exclusive).
        source: Array-like of tokens to slice from. Defaults to the
            notebook-level ``data`` memmap when omitted.
        chunk_tokens: Maximum number of tokens copied per write.

    Raises:
        ValueError: If the range is not within ``0 <= start <= end <= len(source)``
            or ``chunk_tokens`` is not positive.
    """
    src = data if source is None else source
    total = len(src)

    if not 0 <= start <= end <= total:
        raise ValueError(f"Invalid split range [{start}, {end}) for {total} tokens.")
    if chunk_tokens <= 0:
        raise ValueError("chunk_tokens must be a positive integer.")

    print(f"Writing {out_path} ...")
    with open(out_path, "wb") as out:
        for lo in range(start, end, chunk_tokens):
            hi = min(lo + chunk_tokens, end)
            # asarray keeps the on-disk dtype fixed at uint16 regardless of source dtype.
            np.asarray(src[lo:hi], dtype=np.uint16).tofile(out)

# The three splits are contiguous, non-overlapping ranges of the token stream:
# train = [0, train_size), val = the next val_size tokens, test = the rest up to N.
save_split(TRAIN_BIN, 0, train_size)
save_split(VAL_BIN, train_size, train_size + val_size)
save_split(TEST_BIN, train_size + val_size, N)

print("\nSplit complete!")
print("Saved:")
for saved_path in (TRAIN_BIN, VAL_BIN, TEST_BIN):
    print(" →", saved_path)


Vocab size = 99
Memory-mapping dataset...
Total tokens found: 3844030892

Split sizes:
Train: 3767150274
Val:   38440308
Test:  38440310
Writing /kaggle/working/train.bin ...
Writing /kaggle/working/val.bin ...
Writing /kaggle/working/test.bin ...

Split complete!
Saved:
 → /kaggle/working/train.bin
 → /kaggle/working/val.bin
 → /kaggle/working/test.bin
