In [1]:
import sentencepiece as spm
import json
import os
from tqdm import tqdm

# -------- CONFIG --------
# SentencePiece model file loaded by the tokenizer below.
MODEL_PATH = "tokenizer/unigram_32000_0.9995.model"
# UTF-8 text input; every non-blank line becomes one output record.
INPUT_TXT = "input.txt"
# Destination file; one JSON object per line (overwritten on each run).
OUTPUT_JSONL = "tokenized_output.jsonl"

# When True, the tokenizer's EOS id is appended to every record's ids.
ADD_EOS = True
# ------------------------

# Validate the inputs with explicit exceptions instead of `assert`:
# assertions are removed when Python runs with -O, which would silently
# skip these checks and defer the failure to a less helpful error later.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Tokenizer not found: {MODEL_PATH}")
if not os.path.exists(INPUT_TXT):
    raise FileNotFoundError(f"Input file not found: {INPUT_TXT}")

# Load tokenizer
sp = spm.SentencePieceProcessor()
sp.load(MODEL_PATH)

eos_id = sp.eos_id()

print("Tokenizer loaded")
print(f"Vocab size: {sp.get_piece_size()}")
print(f"EOS id: {eos_id}")

# Fail before the output file is opened (and truncated) if EOS was requested
# but the loaded model reports no EOS id.
if ADD_EOS and eos_id == -1:
    raise ValueError("Tokenizer has no EOS token!")

with open(INPUT_TXT, "r", encoding="utf-8") as fin, \
     open(OUTPUT_JSONL, "w", encoding="utf-8") as fout:

    # Stream the input line by line so the whole file is never held in memory.
    for line in tqdm(fin, desc="Tokenizing"):
        line = line.strip()
        if not line:
            # Blank / whitespace-only lines produce no record.
            continue

        token_ids = sp.encode(line, out_type=int)

        if ADD_EOS:
            token_ids.append(eos_id)

        record = {
            "input_ids": token_ids
        }

        # One JSON object per line (JSONL).
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Tokenization completed.")
print(f"Output written to: {OUTPUT_JSONL}")

Tokenizer loaded
Vocab size: 32000
EOS id: 3


Tokenizing: 13it [00:00, 51102.11it/s]

Tokenization completed.
Output written to: tokenized_output.jsonl



