In [1]:
import sentencepiece as spm
import json
import os
from tqdm import tqdm

# -------- CONFIG --------
MODEL_PATH = "tokenizer/unigram_32000_0.9995.model"
INPUT_TXT = "input.txt"
OUTPUT_JSONL = "tokenized_output.jsonl"

ADD_EOS = True
# ------------------------


def encode_lines(lines, encode, eos_id=None):
    """Turn raw text lines into JSON-serialisable token records.

    Args:
        lines: Iterable of strings (e.g. an open text file). Each line is
            stripped of surrounding whitespace; lines that are empty after
            stripping are skipped.
        encode: Callable mapping one stripped line to a sequence of int ids.
        eos_id: Token id appended to every encoded line, or None to append
            nothing.

    Yields:
        One ``{"input_ids": [...]}`` dict per non-blank line, in input order.
    """
    for raw_line in lines:
        text = raw_line.strip()
        if not text:
            continue

        token_ids = list(encode(text))
        if eos_id is not None:
            token_ids.append(eos_id)

        yield {"input_ids": token_ids}


# Jupyter executes cells with __name__ == "__main__", so this section runs as
# usual in the notebook; importing the code elsewhere only defines the helper.
if __name__ == "__main__":
    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would silently skip these checks.
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(f"Tokenizer not found: {MODEL_PATH}")
    if not os.path.exists(INPUT_TXT):
        raise FileNotFoundError(f"Input file not found: {INPUT_TXT}")

    # Load tokenizer
    sp = spm.SentencePieceProcessor()
    sp.load(MODEL_PATH)

    eos_id = sp.eos_id()

    print("Tokenizer loaded")
    print(f"Vocab size: {sp.get_piece_size()}")
    print(f"EOS id: {eos_id}")

    # An EOS id of -1 is treated as "this tokenizer has no EOS token".
    if ADD_EOS and eos_id == -1:
        raise ValueError("Tokenizer has no EOS token!")

    with open(INPUT_TXT, "r", encoding="utf-8") as fin, \
         open(OUTPUT_JSONL, "w", encoding="utf-8") as fout:

        # tqdm wraps the raw file so the progress bar counts every input
        # line, including the blank ones that produce no record.
        records = encode_lines(
            tqdm(fin, desc="Tokenizing"),
            lambda text: sp.encode(text, out_type=int),
            eos_id=eos_id if ADD_EOS else None,
        )

        # One JSON object per line (JSONL).
        for record in records:
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    print("Tokenization completed.")
    print(f"Output written to: {OUTPUT_JSONL}")

Tokenizer loaded
Vocab size: 32000
EOS id: 3


Tokenizing: 13it [00:00, 51102.11it/s]

Tokenization completed.
Output written to: tokenized_output.jsonl





In [3]:
import json
import torch
import random
from tqdm import tqdm

# -------- CONFIG --------
TOKENIZED_JSONL = "tokenized_output.jsonl"
OUTPUT_PT = "train_blocks.pt"
CONTEXT_LEN = 32
SHUFFLE = True
SEED = 42
# ------------------------


def build_blocks(token_stream, context_len):
    """Cut a flat token stream into fixed-size next-token-prediction blocks.

    The stream is split into consecutive, non-overlapping chunks of
    ``context_len`` tokens; trailing tokens that do not fill a whole chunk are
    dropped (no padding). Within each chunk the inputs are all tokens but the
    last and the labels are all tokens but the first.

    Note: each block therefore holds ``context_len - 1`` (input, label) pairs,
    and the first token of every chunk is never used as a label.

    Args:
        token_stream: Sequence of int token ids.
        context_len: Number of tokens per chunk; must be at least 2 so that a
            chunk yields at least one (input, label) pair.

    Returns:
        List of ``{"input_ids": LongTensor, "labels": LongTensor}`` dicts in
        stream order; empty if the stream is shorter than ``context_len``.

    Raises:
        ValueError: If ``context_len`` is smaller than 2.
    """
    if context_len < 2:
        raise ValueError(f"CONTEXT_LEN must be >= 2, got {context_len}")

    usable_len = (len(token_stream) // context_len) * context_len

    blocks = []
    for start in range(0, usable_len, context_len):
        chunk = token_stream[start:start + context_len]
        blocks.append({
            "input_ids": torch.tensor(chunk[:-1], dtype=torch.long),
            "labels": torch.tensor(chunk[1:], dtype=torch.long),
        })
    return blocks


# Jupyter executes cells with __name__ == "__main__", so this section runs as
# usual in the notebook; importing the code elsewhere only defines the helper.
if __name__ == "__main__":
    random.seed(SEED)

    print("Reading tokenized dataset...")

    token_stream = []

    with open(TOKENIZED_JSONL, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Loading"):
            # Skip blank lines rather than letting json.loads fail on them.
            if not line.strip():
                continue
            record = json.loads(line)
            token_stream.extend(record["input_ids"])

    print(f"Total tokens: {len(token_stream):,}")

    # Build blocks
    blocks = build_blocks(token_stream, CONTEXT_LEN)

    # Drop remainder to avoid padding
    token_stream = token_stream[:len(blocks) * CONTEXT_LEN]

    print(f"Usable tokens: {len(token_stream):,}")
    print(f"Total blocks: {len(blocks):,}")

    # Fail loudly instead of silently saving an empty dataset.
    if not blocks:
        raise ValueError(
            f"Not enough tokens to fill a single block of {CONTEXT_LEN} tokens"
        )

    if SHUFFLE:
        print("Shuffling blocks...")
        random.shuffle(blocks)

    print("Saving dataset...")
    torch.save(blocks, OUTPUT_PT)

    print("Done.")
    print(f"Saved to: {OUTPUT_PT}")

Reading tokenized dataset...


Loading: 13it [00:00, 186095.40it/s]

Total tokens: 226
Usable tokens: 224
Total blocks: 7
Shuffling blocks...
Saving dataset...
Done.
Saved to: train_blocks.pt



