In [1]:
import sentencepiece as spm

# Load the trained unigram tokenizer model from disk.
sp = spm.SentencePieceProcessor()
sp.load("tokenizer/unigram_32000_0.9995.model")

# Label/value pairs for the special tokens, printed one per line below.
special_ids = (
    ("MASK id:", sp.piece_to_id("[MASK]")),
    ("UNK  id:", sp.unk_id()),
    ("PAD  id:", sp.pad_id()),
    ("BOS  id:", sp.bos_id()),
    ("EOS  id:", sp.eos_id()),
)
for label, token_id in special_ids:
    print(label, token_id)

# Tokenize a sample sentence containing [MASK] and show the resulting pieces,
# to see how the mask token is segmented.
sample_pieces = sp.encode("මෙය [MASK] පරීක්ෂණයකි", out_type=str)
print(sample_pieces)

MASK id: 4
UNK  id: 1
PAD  id: 0
BOS  id: 2
EOS  id: 3
['▁මෙය', '▁', '[MASK]', '▁පරීක්ෂණය', 'කි']


In [2]:
import sentencepiece as spm

# Load the trained tokenizer so its special-token ids can be inspected.
sp = spm.SentencePieceProcessor()
sp.load("tokenizer/unigram_32000_0.9995.model")

# Check current IDs
print("PAD:", sp.pad_id())
print("UNK:", sp.unk_id())
print("BOS:", sp.bos_id())
print("EOS:", sp.eos_id())
print("[MASK] current id:", sp.piece_to_id("[MASK]"))

# Fallback id to use for [MASK] when the model has no id of its own for it.
mask_token = "[MASK]"
new_mask_id = 4

# The comparison below relies on piece_to_id() answering with the UNK id for a
# piece that is not in the vocabulary; in that case [MASK] is indistinguishable
# from UNK and a separate id has to be chosen.
if sp.piece_to_id(mask_token) == sp.unk_id():
    print("Conflict detected. Remapping MASK id to", new_mask_id)

    # Snapshot of the vocabulary, indexed by piece id.
    vocab = [sp.id_to_piece(i) for i in range(sp.get_piece_size())]

    if new_mask_id >= len(vocab):
        # The fallback id lies beyond the existing vocabulary, so it is free;
        # record the mask piece at the end of the snapshot.
        vocab.append(mask_token)
    elif vocab[new_mask_id] != mask_token:
        # The fallback id is already owned by another piece. The loaded model
        # itself is not modified here, so using this id for [MASK] makes masked
        # positions look identical to that piece. Surface this instead of
        # continuing silently.
        print(
            f"WARNING: id {new_mask_id} already belongs to piece "
            f"{vocab[new_mask_id]!r}; using it for {mask_token} is ambiguous. "
            "Retrain the tokenizer with the mask symbol or pick an unused id."
        )

    # SentencePiece ids are not changed by this cell; the chosen id only takes
    # effect where the dataset creation code substitutes it for masked positions.
    mask_id = new_mask_id
else:
    # The model already has a dedicated id for [MASK]; use it as is.
    mask_id = sp.piece_to_id(mask_token)

print("Final MASK id used:", mask_id)


PAD: 0
UNK: 1
BOS: 2
EOS: 3
[MASK] current id: 4
Final MASK id used: 4


In [3]:
import sentencepiece as spm
# Reload the tokenizer model and summarize the ids relevant to masking.
sp = spm.SentencePieceProcessor()
sp.load('tokenizer/unigram_32000_0.9995.model')

# Look the values up first, then report them one per line.
unk_piece_id = sp.piece_to_id('<unk>')
mask_piece_id = sp.piece_to_id('[MASK]')
total_pieces = sp.vocab_size()

print(f"<unk> ID: {unk_piece_id}")
print(f"[MASK] ID: {mask_piece_id}")
print(f"Vocab size: {total_pieces}")

<unk> ID: 1
[MASK] ID: 4
Vocab size: 32000
