In [1]:
!pip install sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   --------------------- ------------------ 524.3/992.0 kB 4.2 MB/s eta 0:00:01
   ------------------------------- -------- 786.4/992.0 kB 4.8 MB/s eta 0:00:01
   ---------------------------------------- 992.0/992.0 kB 1.7 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [9]:
import sentencepiece as spm

# Train a subword model on the Vietnamese training text. The demo cell further down
# loads "vi_bpe.model", i.e. the file produced under this model_prefix.
spm.SentencePieceTrainer.Train(
    input="data/train.vi.txt",
    model_prefix="vi_bpe",
    model_type="bpe",    # or "unigram"
    vocab_size=8000,     # or 16k / 32k, depending on the size of the data
    # Reserved ids for the special tokens.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    # Optional extra symbols.
    # NOTE(review): BOS/EOS/PAD already receive ids above; presumably "<sos>" and "<eos>"
    # become additional, separate vocabulary entries here -- confirm they are needed.
    user_defined_symbols=["<sos>", "<eos>", "<pad>"],
)



In [10]:

class SubwordTokenizer:
    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_path)

        # ID các token đặc biệt
        self.pad_id = self.sp.pad_id()
        self.unk_id = self.sp.unk_id()
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()

    def encode(self, text, add_special_tokens=True, max_len=None):
        ids = self.sp.encode(text, out_type=int)
        if add_special_tokens:
            ids = [self.bos_id] + ids + [self.eos_id]
        if max_len:
            ids = ids[:max_len]
        return ids

    def decode(self, ids, skip_special_tokens=True):
        if skip_special_tokens:
            ids = [i for i in ids if i not in [self.pad_id, self.bos_id, self.eos_id]]
        return self.sp.decode(ids)

    def tokenize(self, text):
        return self.sp.encode(text, out_type=str)

    def vocab_size(self):
        return self.sp.get_piece_size()

    def pad_sequence(self, ids, max_len):
        if len(ids) > max_len:
            return ids[:max_len]
        return ids + [self.pad_id] * (max_len - len(ids))


In [11]:
# Smoke test: load the trained model and run one sentence through it.
tokenizer = SubwordTokenizer("vi_bpe.model")

text = "Xin chào các bạn khỏe không?"

# Subword pieces as strings, then the id sequence (encode() adds BOS/EOS by default).
tokens = tokenizer.tokenize(text)
ids = tokenizer.encode(text)

print("Token IDs:", ids)
print("Tokens:", tokens)
# decode() strips PAD/BOS/EOS ids by default before decoding.
print("Decoded:", tokenizer.decode(ids))


Token IDs: [2, 1066, 1846, 58, 73, 25, 7943, 7906, 62, 7946, 3]
Tokens: ['▁Xin', '▁chào', '▁các', '▁bạn', '▁kh', 'ỏ', 'e', '▁không', '?']
Decoded: Xin chào các bạn khỏe không?
