In [1]:
import sentencepiece as spm

In [2]:
# Load the `en` SentencePiece model file from the current working directory.
en_model = spm.SentencePieceProcessor(model_file="./en.model")

In [3]:
# Size of the loaded model as reported by len() (number of pieces in its vocabulary).
len(en_model)

16384

In [4]:
# Gather the model's special-token ids in the order: unk, pad, bos, eos.
special_ids = [
    lookup()
    for lookup in (en_model.unk_id, en_model.pad_id, en_model.bos_id, en_model.eos_id)
]

In [5]:
# Show the collected ids (order: unk, pad, bos, eos).
special_ids

[0, -1, 1, 2]

In [6]:
# Round-trip sanity check: encode a sentence to ids, then decode the ids back to text.
en_model.decode(en_model.encode("The quick brown fox jumped over the lazy dog."))

'The quick brown fox jumped over the lazy dog.'

In [7]:
class Tokeniser(object):
    def __init__(self, lang: str):
        self.model = spm.SentencePieceProcessor(model_file=f'./{lang}.model')
        self.special_ids = [
            self.model.unk_id(),
            self.model.pad_id(),
            self.model.bos_id(),
            self.model.eos_id()
        ]

    def __len__(self):
        return len(self.model)
    
    def encode_batch(self, sents: list[str], pad_len = None, truncate_len=None):
        return [self.encode(sent, pad_len, truncate_len) for sent in sents]

    def encode(self, sent: str | list[str], pad_len = None, truncate_len=None):
        if type(sent) == list:
            return self.encode_batch(sent, pad_len, truncate_len)
        ids = self.model.encode(sent)
        if pad_len is not None and len(ids) < int(pad_len):
            ids = [*ids, *([self.model.pad_id()] * (int(pad_len) - len(ids)))]
        elif truncate_len is not None and len(ids) > int(truncate_len):
            ids = ids[:int(truncate_len)]
        return ids

    
    def decode(self, ids: list[int]):
        return self.model.decode(list(filter(lambda id: id >= 0 and id < len(self), ids)))
    

In [8]:
# Build one Tokeniser per language model file (./en.model and ./zh.model).
# NOTE: this rebinds `en_model`, which earlier in the notebook referred to the
# raw SentencePieceProcessor; from here on it is a Tokeniser wrapper.
en_model = Tokeniser('en')

zh_model = Tokeniser('zh')

In [9]:
# Tokeniser.__len__ forwards to the wrapped model's len().
len(en_model)

16384

In [10]:
# Same check for the `zh` tokeniser.
len(zh_model)

16384

In [11]:
# Encode one sentence; pad_len=128 right-pads the id list with the model's pad id
# when the encoding is shorter than 128 ids.
ids = en_model.encode("I want to sleep.", pad_len=128)

In [12]:
# Tokeniser.decode filters out ids outside [0, len(en_model)) before decoding,
# so the padded id list decodes back to the bare sentence.
en_model.decode(ids)

'I want to sleep.'

In [13]:
from tokeniser import BaseBPETokeniser

In [14]:
# Instantiate BaseBPETokeniser (imported from the `tokeniser` module) with its defaults.
tokeniser = BaseBPETokeniser()


In [17]:
# Tokenise a source sentence together with its target text, with max_len=128.
# NOTE(review): execution counts jump from 14 to 17 before this cell — re-run from
# a fresh kernel to confirm no deleted cell is required.
# NOTE(review): in the recorded output, input_ids is filled with a repeated id 3
# after the first five ids (presumably padding — confirm), while the visible part
# of attention_mask is 1 at those positions too; verify the mask is as intended.
inputs = tokeniser('I want to sleep.', text_target="我要睡觉。", max_len=128)
inputs

{'input_ids': [36,
  266,
  25,
  2330,
  16259,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [18]:
# Decode the target-side ids back to text. Assumes the tokeniser call returned a
# 'labels' entry for `text_target` — the recorded output above is truncated before
# that key, so confirm against BaseBPETokeniser.
tokeniser.decode(inputs['labels'])

'我要睡觉。'