In [1]:
from transformers import AutoTokenizer

In [2]:
# Sample sentence reused by the tokenization cells that follow.
sen = '我爱北京天安门'

In [3]:
# Load the tokenizer published with this checkpoint. AutoTokenizer picks the
# concrete class; the repr displayed by this cell shows BertTokenizerFast.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
# Optional: persist the tokenizer files to a local directory (left disabled).
# tokenizer.save_pretrained("./roberta_tokenizer")

In [5]:
# Split the sentence into token strings (not ids); for this input every
# Chinese character becomes its own token.
tokens = tokenizer.tokenize(sen)
tokens

['我', '爱', '北', '京', '天', '安', '门']

In [6]:
# Display the token -> id mapping.
# NOTE: this shows the whole dictionary, so the cell output is very long;
# slice it if only a sample is needed.
tokenizer.vocab

{'贈': 6557,
 'p10': 10405,
 '濠': 4090,
 'no1': 11448,
 '为': 711,
 '##瓷': 17544,
 '劍': 1210,
 'low': 10611,
 '佰': 880,
 '##擒': 16141,
 '亮': 778,
 '##亞': 13822,
 '暂': 3257,
 '##哼': 14587,
 '##牡': 17342,
 '衾': 6143,
 '##諸': 19385,
 '##塚': 14909,
 '狄': 4313,
 '##菲': 18895,
 '瞳': 4749,
 '##卟': 14360,
 '##痺': 17649,
 'today': 11262,
 '娠': 2027,
 '燃': 4234,
 '##豆': 19543,
 'tb': 11456,
 '##db': 9123,
 '265': 8689,
 '琼': 4437,
 '按': 2902,
 '歪': 3639,
 'wei': 11875,
 '##room': 12193,
 '脣': 5560,
 '册': 1085,
 '骷': 7759,
 '娥': 2029,
 'fintech': 12234,
 'はい': 9781,
 '##絮': 18242,
 '##橹': 16644,
 '阁': 7323,
 '₄': 358,
 '##纪': 18336,
 '##岑': 15320,
 '##ふ': 13671,
 '襁': 6197,
 '锻': 7248,
 '謎': 6336,
 '##穌': 18004,
 '鲁': 7826,
 '01': 8146,
 '##怜': 15646,
 '阿': 7350,
 '##裏': 19223,
 '##駐': 20745,
 '##ico': 10641,
 '##甕': 17547,
 '着': 4708,
 '邃': 6916,
 '貌': 6505,
 'psp': 11337,
 '##腿': 18654,
 '##漢': 17088,
 '##笹': 18076,
 'inparadise': 12484,
 '鸾': 7895,
 'be': 8815,
 'match': 12528,
 '##台': 14435,
 '

In [7]:
# Size of the vocabulary as reported by the tokenizer.
tokenizer.vocab_size

21128

In [8]:
# Look up the vocabulary id of each token.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2769, 4263, 1266, 776, 1921, 2128, 7305]

In [9]:
# Inverse lookup: map the ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['我', '爱', '北', '京', '天', '安', '门']

In [10]:
# Join the tokens back into a single string. Note that the result is
# space-separated rather than identical to the original sentence.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'我 爱 北 京 天 安 门'

# Encode and Decode

In [11]:
# One-step text -> ids. With add_special_tokens=True the sequence is wrapped
# in [CLS] (id 101) and [SEP] (id 102).
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 2769, 4263, 1266, 776, 1921, 2128, 7305, 102]

In [12]:
# One-step ids -> text. skip_special_tokens=False keeps [CLS]/[SEP] in the
# decoded string.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 我 爱 北 京 天 安 门 [SEP]'

# Padding and truncation

> padding

In [13]:
# Pad on the right with [PAD] (id 0) until the sequence is max_length long.
ids = tokenizer.encode(sen, padding='max_length', max_length=15)
ids

[101, 2769, 4263, 1266, 776, 1921, 2128, 7305, 102, 0, 0, 0, 0, 0, 0]

> truncation

In [14]:
# Cut the sequence down to max_length ids; [CLS] and [SEP] count toward
# that limit.
ids = tokenizer.encode(sen, truncation=True, max_length=5)
ids

[101, 2769, 4263, 1266, 102]

# Other

In [15]:
# encode_plus returns a dict-like result with input_ids, token_type_ids and
# attention_mask rather than a bare id list, so `ids` holds the full encoding here.
# NOTE(review): the transformers docs mark encode_plus as deprecated in favour of
# calling the tokenizer directly — confirm for the installed version.
ids = tokenizer.encode_plus(sen, padding='max_length', max_length=15)
ids

{'input_ids': [101, 2769, 4263, 1266, 776, 1921, 2128, 7305, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

In [16]:
# Calling the tokenizer directly yields the same three fields; attention_mask
# is 1 for real tokens and 0 for padding positions.
inputs = tokenizer(sen, padding='max_length', max_length=15, truncation=True)
inputs

{'input_ids': [101, 2769, 4263, 1266, 776, 1921, 2128, 7305, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

# Batch data

In [17]:
# Encode a batch of sentences in a single call. No padding is requested, so
# every row keeps its own length.
sens = [
    '我爱北京天安门',
    '我爱上海东方明珠',
    '我爱广州塔',
    '我爱深圳平安大厦',
    '我爱成都大熊猫基地',
]
res = tokenizer(sens)
# Show the three fields together as one tuple.
tuple(res[field] for field in ('input_ids', 'token_type_ids', 'attention_mask'))

([[101, 2769, 4263, 1266, 776, 1921, 2128, 7305, 102],
  [101, 2769, 4263, 677, 3862, 691, 3175, 3209, 4403, 102],
  [101, 2769, 4263, 2408, 2336, 1849, 102],
  [101, 2769, 4263, 3918, 1766, 2398, 2128, 1920, 1336, 102],
  [101, 2769, 4263, 2768, 6963, 1920, 4220, 4344, 1825, 1765, 102]],
 [[0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

# Fast/Slow Tokenizer

In [18]:
# New sample mixing Chinese and English text; rebinds `sen` for the cells below.
sen = '最渺小的我有大大的Dreaming!'

In [19]:
# use_fast=True requests the fast implementation; the repr displayed by this
# cell shows BertTokenizerFast with is_fast=True.
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=True)
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [20]:
# use_fast=False requests the slow implementation; the repr displayed by this
# cell shows BertTokenizer with is_fast=False.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [21]:
%%time
# Encode the sentence 10,000 times, one call per iteration, with the fast tokenizer.
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 272 ms, sys: 3.77 ms, total: 276 ms
Wall time: 276 ms


In [22]:
%%time
# Same per-call loop as the previous cell, using the slow tokenizer for comparison.
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 616 ms, sys: 5.55 ms, total: 621 ms
Wall time: 623 ms


In [23]:
%%time
# Same 10,000 sentences, passed to the fast tokenizer as one batched call.
# NOTE: the full batch encoding is also displayed as the cell result.
fast_tokenizer([sen]*10000)

CPU times: user 333 ms, sys: 19.6 ms, total: 353 ms
Wall time: 63.9 ms


{'input_ids': [[101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 1

In [24]:
%%time
# Same 10,000 sentences, passed to the slow tokenizer as one batched call.
# NOTE: the full batch encoding is also displayed as the cell result.
slow_tokenizer([sen]*10000)

CPU times: user 600 ms, sys: 6.02 ms, total: 606 ms
Wall time: 608 ms


{'input_ids': [[101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 10252, 8221, 106, 102], [101, 3297, 3953, 2207, 4638, 2769, 3300, 1920, 1920, 4638, 1

In [25]:
# Request the character span of every token. The output lists one
# (start, end) pair per token, with (0, 0) at the [CLS]/[SEP] positions.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs['offset_mapping']

[(0, 0),
 (0, 1),
 (1, 2),
 (2, 3),
 (3, 4),
 (4, 5),
 (5, 6),
 (6, 7),
 (7, 8),
 (8, 9),
 (9, 14),
 (14, 17),
 (17, 18),
 (0, 0)]

In [26]:
# Index of the source word for each token: None at the special-token positions,
# and the two pieces of "Dreaming" share index 9.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, None]

# Special Tokenizers

In [27]:
from transformers import AutoTokenizer

In [28]:
# trust_remote_code=True executes Python code downloaded from the model repo
# (the custom SkyworkTokenizer class). Pin the exact commit this notebook was
# run against so a later change to the repo cannot silently alter what runs.
tokenizer = AutoTokenizer.from_pretrained(
    "Skywork/Skywork-13B-base",
    trust_remote_code=True,
    revision="bc35915066fbbf15b77a1a4a74e9b574ab167816",
)
tokenizer

You are using the legacy behaviour of the <class 'transformers_modules.Skywork.Skywork-13B-base.bc35915066fbbf15b77a1a4a74e9b574ab167816.tokenization_skywork.SkyworkTokenizer'>. This means that tokens that come after special tokens will not be properly handled. 


SkyworkTokenizer(name_or_path='Skywork/Skywork-13B-base', vocab_size=65519, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [29]:
# Optional: save the tokenizer locally and reload it from that directory (left disabled).
# tokenizer.save_pretrained("skywork_tokenizer")
# tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)

In [30]:
# Round-trip the sentence through the Skywork tokenizer; the decoded text keeps
# the leading <s> token and has no spaces inserted between characters.
tokenizer.decode(tokenizer.encode(sen))

'<s>最渺小的我有大大的Dreaming!'