In [1]:
# 导入库
from transformers import AutoTokenizer

In [2]:
# Load the Qwen tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", use_fast=False)

In [3]:
# Inspect the size of the Qwen tokenizer's base vocabulary
tokenizer.vocab_size

151643

In [4]:
# ID of the padding token type in the vocabulary
tokenizer.pad_token_type_id

0

In [5]:
# Dictionary mapping the special-token class attributes to their values
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [6]:
# List of the special tokens
tokenizer.all_special_tokens

['<|im_end|>',
 '<|endoftext|>',
 '<|im_start|>',
 '<|object_ref_start|>',
 '<|object_ref_end|>',
 '<|box_start|>',
 '<|box_end|>',
 '<|quad_start|>',
 '<|quad_end|>',
 '<|vision_start|>',
 '<|vision_end|>',
 '<|vision_pad|>',
 '<|image_pad|>',
 '<|video_pad|>']

In [7]:
# List the IDs of the special tokens that are mapped to class attributes
tokenizer.all_special_ids

[151645,
 151643,
 151644,
 151646,
 151647,
 151648,
 151649,
 151650,
 151651,
 151652,
 151653,
 151654,
 151655,
 151656]

In [8]:
# Maximum length of a single sentence that can be fed to the model
tokenizer.max_len_single_sentence

131072

In [9]:
# Show the vocabulary as a token -> id mapping (the recorded output is cut off).
# NOTE(review): this displays the whole vocabulary; consider showing only a small slice.
tokenizer.vocab

{'Ġplaster': 61927,
 'Saint': 56375,
 '.Role': 35955,
 'ladÄ±ÄŁÄ±': 133786,
 'á»¹': 125093,
 'çĶµåĬ¨': 100931,
 'ëª»': 128186,
 'ĠUIStoryboardSegue': 69160,
 '.crt': 93869,
 'Ġabdomen': 63672,
 'æŁĬ': 122434,
 'Ġkapsam': 135700,
 'è¯ĬæĸŃ': 105262,
 'ðŁĺł': 145932,
 'ĠDISPATCH': 94581,
 'Ġaims': 21538,
 'ä¿ĥè¿ĽäºĨ': 109357,
 'ä¸įæĩĪ': 103749,
 'ä¸īæĺŁ': 106258,
 'èĶº': 121373,
 'InstanceOf': 15040,
 '_from': 5673,
 'å¸ħåĵ¥': 117637,
 '<lemma': 23543,
 'Ġbuff': 11522,
 '_targets': 41997,
 'åıĳæĶ¾': 104752,
 'âĢĶas': 59554,
 'ë©İ': 146213,
 'ĠJDK': 97425,
 'Ġcredibility': 37669,
 'abbr': 71276,
 'CLUSIVE': 54036,
 'ÙĬÙĦÙĬ': 132063,
 'ãĤ¤ãĥ³ãĥī': 141143,
 'è¿ĺåĮħæĭ¬': 114735,
 'AppBundle': 72888,
 'æ´ļ': 122528,
 'ìĽĮ': 130109,
 'Ġoutros': 56873,
 'Ġqueued': 57163,
 'å°´': 102731,
 '.news': 44503,
 '.side': 48631,
 'Ø§Ùĥ': 128559,
 'ç¬ĳè¯Ŀ': 109959,
 '_versions': 65148,
 'Ġfalsehood': 95018,
 'Ġwines': 42755,
 '(mon': 64518,
 'Ġhatten': 83572,
 'å°Ĩ': 44063,
 'Ġpathetic': 66063,
 'ĠÐĽ': 12

In [10]:
# [PAD] padding token
tokenizer.pad_token

'<|endoftext|>'

In [11]:
# [CLS] start token (no value is displayed in the recorded run — presumably unset for this tokenizer)
tokenizer.cls_token

In [12]:
# [SEP] separator token (no value is displayed in the recorded run — presumably unset for this tokenizer)
tokenizer.sep_token

In [13]:
# [UNK] unknown token (no value is displayed in the recorded run — presumably unset for this tokenizer)
tokenizer.unk_token

In [14]:
# [MASK] mask token (no value is displayed in the recorded run — presumably unset for this tokenizer)
tokenizer.mask_token

In [15]:
# Split a sample text into tokens and print them
text = "你好 Qwen"
tokens = tokenizer.tokenize(text)
print(tokens)

['ä½łå¥½', 'ĠQ', 'wen']


In [16]:
# Map the tokens from the previous step to their vocabulary ids
input_ids = tokenizer.convert_tokens_to_ids(tokens) 
print(input_ids)

[108386, 1207, 16948]


In [19]:
# Encoding: turn the text into token ids directly with the tokenizer
input_ids = tokenizer.encode(text, add_special_tokens=True)
print(input_ids)

[108386, 1207, 16948]


In [24]:
# encode_plus with return_tensors="pt": the printed result holds
# `input_ids` and `attention_mask` tensors
encoded_inputs = tokenizer.encode_plus(text, return_tensors="pt")
print(encoded_inputs)

{'input_ids': tensor([[108386,   1207,  16948]]), 'attention_mask': tensor([[1, 1, 1]])}


In [26]:
# Batch tokenization + encoding of several sentences in one call
sentences = ["Hello, Qwen!", "你好，千问!"]
batch_inputs = tokenizer.batch_encode_plus(sentences, padding=True, truncation=True)
print(batch_inputs)

{'input_ids': [[9707, 11, 1207, 16948, 0], [108386, 3837, 99320, 56007, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}


In [28]:
# Tokenize the list of sentences in one call.
# NOTE(review): in the recorded output the result is a single flat token list
# covering both sentences, not one token list per sentence.
batch_tokens = tokenizer.tokenize(sentences)
print(batch_tokens)

['Hello', ',', 'ĠQ', 'wen', '!', 'ä½łå¥½', 'ï¼Į', 'åįĥ', 'éĹ®', '!']


```python
def encode(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = None,
    max_length: Optional[int] = None,
    stride: int = 0,
    padding_side: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> List[int]:
    """
    Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.

    Same as doing `self.convert_tokens_to_ids(self.tokenize(text))`.

    Args:
        text (`str`, `List[str]` or `List[int]`):
            The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
            the `tokenize` method) or a list of integers (tokenized string ids using the
            `convert_tokens_to_ids` method).
        text_pair (`str`, `List[str]` or `List[int]`, *optional*):
            Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string
            using the `tokenize` method) or a list of integers (tokenized string ids using the
            `convert_tokens_to_ids` method).
        add_special_tokens, padding, truncation, max_length, stride, padding_side, return_tensors, **kwargs:
            Passed through unchanged to `self.encode_plus`.

    Returns:
        The `"input_ids"` entry of the value returned by `self.encode_plus`.
    """
    # All the work is delegated to encode_plus; only the ids are kept.
    encoded_inputs = self.encode_plus(
        text,
        text_pair=text_pair,
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        padding_side=padding_side,
        return_tensors=return_tensors,
        **kwargs,
    )

    return encoded_inputs["input_ids"]
```

encode 函数在内部调用了 encode_plus 函数；其功能可以理解为：`encode` 函数 = `tokenize()` 函数 + `convert_tokens_to_ids()` 函数。

In [35]:
# Round trip: text -> tokens -> token ids, then ids -> tokens and ids -> text
text = "你好，Qwen！"
# Step by step: tokenize first, then map the tokens to ids
tokens = tokenizer.tokenize(text)
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("tokenize: ", tokens)
print("convert tokens to ids: ", tokens_ids)
# Calling the tokenizer directly; the printed result holds `input_ids` and `attention_mask`
encoded_inputs = tokenizer(text)
print("encoed text: ", encoded_inputs)
# Reverse direction: ids back to tokens
reversed_tokens = tokenizer.convert_ids_to_tokens(tokens_ids)
print("convert ids to tokens: ", reversed_tokens)
# Reverse direction: ids back to readable text
decoded_inputs = tokenizer.decode(tokens_ids)
print("decode text: ", decoded_inputs)


tokenize:  ['ä½łå¥½', 'ï¼Į', 'Q', 'wen', 'ï¼ģ']
convert tokens to ids:  [108386, 3837, 48, 16948, 6313]
encoed text:  {'input_ids': [108386, 3837, 48, 16948, 6313], 'attention_mask': [1, 1, 1, 1, 1]}
convert ids to tokens:  ['ä½łå¥½', 'ï¼Į', 'Q', 'wen', 'ï¼ģ']
decode text:  你好，Qwen！


In [31]:
# Decoding: token ids -> natural language
# (`input_ids` comes from an earlier encoding cell)
decoded_inputs = tokenizer.decode(input_ids)
print(decoded_inputs)

你好 Qwen


In [32]:
# Batch decoding? `decode` is given the nested list of id sequences from
# `batch_encode_plus`; in the recorded run this raised a TypeError.
# The error is caught and printed so the notebook still runs top to bottom.
# For a list of id sequences use `tokenizer.batch_decode` instead.
try:
    decoded_inputs = tokenizer.decode(batch_inputs['input_ids'])
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [36]:
# Batch decoding is built in: `batch_decode` takes a list of id sequences
# and returns one decoded string per sequence, so no hand-written loop over
# `decode` is needed.
for decoded_inputs in tokenizer.batch_decode(batch_inputs['input_ids']):
    print(decoded_inputs)

Hello, Qwen!
你好，千问!


In [38]:
# Add new tokens to the tokenizer; the return value of `add_tokens` is shown as the cell output
new_tokens = ["[NEW_TOKEN]"]
tokenizer.add_tokens(new_tokens)

1