# Learning Tokenizers

In [1]:
from transformers import AutoTokenizer

# Initialize a tokenizer from the pretrained checkpoint id
# "google-bert/bert-base-cased"; the cells below use this global `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [2]:
# Encode a single sentence. The printed result contains three parallel lists:
# input_ids, token_type_ids and attention_mask.
encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input)

{'input_ids': [101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [3]:
# Map the ids back to text; the decoded string shows the [CLS] / [SEP]
# markers that the tokenizer wrapped around the sentence.
tokenizer.decode(encoded_input["input_ids"])


'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'

In [4]:
# Same sentence, but starting with the contraction "Don't": in the recorded
# output it is split into three ids (1790, 112, 189).
encoded_input = tokenizer("Don't meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input)

{'input_ids': [101, 1790, 112, 189, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [5]:
# Decoding does not restore the original spacing: "Don't" comes back as
# "Don ' t" in the recorded output.
tokenizer.decode(encoded_input["input_ids"])


"[CLS] Don ' t meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]"

In [6]:
# Drop the leading "Do not" to check that the remaining words keep the same
# ids as in the full sentence.
encoded_input = tokenizer("meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input)

{'input_ids': [101, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
# Round-trip check for the shortened sentence.
tokenizer.decode(encoded_input["input_ids"])

'[CLS] meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'

In [8]:
# Decode one id on its own: 1116 is the sub-word piece "##s". It follows
# 16678 in the encodings above, so it is apparently the suffix of "wizards".
tokenizer.decode([1116])

'##s'

In [9]:
# A batch of sentences (list of str) to be encoded together; the sentences
# have different lengths, which matters for the padding examples that use it.
batch_sentences = [ 
    "Do not meddle in the affairs of wizards, for they are subtle and quick to anger.",
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]

In [10]:
# Encode the whole batch in one call. Without padding, every sequence in the
# output keeps its own length (nested lists of different sizes).
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], [101, 1252, 1184, 1164, 1248, 6462, 136, 102], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], [101, 1327, 1164, 5450, 23434, 136, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


In [11]:
# padding=True: in the recorded output the shorter sequences are filled with
# id 0 up to the length of the longest sentence, and the matching
# attention_mask positions are 0.
encoded_inputs = tokenizer(batch_sentences, padding=True)
print(encoded_inputs)

{'input_ids': [[101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], [101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
# Additionally enable truncation. The visible part of the recorded output is
# the same as for the padding-only call, presumably because no sentence is
# long enough to be cut. TODO: confirm by passing an explicit max_length.
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)
print(encoded_inputs)

{'input_ids': [[101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], [101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
# Chinese input through the English (cased) BERT tokenizer: the three Chinese
# characters all come out as id 100 in the recorded output.
encoded_inputs = tokenizer("我爱你！")
print(encoded_inputs)

{'input_ids': [101, 100, 100, 100, 1096, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [14]:
# Decoding shows that id 100 is [UNK]; only the full-width "！" survived.
tokenizer.decode(encoded_inputs["input_ids"])

'[CLS] [UNK] [UNK] [UNK] ！ [SEP]'

In [15]:
# A date string mixing Chinese characters and digits, still using the English
# cased BERT tokenizer.
encoded_inputs = tokenizer("今天是2025年3月18日")
print(encoded_inputs)

{'input_ids': [101, 100, 1010, 100, 17881, 1571, 1026, 124, 1037, 1407, 1033, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
# The digits and some characters (天, 年, 月, 日) decode correctly; the other
# characters decode to [UNK].
tokenizer.decode(encoded_inputs["input_ids"])

'[CLS] [UNK] 天 [UNK] 2025 年 3 月 18 日 [SEP]'

### bert-base-chinese分词器

In [35]:
# Initialize the Chinese tokenizer of the BERT family. This rebinds the global
# `tokenizer`, so the cells below no longer use the cased English one.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [36]:
# Same Chinese sentence as tried with the English tokenizer; here every
# character gets its own id (none of them is 100).
encoded_inputs = tokenizer("我爱你！")
print(encoded_inputs)

{'input_ids': [101, 2769, 4263, 872, 8013, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [37]:
# The decoded text has one character per token, separated by spaces, between
# [CLS] and [SEP].
tokenizer.decode(encoded_inputs["input_ids"])

'[CLS] 我 爱 你 ！ [SEP]'

In [38]:
# Date string again. Comparing the ids with the decoded text, "2025" and "18"
# each map to a single id (8950 and 8123).
encoded_inputs = tokenizer("今天是2025年3月18日")
print(encoded_inputs)

{'input_ids': [101, 791, 1921, 3221, 8950, 2399, 124, 3299, 8123, 3189, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [21]:
# Round-trip check: every piece is recovered, with spaces between tokens.
tokenizer.decode(encoded_inputs["input_ids"])

'[CLS] 今 天 是 2025 年 3 月 18 日 [SEP]'

In [39]:
# A Chinese numeral string: the five characters become five ids between the
# leading 101 and trailing 102.
encoded_inputs = tokenizer("九百二十三")
print(encoded_inputs)

{'input_ids': [101, 736, 4636, 753, 1282, 676, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [41]:
# Index 1, not 0: position 0 holds id 101 (the leading [CLS] marker), so the
# first real character sits at index 1.
tokenizer.decode(encoded_inputs['input_ids'][1])

'九'

### Qwen2.5-7B-Instruct

In [22]:
# Switch to the Qwen2.5-7B-Instruct tokenizer; this rebinds the global
# `tokenizer` used by the following cells.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [23]:
# Re-encode batch_sentences (defined earlier in the notebook). The recorded
# output has only input_ids and attention_mask; there is no token_type_ids key.
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[5404, 537, 1774, 90385, 304, 279, 21978, 315, 88163, 11, 369, 807, 525, 26447, 323, 3974, 311, 19234, 13], [3983, 1128, 911, 2086, 17496, 30], [8002, 944, 1744, 566, 8788, 911, 2086, 17496, 11, 77382, 13], [3838, 911, 11964, 724, 550, 30]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [25]:
# Decode the first sentence of the batch: the text comes back exactly, with no
# added marker tokens around it.
tokenizer.decode(encoded_inputs['input_ids'][0])

'Do not meddle in the affairs of wizards, for they are subtle and quick to anger.'

In [26]:
# Date string with spaces. Judging by the ids, the digits are encoded one id
# per digit (17, 15, 17, 20 for "2025"; 16, 23 for "18"). TODO: confirm by
# decoding the ids one at a time.
encoded_inputs = tokenizer("今天是 2025 年 3 月 18 日!")
print(encoded_inputs)

{'input_ids': [100644, 20412, 220, 17, 15, 17, 20, 74577, 112, 220, 18, 220, 9754, 220, 16, 23, 75402, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [27]:
# Round-trip check: the original string, including its spaces, is restored.
tokenizer.decode(encoded_inputs['input_ids'])

'今天是 2025 年 3 月 18 日!'

In [28]:
# Five-character Chinese numeral; the recorded output has only three ids, so
# some ids must cover more than one character.
encoded_inputs = tokenizer("一百二十三")
print(encoded_inputs)

{'input_ids': [102836, 40820, 100957], 'attention_mask': [1, 1, 1]}


In [29]:
# Index 0 is already text here (no leading marker token); it decodes to the
# two-character piece "一百".
tokenizer.decode(encoded_inputs['input_ids'][0])

'一百'

In [30]:
# The second id is the single character "二".
tokenizer.decode(encoded_inputs['input_ids'][1])

'二'

In [31]:
# Variation with "九百" instead of "一百": four ids this time. The last two
# (40820, 100957) are the same as for "一百二十三".
encoded_inputs = tokenizer("九百二十三")
print(encoded_inputs)

{'input_ids': [99609, 99271, 40820, 100957], 'attention_mask': [1, 1, 1, 1]}


In [33]:
# Here the first id is just "九"; "九百" is not merged into one token the way
# "一百" was.
tokenizer.decode(encoded_inputs['input_ids'][0])

'九'

In [34]:
# ... and "百" is a separate token.
tokenizer.decode(encoded_inputs['input_ids'][1])

'百'

### Qwen2.5-0.5B-Instruct
我突然想到一个问题：Qwen2.5-7B-Instruct 和 Qwen2.5-0.5B-Instruct 使用的是同一个 Tokenizer 吗？我觉得应该是同一个。

In [42]:
# Load the tokenizer of the 0.5B model from the same Qwen2.5 family, to
# compare it with the 7B one. Rebinds the global `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [43]:
# Same probe sentence as used with Qwen2.5-7B-Instruct, so the ids can be
# compared directly.
encoded_inputs = tokenizer("今天是 2025 年 3 月 18 日!")
print(encoded_inputs)

{'input_ids': [100644, 20412, 220, 17, 15, 17, 20, 74577, 112, 220, 18, 220, 9754, 220, 16, 23, 75402, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


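从这个样例看，Qwen2.5-7B-Instruct 和 Qwen2.5-0.5B-Instruct 的编码结果完全一致，下载的 tokenizer_config.json、vocab.json、merges.txt、tokenizer.json 大小也都相同，看来 Qwen2.5 系列使用的是同一个分词器；该分词器是多语言的（至少覆盖英语和汉语）。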

In [44]:
# Round-trip check with the 0.5B tokenizer.
tokenizer.decode(encoded_inputs['input_ids'])

'今天是 2025 年 3 月 18 日!'

In [45]:
# Second probe (Chinese numeral); the ids match those recorded for
# Qwen2.5-7B-Instruct.
encoded_inputs = tokenizer("九百二十三")
print(encoded_inputs)

{'input_ids': [99609, 99271, 40820, 100957], 'attention_mask': [1, 1, 1, 1]}


### Qwen2-0.5B-Instruct
我又想到一个新的问题：Qwen2.5-7B-Instruct 和 Qwen2.5-0.5B-Instruct 使用同一个 Tokenizer，是因为它们都属于 Qwen2.5 系列；那么 Qwen2 和 Qwen2.5 的 Tokenizer 是否也一样？我猜可能不一样。

In [46]:
# Load the Qwen2 (not Qwen2.5) 0.5B tokenizer for the cross-generation
# comparison. Rebinds the global `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [47]:
# Same probe sentence as used for the Qwen2.5 tokenizers, so the ids can be
# compared directly.
encoded_inputs = tokenizer("今天是 2025 年 3 月 18 日!")
print(encoded_inputs)

{'input_ids': [100644, 20412, 220, 17, 15, 17, 20, 74577, 112, 220, 18, 220, 9754, 220, 16, 23, 75402, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


Qwen2.5-0.5B:{'input_ids': [100644, 20412, 220, 17, 15, 17, 20, 74577, 112, 220, 18, 220, 9754, 220, 16, 23, 75402, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
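看起来是一样的！在这个样例上，Qwen2 和 Qwen2.5 的编码结果完全相同，下载的 vocab.json、merges.txt、tokenizer.json 大小也一致，所以两者的词表应该是相同的。不过 tokenizer_config.json 的大小不同（1.29k 对 7.30k），说明配置部分（例如特殊 token 或对话模板）可能有差异，仅凭一个样例还不能断定两个 Tokenizer 完全等价。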

In [48]:
# Round-trip check with the Qwen2 tokenizer.
tokenizer.decode(encoded_inputs['input_ids'])

'今天是 2025 年 3 月 18 日!'

In [49]:
# Numeral probe; the ids match those recorded for the Qwen2.5 tokenizers.
encoded_inputs = tokenizer("九百二十三")
print(encoded_inputs)

{'input_ids': [99609, 99271, 40820, 100957], 'attention_mask': [1, 1, 1, 1]}


In [52]:
# tokenize() returns the token strings rather than ids. The printed pieces are
# not readable Chinese: presumably the tokenizer's internal byte-level
# representation (verify against the tokenizer docs).
# NOTE(review): this rebinds `encoded_inputs` from the tokenizer(...) result to
# a plain list of token strings; a separate name (e.g. `tokens`) would be
# clearer, but the next cell reads this variable.
encoded_inputs = tokenizer.tokenize("九百二十三")
print(encoded_inputs)

['ä¹Ŀ', 'çĻ¾', 'äºĮ', 'åįģä¸ī']


In [53]:
# Map the token strings (held in `encoded_inputs` at this point) back to ids;
# the result equals the input_ids printed earlier for the same text.
tokenizer.convert_tokens_to_ids(encoded_inputs)

[99609, 99271, 40820, 100957]