# 了解各家分词器的能力

In [1]:
# 导入库
from transformers import AutoTokenizer

In [6]:
# Byte-level encoding table
def bytes_to_unicode():
    """
    Build a table that assigns one unicode character to each of the 256 byte values.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
    the same code point. Every other byte is assigned, in ascending byte order,
    the next unused code point starting at 256.

    Returns:
        dict: {byte_value: unicode_char} with 256 entries.
    """
    identity_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    identity_set = set(identity_bytes)
    table = {byte: chr(byte) for byte in identity_bytes}
    # The remaining bytes are shifted past the 8-bit range, one code point each.
    leftovers = (byte for byte in range(2**8) if byte not in identity_set)
    for offset, byte in enumerate(leftovers):
        table[byte] = chr(2**8 + offset)
    return table

In [7]:
# Invert the mapping
def get_reverse_map(forward_map):
    """
    Build the inverse of a forward mapping.

    Args:
        forward_map: dict of {byte_value: unicode_char}.

    Returns:
        dict: {unicode_char: byte_value}. If several keys share a value,
        the key that appears last in iteration order is kept.
    """
    inverse = {}
    for byte_value, char in forward_map.items():
        inverse[char] = byte_value
    return inverse

In [8]:
# Byte sequence -> unicode string
def bytes_to_unicode_str(byte_sequence, mapping=None):
    """
    Render a byte sequence as a string, one mapped character per byte.

    Args:
        byte_sequence: iterable of byte values (e.g. a ``bytes`` object).
        mapping: optional {byte_value: unicode_char} table. When omitted, the
            module-level ``forward_map`` is used, so that table must already
            exist at call time.

    Returns:
        str: the concatenation of the mapped characters.

    Raises:
        KeyError: if a byte value is missing from the table.
    """
    table = forward_map if mapping is None else mapping
    return ''.join(table[b] for b in byte_sequence)

# Unicode string -> byte sequence
def unicode_str_to_bytes(unicode_str, mapping=None):
    """
    Convert a string of mapped characters back into the bytes they stand for.

    Args:
        unicode_str: string whose characters are all keys of the table.
        mapping: optional {unicode_char: byte_value} table. When omitted, the
            module-level ``reverse_map`` is used, so that table must already
            exist at call time.

    Returns:
        bytes: one byte per input character.

    Raises:
        KeyError: if a character is missing from the table.
    """
    table = reverse_map if mapping is None else mapping
    return bytes(table[c] for c in unicode_str)

In [9]:
# Forward table: byte value -> unicode character.
# bytes_to_unicode_str reads this module-level name.
forward_map = bytes_to_unicode()
# Reverse table: unicode character -> byte value.
# unicode_str_to_bytes reads this module-level name.
reverse_map = get_reverse_map(forward_map)

## DeepSeek

In [3]:
# Load the tokenizer for deepseek-ai/DeepSeek-V3-0324
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3-0324")

In [4]:
# Inspect the vocabulary size reported by the tokenizer
# NOTE(review): vocab_size may not count added/special tokens — compare with len(tokenizer) to confirm.
vocab_size = tokenizer.vocab_size
print("vocab_size: ", vocab_size)

vocab_size:  128000


In [5]:
# Split the sample sentence into tokens.
# The tokens print in byte-level form; the next cell maps them back to readable text.
tokens = tokenizer.tokenize("你好，世界！")
print("tokens: ", tokens)

tokens:  ['ä½łå¥½', 'ï¼Į', 'ä¸ĸçķĮ', 'ï¼ģ']


In [10]:
# Map each byte-level token back to readable text.
# Nothing here guarantees that a single token holds a complete UTF-8 sequence,
# so undecodable bytes are rendered as U+FFFD instead of raising
# UnicodeDecodeError. A comprehension is used so that no loop variable named
# `text` leaks into the notebook namespace, where later cells use `text` for
# the input sentence.
texts = [unicode_str_to_bytes(token).decode("utf-8", errors="replace") for token in tokens]
print(texts)

['你好', '，', '世界', '！']


In [11]:
# List the tokenizer's special tokens
special_tokens = tokenizer.all_special_tokens
print(special_tokens)

['<｜begin▁of▁sentence｜>', '<｜end▁of▁sentence｜>']


In [12]:
# Maximum length of a single sentence that can be fed to the model
max_len = tokenizer.max_len_single_sentence
print(max_len)

131071


In [None]:
# Tokenize a Python error message to see how code-like text is split
text = "AttributeError: module 'scipy.misc' has no attribute 'toimage'"
tokens = tokenizer.tokenize(text)
print(tokens)

## Qwen2.5-VL

In [13]:
# Load the tokenizer for Qwen/Qwen2.5-VL-32B-Instruct
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct")

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

In [14]:
# Inspect the vocabulary size reported by the tokenizer
# NOTE(review): vocab_size may not count added/special tokens — compare with len(tokenizer) to confirm.
vocab_size = tokenizer.vocab_size
print("vocab_size: ", vocab_size)

vocab_size:  151643


In [15]:
# Split the sample sentence into tokens.
# The tokens print in byte-level form; the next cell maps them back to readable text.
tokens = tokenizer.tokenize("你好，世界！")
print("tokens: ", tokens)

tokens:  ['ä½łå¥½', 'ï¼Į', 'ä¸ĸçķĮ', 'ï¼ģ']


In [16]:
# Map each byte-level token back to readable text.
# Nothing here guarantees that a single token holds a complete UTF-8 sequence,
# so undecodable bytes are rendered as U+FFFD instead of raising
# UnicodeDecodeError. A comprehension is used so that no loop variable named
# `text` leaks into the notebook namespace, where later cells use `text` for
# the input sentence.
texts = [unicode_str_to_bytes(token).decode("utf-8", errors="replace") for token in tokens]
print(texts)

['你好', '，', '世界', '！']


In [17]:
# List the tokenizer's special tokens
special_tokens = tokenizer.all_special_tokens
print(special_tokens)

['<|im_end|>', '<|endoftext|>', '<|im_start|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']


In [18]:
# Maximum length of a single sentence that can be fed to the model
max_len = tokenizer.max_len_single_sentence
print(max_len)

131072


In [20]:
# Tokenize a Python error message to see how code-like text is split
text = "AttributeError: module 'scipy.misc' has no attribute 'toimage'"
tokens = tokenizer.tokenize(text)
print(tokens)

['Attribute', 'Error', ':', 'Ġmodule', "Ġ'", 'sc', 'ipy', '.misc', "'", 'Ġhas', 'Ġno', 'Ġattribute', "Ġ'", 'to', 'image', "'"]


In [21]:
# Encode: convert the text into a list of token ids
encoded_inputs = tokenizer.encode(text)
print(encoded_inputs)

[3907, 1454, 25, 4688, 364, 2388, 22947, 58753, 6, 702, 902, 7035, 364, 983, 1805, 6]


In [23]:
# Decode: convert the token ids back into text
decoded_inputs = tokenizer.decode(encoded_inputs)
print(decoded_inputs)

AttributeError: module 'scipy.misc' has no attribute 'toimage'


## meta-llama/Llama-3.3-70B-Instruct

In [38]:
# Load the tokenizer for meta-llama/Llama-3.3-70B-Instruct.
# This repository is gated: without approved access the load raises OSError.
# The error is caught and printed so that "Restart & Run All" can continue with
# the sections below; on failure `tokenizer` keeps whatever value it had before.
try:
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
except OSError as err:
    print(f"OSError: {err}")

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct.
403 Client Error. (Request ID: Root=1-67ea7dab-7a40e64e4649fecc3bd514b3;b51de765-b849-4237-8a3e-2b0e6fcf35cc)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/resolve/main/config.json.
Your request to access model meta-llama/Llama-3.3-70B-Instruct has been rejected by the repo's authors.

## google/gemma-3-27b-it

In [27]:
# Load the tokenizer for google/gemma-3-27b-it
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-27b-it")

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [28]:
# Inspect the vocabulary size reported by the tokenizer
# NOTE(review): vocab_size may not count added/special tokens — compare with len(tokenizer) to confirm.
vocab_size = tokenizer.vocab_size
print("vocab_size: ", vocab_size)

vocab_size:  262144


In [29]:
# Split the sample sentence into tokens
tokens = tokenizer.tokenize("你好，世界！")
print("tokens: ", tokens)

tokens:  ['你好', '，', '世界', '！']


In [33]:
# List the tokenizer's special tokens
special_tokens = tokenizer.all_special_tokens
print(special_tokens)

['<bos>', '<eos>', '<unk>', '<pad>', '<start_of_image>', '<end_of_image>', '<image_soft_token>']


In [34]:
# Maximum length of a single sentence that can be fed to the model
# NOTE(review): the value recorded for this tokenizer is astronomically large, which
# looks like an "unset" placeholder rather than a real limit — confirm against the model's documentation.
max_len = tokenizer.max_len_single_sentence
print(max_len)

1000000000000000019884624838655


In [35]:
# Tokenize a Python error message to see how code-like text is split
text = "AttributeError: module 'scipy.misc' has no attribute 'toimage'"
tokens = tokenizer.tokenize(text)
print(tokens)

['Attribute', 'Error', ':', '▁module', "▁'", 'sc', 'ipy', '.', 'misc', "'", '▁has', '▁no', '▁attribute', "▁'", 'to', 'image', "'"]


In [36]:
# Encode: convert the text into a list of token ids
# NOTE(review): the recorded ids start with one more entry than tokenize() produced, and the
# decoded text begins with <bos>; presumably encode() adds special tokens by default
# (add_special_tokens=False should omit them — confirm).
encoded_inputs = tokenizer.encode(text)
print(encoded_inputs)

[2, 9392, 3494, 236787, 9173, 756, 1166, 40493, 236761, 74716, 236789, 815, 951, 9176, 756, 1071, 3304, 236789]


In [37]:
# Decode: convert the token ids back into text
# NOTE(review): special tokens such as <bos> are kept in the decoded string here;
# skip_special_tokens=True should drop them — confirm.
decoded_inputs = tokenizer.decode(encoded_inputs)
print(decoded_inputs)

<bos>AttributeError: module 'scipy.misc' has no attribute 'toimage'
