# Tokenizer简介
## 数据预处理
1. 对文本数据进行分词
2. 根据分词结果构建字典
3. 根据字典进行数据映射，将文本序列转换为数字序列
4. 数据填充和截断，保证数据长度满足模型输入可接受范围，并且batch内维度一致

# Tokenizer基本使用

In [1]:
from transformers import AutoTokenizer

In [3]:
# Sample sentence fed to the tokenizer in the cells below.
sentence = "弱小的我也有大梦想"

## Step1 加载与保存

In [2]:
# Load the tokenizer that belongs to the given model id.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")    # a local path can be passed instead
tokenizer
# Save the tokenizer files to a local directory:
# tokenizer.save_pretrained("local_path")

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## Step2 句子分词

In [4]:
# Split the sentence into a list of token strings (not ids yet).
token = tokenizer.tokenize(sentence)
token

['弱', '小', '的', '我', '也', '有', '大', '梦', '想']

## Step3 查看词典

In [5]:
# Token -> id mapping.
# NOTE(review): this displays the whole vocabulary, which makes for a very long cell output.
tokenizer.vocab

{'##戈': 15819,
 '##瞅': 17788,
 '妝': 1976,
 '柱': 3393,
 '厥': 1335,
 '##ob': 11047,
 '##ven': 10844,
 '##摘': 16093,
 '亏': 755,
 '斗': 3159,
 '楚': 3504,
 '区': 1277,
 '205': 9860,
 '##澆': 17125,
 '##仔': 13855,
 '軒': 6726,
 '##茶': 18820,
 'action': 10805,
 '善': 1587,
 '蕈': 5932,
 '痤': 4583,
 '谍': 6452,
 '取': 1357,
 '譲': 6360,
 '彤': 2502,
 '##挣': 15971,
 '幟': 2392,
 '365': 8728,
 '##霑': 20512,
 'say': 10114,
 '濠': 4090,
 '扔': 2803,
 '矽': 4769,
 '署': 5392,
 'ｈ': 8058,
 '秀': 4899,
 'oa': 10527,
 '禦': 4888,
 '##翰': 18489,
 '俠': 927,
 '##zo': 10121,
 '绷': 5338,
 '##厄': 14380,
 '##誰': 19363,
 'ktv': 8894,
 '拘': 2872,
 '捅': 2927,
 '革': 7484,
 '##卅': 14341,
 '##lr': 10712,
 '##ls': 8916,
 '胸': 5541,
 '##肺': 18568,
 '皇': 4640,
 '駄': 7685,
 'm3': 9305,
 'lost': 12593,
 '協': 1295,
 '积': 4916,
 'hair': 11408,
 'reuters': 11778,
 '髪': 7772,
 '##趕': 19691,
 '勉': 1236,
 '腾': 5596,
 '巻': 2351,
 'phone': 8922,
 'ob': 12639,
 '樽': 3574,
 '##ct': 8722,
 '##续': 18387,
 '๑': 285,
 '胁': 5516,
 'ios': 8276,
 '罡': 

In [6]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size

21128

## Step4 索引转换

In [7]:
# Convert between the three representations: tokens -> ids -> tokens -> string.
ids = tokenizer.convert_tokens_to_ids(token)
token = tokenizer.convert_ids_to_tokens(ids)
string = tokenizer.convert_tokens_to_string(token)
# A cell only displays its last expression; a cell ending in an assignment shows nothing,
# so name the ids explicitly to display them.
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682]

In [12]:
# More convenient: encode() goes from raw text to ids in a single call.
ids = tokenizer.encode(sentence)
# ids = tokenizer.encode(sentence, add_special_tokens=False)  # leave the special tokens out of the result
# The result below gains 101 at the start and 102 at the end: the [CLS] and [SEP] special
# tokens that mark the beginning and end of the sequence. Other tokenizers add different ones.
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102]

In [13]:
# Turn ids back into text; skip_special_tokens=False keeps [CLS] and [SEP] in the string.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想 [SEP]'

## Step5 填充与截断

In [16]:
# Padding: extend the id list with pad ids until it is max_length long.
ids = tokenizer.encode(sentence, padding="max_length", max_length=15)
print(ids)
# Truncation: cut the id list down to max_length.
ids = tokenizer.encode(sentence, max_length=5, truncation=True)     # this length includes the start and end special tokens
print(ids)

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102, 0, 0, 0, 0]
[101, 2483, 2207, 4638, 102]


## Step6 其它输入部分

In [21]:
ids = tokenizer.encode(sentence, padding="max_length", max_length=15)
print(ids)

# Attention mask: 1 for real tokens, 0 for padding, so the padded positions are masked out.
# Compare against the tokenizer's own pad id instead of a hard-coded 0: the pad id is
# tokenizer-specific, and a literal 0 silently breaks with a tokenizer that uses another id.
attention_mask = [int(idx != tokenizer.pad_token_id) for idx in ids]
print(attention_mask)
# Segment ids: which sentence each token belongs to (all 0 for a single sentence).
token_type_ids = [0] * len(ids)
print(token_type_ids)

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [23]:
# 快速调用方式
# Shortcut: get input_ids, token_type_ids and attention_mask from a single call.
inputs = tokenizer.encode_plus(sentence, padding="max_length", max_length=15)
# Calling the tokenizer object directly produces the same result here.
inputs_2 = tokenizer(sentence, padding="max_length", max_length=15)
inputs, inputs_2

({'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]},
 {'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]})

## Step7 处理batch数据

In [2]:
# Pass a list of sentences to tokenize them together.
# Requires `tokenizer` from the loading cell under Step1 to exist in the current kernel;
# running this cell first in a fresh kernel raises NameError.
sentences = ["弱小的我也有大梦想",
             "有梦想谁都了不起",
             "追逐梦想的心。梦想本身，更可贵"]
res = tokenizer(sentences)
res

NameError: name 'tokenizer' is not defined

# Fast / Slow Tokenizer

In [6]:
# Mixed Chinese/English input used for the fast vs. slow tokenizer comparison below.
test = "弱小的我也有大Dreaming!"

In [3]:
# With default arguments AutoTokenizer returns the fast implementation for this model
# (the repr below shows BertTokenizerFast, is_fast=True).
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
# use_fast=False requests the slow implementation instead
# (the repr below shows BertTokenizer, is_fast=False).
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [9]:
%%time
# Time 10000 single-sentence calls to the fast tokenizer.
for i in range(10000):
    fast_tokenizer(test)

CPU times: total: 125 ms
Wall time: 622 ms


In [10]:
%%time
# Time the same 10000 calls with the slow tokenizer for comparison.
for i in range(10000):
    slow_tokenizer(test)

CPU times: total: 641 ms
Wall time: 1.74 s


In [12]:
# offset_mapping holds, for each token, its (start, end) character span in the input string;
# special tokens get (0, 0).
inputs_3 = fast_tokenizer(test, return_offsets_mapping=True)    # fast tokenizers only
inputs_3

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [13]:
# Index of the word each token came from; None marks special tokens. Tokens that share an
# index are sub-word pieces of the same word.
inputs_3.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

# 特殊的Tokenizer加载
部分模型的分词器不是 transformers 内置的实现，而是由模型仓库自带的代码提供；加载这类分词器时需要传入 `trust_remote_code=True`。

In [14]:
# NOTE: trust_remote_code=True allows tokenizer code shipped in the model repository to be
# executed locally; only enable it for repositories you trust.
# Loading by model id needs access to the Hugging Face Hub or a cached copy of the files.
special_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# Save a local copy, then reload from that directory:
# special_tokenizer.save_pretrained("local_path")
# special_tokenizer = AutoTokenizer.from_pretrained("local_path", trust_remote_code=True)

OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.