## 加载模型对应的分词器 tokenizer

In [2]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = 'bert-base-uncased'

# Build the tokenizer that later cells use to turn text into token ids.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [4]:
tokenizer  # vocabulary size is 30522; padding is applied on the right side

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [6]:
# Show the mapping from each special-token role (e.g. 'cls_token') to its literal token string.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [8]:
# Collect the literal special-token strings ('[UNK]', '[SEP]', ...) in map order.
special_tokens_list = [*tokenizer.special_tokens_map.values()]
# Look up the vocabulary id of each special token (plain lookup, nothing is added).
tokenizer.convert_tokens_to_ids(special_tokens_list)

[100, 102, 0, 101, 103]

In [10]:
tokenizer.encode(special_tokens_list)  # encode() prepends id 101 ([CLS]) and appends id 102 ([SEP])

[101, 100, 102, 0, 101, 103, 102]

## 认识文本语料


#### fetch_20newsgroups

20 newsgroups数据集包含18000多篇新闻组文章，一共涉及20种话题，所以称作20newsgroups text dataset。数据集分为两部分：训练集和测试集，通常用来做文本分类，文章大致均匀地分布在20个不同主题的新闻组中。20newsgroups数据集是被用于文本分类、文本挖掘和信息检索研究的国际标准数据集之一。一些新闻组的主题特别相似(e.g. comp.sys.ibm.pc.hardware / comp.sys.mac.hardware)，还有一些却完全不相关 (e.g. misc.forsale / soc.religion.christian)。


- newsgroups_train.DESCR  关于dataset的基本介绍
- newsgroups_train.data  type list length 11314 
- newsgroups_train.target 分类  length 11314
- newsgroups_train.target_names

In [11]:
from sklearn.datasets import fetch_20newsgroups
# Load the training subset of the 20 newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')

In [15]:
from collections import Counter

In [18]:
Counter(newsgroups_train.target)  # 20 classes in total (labels 0-19); most have roughly 550-600 samples, the smallest (label 19) has 377

Counter({7: 594,
         4: 578,
         1: 584,
         14: 593,
         16: 546,
         13: 594,
         3: 590,
         2: 591,
         8: 598,
         19: 377,
         6: 585,
         0: 480,
         12: 591,
         5: 593,
         10: 600,
         9: 597,
         15: 599,
         17: 564,
         18: 465,
         11: 595})

In [19]:
# List the newsgroup (topic) names of the dataset.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Tokenizer 补充

- input_ids, attention_mask
- encode_plus, token_type_ids
    - 有些NLP任务需要将两个句子拼接在一起，比如序列标注/分类和问答。例如问答时，需要第一个句子作为上下文，第二个句子作为问题，要求模型输出答案。这时tokenizer按顺序接受两个句子作为输入并输出数字编码。虽然返回的数字编码中已经包含了句子的分隔信息，Tokenizer的输出仍然提供可选的第3个常用字段"token_type_ids"，用来表明返回的数字编码中哪些属于第一个句子，哪些属于第二个句子。
    - 句子对一般用在 NSP（next sentence prediction）任务中，它是 BERT 的预训练任务之一

In [22]:
# Take the first three posts of the training set as a small demo sample.
test_news = newsgroups_train.data[:3]
test_news

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [23]:
list(map(len, test_news))  # character length of each demo post

[721, 858, 1981]

In [27]:
# Single-text mode: each post is encoded on its own (no text pair),
# truncated to at most 32 token ids.
tokenizer(test_news, truncation=True, max_length=32)

{'input_ids': [[101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 1006, 2073, 1005, 1055, 2026, 2518, 1007, 3395, 1024, 2054, 2482, 2003, 2023, 999, 1029, 102], [101, 2013, 1024, 3124, 5283, 2080, 1030, 9806, 1012, 1057, 1012, 2899, 1012, 3968, 2226, 1006, 3124, 13970, 2080, 1007, 3395, 1024, 9033, 5119, 8554, 1011, 2345, 2655, 12654, 1024, 2345, 102], [101, 2013, 1024, 1056, 29602, 6856, 1030, 14925, 1012, 14925, 2078, 1012, 19749, 1012, 3968, 2226, 1006, 2726, 1041, 12688, 1007, 3395, 1024, 1052, 2497, 3980, 1012, 1012, 1012, 3029, 1024, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [31]:
# When two texts are encoded as a pair, token_type_ids are no longer all 0:
# positions belonging to the first text get 0, those of the second text get 1.

# encode_plus joins at most two texts here: `text` and `text_pair`.
text_a, text_b = test_news[0], test_news[1]
encodings_ab = tokenizer.encode_plus(text=text_a, text_pair=text_b)

# Decode the joined ids back to a string to see how the pair was assembled.
ab_input_ids = encodings_ab["input_ids"]
decoded_ab = tokenizer.decode(ab_input_ids)

print("Encoded sequence(AB):", ab_input_ids)
print("Decoded sequence(AB):", decoded_ab)
print("Token type ids(AB):", encodings_ab["token_type_ids"])

Encoded sequence(AB): [101, 2013, 1024, 3393, 2099, 2595, 3367, 1030, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 1006, 2073, 1005, 1055, 2026, 2518, 1007, 3395, 1024, 2054, 2482, 2003, 2023, 999, 1029, 1050, 3372, 2361, 1011, 14739, 1011, 3677, 1024, 10958, 2278, 2509, 1012, 11333, 2213, 1012, 8529, 2094, 1012, 3968, 2226, 3029, 1024, 2118, 1997, 5374, 1010, 2267, 2380, 3210, 1024, 2321, 1045, 2001, 6603, 2065, 3087, 2041, 2045, 2071, 4372, 7138, 2368, 2033, 2006, 2023, 2482, 1045, 2387, 1996, 2060, 2154, 1012, 2009, 2001, 1037, 1016, 1011, 2341, 2998, 2482, 1010, 2246, 2000, 2022, 2013, 1996, 2397, 20341, 1013, 2220, 17549, 1012, 2009, 2001, 2170, 1037, 5318, 4115, 1012, 1996, 4303, 2020, 2428, 2235, 1012, 1999, 2804, 1010, 1996, 2392, 21519, 2001, 3584, 2013, 1996, 2717, 1997, 1996, 2303, 1012, 2023, 2003, 2035, 1045, 2113, 1012, 2065, 3087, 2064, 2425, 4168, 1037, 2944, 2171, 1010, 3194, 28699, 2015, 1010, 2086, 1997, 2537, 1010, 2073, 2023, 2482, 2003, 2081, 1010, 2381, 1010,