## Tokenizer 构造输入

- 调用模型: distilbert-base-uncased-finetuned-sst-2-english
- link: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english


**Tokenizer需要和Model配合使用：一定要相匹配**

- Tokenizer outputs => model input

- Auto\*Tokenizer  & Auto\*Model: Generic type


- tokenizer: 完全服务于model input
    - len(input_ids) == len(attention_mask)
    - tokenizer(test_sentence): 实际上是在内部调用 tokenizer.\_\_call\_\_: encode
    - tokenizer.encode == tokenizer.tokenize + tokenizer.convert_tokens_to_ids (encode 还会在首尾加上特殊 token [CLS]/[SEP])
    - tokenizer.decode
    - tokenizer 的工作原理: tokenizer.vocab: 字典 token => ids 的映射关系
        - tokenizer.special_tokens_map
    - attention_mask: 1 表示不是padding的部分, 0 表示是padding的部分

In [51]:
# Sample sentences fed to the tokenizer and the sentiment model.
test_sentence = [
    "today is not that bad",
    "today is so bad",
    "I don't know how to do it!",
]
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [52]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [53]:
# 导入tokenizer和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [54]:
# Display the tokenizer's repr (vocab size, max length, padding side, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [55]:
# Tokenize the whole batch at once: truncation and padding are enabled and the
# result is returned as PyTorch tensors ('pt'), giving equally long rows of
# input_ids with a matching attention_mask.
batch_input = tokenizer(test_sentence, truncation=True, padding=True, return_tensors='pt')
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102,    0,    0,    0,    0,    0],
        [ 101, 2651, 2003, 2061, 2919,  102,    0,    0,    0,    0,    0,    0],
        [ 101, 1045, 2123, 1005, 1056, 2113, 2129, 2000, 2079, 2009,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [56]:
# Calling the tokenizer on one sentence without return_tensors yields plain
# Python lists for input_ids and attention_mask.
tokenizer(test_sentence[2])

{'input_ids': [101, 1045, 2123, 1005, 1056, 2113, 2129, 2000, 2079, 2009, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [57]:
tokenizer.encode(test_sentence[2])   # 101 is the start token ([CLS]) and 102 is the end token ([SEP])

[101, 1045, 2123, 1005, 1056, 2113, 2129, 2000, 2079, 2009, 999, 102]

In [58]:
# tokenize() only splits the text into token strings: no ids and no special tokens.
tokenizer.tokenize(test_sentence[2])

['i', 'don', "'", 't', 'know', 'how', 'to', 'do', 'it', '!']

In [59]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentence[0])) # [2651, 2003, 2025, 2008, 2919]: the same ids encode() gives, minus the [CLS]/[SEP] ids 101 and 102

[2651, 2003, 2025, 2008, 2919]

In [60]:
# NOTE(review): id 7293 decodes to 'nod'; 'not' in test_sentence[0] maps to 2025 — confirm 7293 is intentional.
tokenizer.decode([101, 2651, 2003, 7293, 2008, 2919, 102])  # [CLS] ... [SEP]: the classic BERT way of wrapping a sequence

'[CLS] today is nod that bad [SEP]'

In [61]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size

30522

In [62]:
# Mapping from each special-token role (unk/sep/pad/cls/mask) to its token string.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [63]:
# Look up the vocabulary id of every special token. A concrete list is passed
# (the documented input type) rather than a pass-through generator expression.
tokenizer.convert_tokens_to_ids(list(tokenizer.special_tokens_map.values()))

[100, 102, 0, 101, 103]

In [64]:
# max_length: sets the maximum sequence length
# truncation: cut sequences that do not fit the maximum length
# padding: fill shorter sequences with padding
# return_tensors: 'pt' selects PyTorch tensors

# With padding=True, max_length is ignored for padding (the rows below are 12 wide, not 256);
# with padding='max_length', every sequence is padded with 0 up to max_length.
batch_input = tokenizer(test_sentence, max_length=256, truncation=True, padding=True, return_tensors='pt')
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102,    0,    0,    0,    0,    0],
        [ 101, 2651, 2003, 2061, 2919,  102,    0,    0,    0,    0,    0,    0],
        [ 101, 1045, 2123, 1005, 1056, 2113, 2129, 2000, 2079, 2009,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [65]:
# padding='max_length' pads every row with 0 up to max_length (32 here),
# regardless of the longest sentence in the batch.
batch_input = tokenizer(test_sentence, max_length=32, truncation=True, padding='max_length', return_tensors='pt')
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 2651, 2003, 2061, 2919,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 1045, 2123, 1005, 1056, 2113, 2129, 2000, 2079, 2009,  999,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}

## Model 模型调用

In [66]:
import torch
import torch.nn.functional as F

In [71]:
# Run inference with gradient tracking switched off.
# `scores` holds the softmax-normalised class probabilities; the argmax of each
# row is the predicted class index, which id2label turns into the sentiment name.
with torch.no_grad():
    outputs = model(**batch_input)
    print(outputs)

    scores = outputs.logits.softmax(dim=1)
    print(scores)

    labels = scores.argmax(dim=1)
    print(labels)

    label = [model.config.id2label[class_idx] for class_idx in labels.tolist()]
    print(label)  # ['POSITIVE', 'NEGATIVE', 'NEGATIVE']

# Inputs: ['today is not that bad', 'today is so bad', 'I don\'t know how to do it!'] -> pos, neg, neg

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899],
        [ 4.4198, -3.5049]]), hidden_states=None, attentions=None)
tensor([[8.4632e-04, 9.9915e-01],
        [9.9980e-01, 1.9531e-04],
        [9.9964e-01, 3.6156e-04]])
tensor([1, 0, 0])
['POSITIVE', 'NEGATIVE', 'NEGATIVE']


In [68]:
# Display the model configuration, including the id2label / label2id mappings
# used to name the predicted classes.
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}