# tokenizer，构造输入

- tokenizer, model： 相匹配
- AutoTokenizer, AutoModel: Generic Type
- tokenizer[构造model's input]
  - len(input_ids) = len(attention_mask)
  - tokenizer.encode = tokenizer.tokenize + tokenizer.convert_tokens_to_ids + tokenizer.build_inputs_with_special_tokens
  - tokenizer.vocab 存储了token -> id 的映射
    - tokenizer.special_tokens_map
  - attention mask 与 padding相匹配

In [1]:
test_sentences = ['today is not that bad', 'today is so bad']
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
batch_input = tokenizer(test_sentences, max_length=12, truncation=True, padding='max_length', return_tensors='pt')

In [6]:
batch_input

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 2008, 2919,  102,    0,    0,    0,    0,    0],
        [ 101, 2651, 2003, 2061, 2919,  102,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}

In [7]:
tokenizer(test_sentences[0])

{'input_ids': [101, 2651, 2003, 2025, 2008, 2919, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [8]:
tokenizer.tokenize(test_sentences[0]) # ['today', 'is', 'not', 'that', 'bad']
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentences[0])) # [2651, 2003, 2025, 2008, 2919]
tokenizer.build_inputs_with_special_tokens(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentences[0]))) # [101, 2651, 2003, 2025, 2008, 2919, 102]
tokenizer.encode(test_sentences[0]) # [101, 2651, 2003, 2025, 2008, 2919, 102]
# tokenizer.encode = tokenizer.build_inputs_with_special_tokens(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_sentences[0])))

[101, 2651, 2003, 2025, 2008, 2919, 102]

In [9]:
tokenizer.decode(tokenizer.encode(test_sentences[0])) # '[CLS] today is not that bad [SEP]'

'[CLS] today is not that bad [SEP]'

In [10]:
tokenizer.vocab

{'##drum': 21884,
 'ӏ': 1218,
 'cholera': 25916,
 'harassed': 28186,
 '##亻': 30283,
 'joanne': 23459,
 'nat': 14085,
 'saturdays': 18860,
 'oppressive': 28558,
 'yuri': 14331,
 'ron': 6902,
 '[unused664]': 669,
 '##flict': 29301,
 'gems': 20296,
 'arjun': 26024,
 'uganda': 10031,
 '7th': 5504,
 'blossom': 20593,
 '‒': 1515,
 'friedrich': 8896,
 '##ław': 19704,
 'byu': 23471,
 'interceptor': 24727,
 '##у': 29748,
 'averaged': 11398,
 'ultimately': 4821,
 'deadly': 9252,
 'exited': 15284,
 'spores': 23763,
 'federico': 20493,
 'populated': 10357,
 'injustice': 21321,
 'would': 2052,
 'burgess': 17754,
 'sweden': 4701,
 'browser': 16602,
 'horace': 12757,
 '##tf': 24475,
 'sorted': 19616,
 '##ores': 16610,
 '##چ': 29840,
 'libyan': 19232,
 'befriended': 23386,
 'lucille': 28016,
 '##御': 30373,
 'seventeenth': 15425,
 '##with': 24415,
 'highway': 3307,
 'coleman': 11608,
 'frederic': 15296,
 '##不': 30270,
 'strained': 12250,
 '430': 19540,
 '##ular': 7934,
 'swan': 10677,
 'alphabet': 1244

# model，调用模型

In [19]:
import torch
import torch.nn.functional as F
with torch.no_grad():
    output = model(**batch_input)
    print(output)
    scores = F.softmax(output.logits, dim=-1)
    print(scores)
    labels = torch.argmax(scores, dim=1)
    labels = [model.config.id2label[id] for id in labels.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.4620,  3.6118],
        [ 4.7508, -3.7899]]), hidden_states=None, attentions=None)
tensor([[8.4632e-04, 9.9915e-01],
        [9.9980e-01, 1.9531e-04]])
['POSITIVE', 'NEGATIVE']


In [18]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.30.2",
  "vocab_size": 30522
}