In [139]:
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# from transformers import pipeline

In [140]:
# Name of the checkpoint to use; the tokenizer below is loaded for this checkpoint.
check_point = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(check_point)

In [141]:
# Two example sentences of different lengths, tokenized together as one batch.
# NOTE(review): the sentences read "for a this course" and "I have this so much";
# these look like typos (possibly "hate" was intended) — confirm before changing,
# because the recorded outputs below correspond to the text exactly as written.
raw_inputs = [
    'I have been waiting for a this course my whole life',
    ' I have this so much'
]
# padding=True fills the shorter sentence (see the trailing 0 ids and 0 mask
# entries in the output); return_tensors='pt' yields PyTorch tensors.
inputs = tokenizer(raw_inputs, return_tensors='pt',padding=True, truncation=True)
inputs

{'input_ids': tensor([[ 101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 2026, 2878, 2166,
          102],
        [ 101, 1045, 2031, 2023, 2061, 2172,  102,    0,    0,    0,    0,    0,
            0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])}

In [142]:
# tokenizer.convert_tokens_to_ids(inputs['input_ids'])

In [143]:
# View the token ids as a NumPy array (tensor moved to CPU first).
inputs['input_ids'].cpu().numpy()

array([[ 101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 2026, 2878,
        2166,  102],
       [ 101, 1045, 2031, 2023, 2061, 2172,  102,    0,    0,    0,    0,
           0,    0]], dtype=int64)

In [144]:
# Load the checkpoint with AutoModel; the module printed in the next cell is a
# DistilBertModel, and a later cell reads `last_hidden_state` from its output.
from transformers import AutoModel
# Same checkpoint string as the one assigned when the tokenizer was loaded.
check_point = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModel.from_pretrained(check_point)

In [145]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [146]:
# Forward pass on the padded batch. The printed shape is [2, 13, 768]:
# 2 sentences, 13 token positions, 768 hidden features per position.
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 13, 768])


In [147]:
# Reload the same checkpoint through AutoModelForSequenceClassification. This
# rebinds `model` and `outputs`; the output below now carries `logits` with one
# row per sentence and two columns.
from transformers import AutoModelForSequenceClassification
# Note: `checkpoint` is a new name, distinct from the earlier `check_point`.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.0877,  2.1051],
        [-3.8907,  4.1575]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [148]:
outputs.logits

tensor([[-2.0877,  2.1051],
        [-3.8907,  4.1575]], grad_fn=<AddmmBackward0>)

In [149]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [150]:
# Turn the raw logits into probabilities with a softmax over the last dimension
# (the two class scores of each sentence).
import torch
import torch.nn as nn
import torch.nn.functional as F
predictions = F.softmax(outputs.logits, dim=-1)
predictions

tensor([[1.4880e-02, 9.8512e-01],
        [3.1956e-04, 9.9968e-01]], grad_fn=<SoftmaxBackward0>)

In [151]:
# Mapping from class index to label name, used to read the probability columns above.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [152]:
# Two single-sequence inputs (lengths 3 and 2) built from hand-written ids, plus a
# batch holding both, where the shorter row is filled to length 3 with the
# tokenizer's pad token id.
sequence_1_ids = [[200,200,200]]
sequence_2_ids = [[200,200]]
batch_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id],
]

In [153]:
# Logits for the length-3 sequence on its own (reference for the batched run below).
model(torch.tensor(sequence_1_ids)).logits

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)

In [154]:
# Logits for the length-2 sequence on its own (reference for the batched run below).
model(torch.tensor(sequence_2_ids)).logits

tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

In [155]:
# Mask with a 0 at the padded position of the second row. With this mask, the
# batched logits shown below match the logits obtained above for each sequence
# run on its own.
attention_mask = [
    [1,1,1],
    [1,1,0],
]
outputs = model(torch.tensor(batch_ids), attention_mask=torch.tensor(attention_mask))
outputs.logits

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

In [156]:
# Three sentences of different lengths, reused by the padding/truncation examples below.
sequences = ['I have been waiting for a this course my whole life','So have I!','I have played basketball yesterday.']

In [157]:
# padding='longest': every sequence is padded to the longest one in the batch
# (13 ids each in the output below).
model_inputs = tokenizer(sequences,padding='longest')
model_inputs

{'input_ids': [[101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 2026, 2878, 2166, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2031, 2209, 3455, 7483, 1012, 102, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}

In [158]:
# padding='max_length' with no max_length argument: the (truncated) output shows the
# first sequence followed by a long run of 0 ids — presumably padded up to the
# model's maximum length; TODO confirm the exact length.
model_inputs = tokenizer(sequences,padding='max_length')
model_inputs

{'input_ids': [[101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 2026, 2878, 2166, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [159]:
# padding='max_length' with max_length=8: shorter sequences are padded to 8 ids.
# Truncation is not requested, so the 13-id first sequence stays at 13 and the
# resulting rows have unequal lengths (see output).
model_inputs = tokenizer(sequences,padding='max_length',max_length=8)
model_inputs

{'input_ids': [[101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 2026, 2878, 2166, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0], [101, 1045, 2031, 2209, 3455, 7483, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [160]:
# truncation=True with max_length=10: the first sequence is cut to 10 ids. No padding
# is requested, so the shorter sequences keep their own lengths (6 and 8 ids).
model_inputs = tokenizer(sequences,max_length=10,truncation=True)
model_inputs

{'input_ids': [[101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 102], [101, 2061, 2031, 1045, 999, 102], [101, 1045, 2031, 2209, 3455, 7483, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [161]:
# padding=True together with return_tensors='pt': all rows are padded to a common
# length (13 here) and returned as PyTorch tensors.
model_inputs = tokenizer(sequences,padding=True,return_tensors='pt')
model_inputs

{'input_ids': tensor([[ 101, 1045, 2031, 2042, 3403, 2005, 1037, 2023, 2607, 2026, 2878, 2166,
          102],
        [ 101, 2061, 2031, 1045,  999,  102,    0,    0,    0,    0,    0,    0,
            0],
        [ 101, 1045, 2031, 2209, 3455, 7483, 1012,  102,    0,    0,    0,    0,
            0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

## 模型的加载与保存


In [162]:
import warnings
warnings.filterwarnings("ignore")
from transformers import BertConfig, BertModel

In [163]:
# Build a BertModel from a default BertConfig (the config is printed in the next cell).
# NOTE(review): no checkpoint is loaded here, so presumably the weights are freshly
# initialised rather than pretrained — confirm.
config = BertConfig()
model = BertModel(config)

In [164]:
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [165]:
# Rebind `model` to a BertModel loaded from the 'bert-base-cased' checkpoint.
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-cased')

In [166]:
# model.save_pretrained('bert-base-cased')

In [167]:
# Load the 'mrpc' configuration of the 'glue' dataset.
# NOTE: the recorded run of this cell ended in KeyboardInterrupt, so the cells that
# follow were not executed in the saved notebook and show no outputs.
import warnings
warnings.filterwarnings("ignore")
from datasets import load_dataset
raw_datasets = load_dataset('glue','mrpc')
raw_datasets

KeyboardInterrupt: 

In [None]:
raw_datasets

In [None]:
# Take the 'train' split and look at a single example (index 100).
raw_train_dataset = raw_datasets['train']
raw_train_dataset[100]

In [None]:
raw_train_dataset.features

In [None]:
# Switch to the 'bert-base-uncased' checkpoint; this rebinds both `checkpoint`
# and `tokenizer` for the remaining cells.
from transformers import AutoTokenizer
checkpoint ='bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize two sentences passed as separate arguments, i.e. as a sentence pair.
# Rebinds `inputs` from the earlier cells.
inputs = tokenizer('This is the first sentence.','This is the second one.')
inputs

In [None]:
# Map the ids back to token strings to inspect how the pair was encoded.
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

In [None]:
def tokenize_function(examples):
    """Tokenize the two sentence columns of ``examples`` as sentence pairs.

    Reads ``examples['sentence1']`` and ``examples['sentence2']`` and hands them
    to the notebook-level ``tokenizer`` with truncation enabled.

    Args:
        examples: Mapping that provides the keys ``'sentence1'`` and ``'sentence2'``.

    Returns:
        Whatever the ``tokenizer`` call returns for the pair of inputs.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [None]:
raw_datasets

In [None]:
# Apply tokenize_function over raw_datasets in batched mode.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
tokenized_datasets['train'][0]

In [None]:
# Slice the first 8 tokenized training examples.
# NOTE(review): `smples` looks like a misspelling of `samples`, but later cells refer
# to it by this name, so it is left unchanged here.
smples = tokenized_datasets['train'][:8]
# tokenized_datasets['train']
smples

In [None]:
# Scratch check: compare a 6-row slice of the training split against an 8-row slice.
tokenized_datasets['train'][:6] == tokenized_datasets['train'][:8]

In [None]:
# Keep every column of the slice except 'idx', 'sentence1' and 'sentence2'.
samples={k:v for k,v in smples.items() if k not in ['idx','sentence1','sentence2']}
samples

In [None]:
# Number of token ids in each of the sliced examples, before any padding is applied.
[len(x) for x in smples['input_ids']]

In [None]:
# Collator that pads a batch using `tokenizer`.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,padding=True)

In [None]:
# Collate the filtered `samples` dict rather than the raw `smples` slice.
# `smples` still carries the 'idx', 'sentence1' and 'sentence2' columns, and the
# sentence strings cannot be turned into tensors by the padding collator, so
# passing it raises an error. `samples` (built in an earlier cell) has those
# columns removed.
batch = data_collator(samples)
# Show the tensor shape of each field in the padded batch.
{k: v.shape for k, v in batch.items()}

In [None]:
# Training configuration with everything left at its defaults except the single
# positional argument 'test-trainer' — presumably the output directory; TODO confirm.
from transformers import TrainingArguments
training_args = TrainingArguments('test-trainer')

In [None]:
training_args

In [None]:
# Sequence-classification model loaded from `checkpoint` ('bert-base-uncased' at this
# point in the notebook) with num_labels=2; rebinds `model`.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation splits and the tokenizer.
# NOTE(review): the `data_collator` created in an earlier cell is not passed here;
# presumably Trainer falls back to a padding collator built from `tokenizer` — confirm.
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

In [None]:
torch.cuda.is_available()

In [None]:
trainer.train()

In [None]:
# Run prediction on the validation split and show the shapes of the predictions
# and of the label ids.
# Note: this rebinds `predictions`, which earlier in the notebook held the softmax tensor.
predictions = trainer.predict(tokenized_datasets['validation'])
predictions.predictions.shape,predictions.label_ids.shape

In [None]:
torch.__version__