In [None]:
!pip install transformers



In [None]:
from transformers import pipeline

In [None]:
# No model is named, so the pipeline uses its default sentiment checkpoint
# (noted by the author as "distilbert-base-uncased-finetuned-sst-2-english").
classifier = pipeline(task="sentiment-analysis")

In [None]:
# Classify one sentence; the cell's last expression is displayed as its output.
sentence = "We are very happy to show you the 🤗 Transformers library."
classifier(sentence)

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [None]:
# Classify several sentences in one call and print one line per returned result.
sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
results = classifier(sentences)
for prediction in results:
    label = prediction["label"]
    score = round(prediction["score"], 4)
    print(f"label: {label}, with score: {score}")

label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309


In [None]:
# Pick a specific checkpoint by name instead of relying on the task default.
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
classifier = pipeline(task="sentiment-analysis", model=checkpoint)

In [None]:
# Same sentence as before, now scored by the multilingual star-rating checkpoint.
sentence = "We are very happy to show you the 🤗 Transformers library."
classifier(sentence)

[{'label': '5 stars', 'score': 0.7725350260734558}]

In [None]:
# Classify two sentences at once and print the label and rounded score of each.
sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "I am so sad.",
]
results = classifier(sentences)
for prediction in results:
    label = prediction["label"]
    score = round(prediction["score"], 4)
    print(f"label: {label}, with score: {score}")

label: 5 stars, with score: 0.7725
label: 1 star, with score: 0.4826


You can also replace that name with a local folder where you have saved a pretrained model (see below). You can also pass a model object and its associated tokenizer.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the tokenizer and the model explicitly from the same checkpoint name,
# then hand both objects straight to the pipeline.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# First a single sentence, then a batch of two, using the pipeline built from
# the explicit model and tokenizer objects.
happy_sentence = "We are very happy to show you the 🤗 Transformers library."
print(classifier(happy_sentence))

results = classifier([happy_sentence, "I am so sad."])
for prediction in results:
    label = prediction["label"]
    score = round(prediction["score"], 4)
    print(f"label: {label}, with score: {score}")

[{'label': '5 stars', 'score': 0.7725350260734558}]
label: 5 stars, with score: 0.7725
label: 1 star, with score: 0.4826


## Under the hood: pretrained models

### 1 Using the tokenizer
用模型的名字实例化一个 tokenizer，确保使用了和预训练时同样的规则。

分词 -> 根据 vocab 转换

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and sequence-classification model are both created from the same
# checkpoint name, so the tokenization rules match the ones the model expects.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Tokenize one sentence and print the resulting encoding.
sentence = "We are very happy to show you the 🤗 Transformers library."
inputs = tokenizer(sentence)
print(inputs)

{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


直接传递一个句子列表，将它们填充到相同的长度，或截断到模型的最大长度

In [None]:
# Tokenize two sentences together, with padding and truncation enabled,
# and ask for PyTorch tensors ("pt") as the return type.
batch_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
pt_batch = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Show each entry of the batch as a plain nested list.
for field_name, tensor in pt_batch.items():
    values = tensor.numpy().tolist()
    print(f"{field_name}:{values}")

input_ids:[[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
attention_mask:[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]


### 2 Using the model

In [None]:
# Forward pass: the tokenized batch is unpacked into keyword arguments of the model call.
pt_outputs = pt_model(**pt_batch)

In [None]:
# In the 🤗 Transformers version that produced the output shown here, model outputs
# are tuples (potentially with only one element).
# NOTE(review): newer releases may return a different output type — confirm for the installed version.
print(pt_outputs)

(tensor([[-4.0833,  4.3364],
        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)


In [None]:
# Apply a softmax over the last dimension of the model's first output element
# to turn the final activations into probabilities.
import torch.nn.functional as F

final_activations = pt_outputs[0]
pt_predictions = F.softmax(final_activations, dim=-1)
print(pt_predictions)

tensor([[2.2043e-04, 9.9978e-01],
        [5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)


In [None]:
# When labels are passed along with the batch, the model returns the loss
# followed by the final activations.
import torch

labels = torch.tensor([1, 0])
pt_outputs = pt_model(**pt_batch, labels=labels)
print(pt_outputs)

(tensor(0.3167, grad_fn=<NllLossBackward>), tensor([[-4.0833,  4.3364],
        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>))


In [None]:
# Training tutorial: https://huggingface.co/transformers/training.html
#
# Save the tokenizer together with the model it was loaded alongside (`pt_model`).
# `save_directory` has to be defined before use, and `model` at this point refers to
# the multilingual BERT checkpoint loaded earlier, which does not match this tokenizer.
save_directory = "./saved_model"
tokenizer.save_pretrained(save_directory)
pt_model.save_pretrained(save_directory)

加载模型的另一种方式

In [None]:
# Reload the tokenizer and the model from the directory written by `save_pretrained`.
# `AutoModel` is imported here because no earlier cell brings it into scope.
# `from_tf=True` is only needed when the directory holds TensorFlow weights; this
# notebook saves a PyTorch model, so the flag is omitted.
from transformers import AutoModel

tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModel.from_pretrained(save_directory)

NameError: ignored

In [None]:
# Ask the model to also return all hidden states and all attention weights;
# they are taken from the last two positions of the output.
pt_outputs = pt_model(**pt_batch, output_attentions=True, output_hidden_states=True)
all_hidden_states = pt_outputs[-2]
all_attentions = pt_outputs[-1]

In [None]:
# Printing every hidden-state tensor in full floods the cell output, so report
# how many tensors were returned and the shape of each one instead.
print(f"number of hidden-state tensors: {len(all_hidden_states)}")
for layer_index, hidden_state in enumerate(all_hidden_states):
    print(f"  [{layer_index}] shape: {tuple(hidden_state.shape)}")

## Accessing the code
This is how you would directly instantiate model and tokenizer without the auto magic:

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# Same checkpoint as before, loaded through the concrete DistilBERT classes
# rather than the Auto* classes.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

### Customizing the model


In [None]:
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
# Build the classifier from a custom configuration (8 heads, width 512, feed-forward
# size 4x the width). The model is created from the config alone, so no checkpoint
# weights are loaded here; only the tokenizer comes from a pretrained name.
hidden_size = 512
config = DistilBertConfig(n_heads=8, dim=hidden_size, hidden_dim=4 * hidden_size)
model = DistilBertForSequenceClassification(config)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
# Load the pretrained checkpoint but request a 10-label classification model;
# the tokenizer is loaded from the same checkpoint name.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi