In [1]:
import torch
from torch.nn import functional
import transformers
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

from utils import jupyter_formatting

# Checkpoint name loaded by the tokenizer and model cells below.
MODEL_STR = "distilbert-base-uncased-finetuned-sst-2-english"

# Project helper from utils.jupyter_formatting (presumably configures how cell
# outputs are rendered -- see that module for details).
jupyter_formatting.setup_notebook_formatting()

# Basic pipelining

In [None]:
# Load the tokenizer for the MODEL_STR checkpoint up front so the same instance
# can be handed to the pipeline and reused by the manual tokenization cells.
# `clean_up_tokenization_spaces` is passed explicitly instead of relying on the
# library default.
# NOTE(review): presumably this silences the transformers warning about that
# default changing -- confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_STR, clean_up_tokenization_spaces=True)

In [None]:
# Prefer the Apple-silicon GPU backend (MPS) when PyTorch reports it as
# available; otherwise run on the CPU. Defined before the pipeline so the
# pipeline can actually be placed on it.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Build the sentiment-analysis pipeline from the shared MODEL_STR checkpoint
# (rather than repeating the name as a literal) so it cannot drift from the
# tokenizer and the manually loaded models, and run it on `device`.
classifier = transformers.pipeline(
    "sentiment-analysis",
    model=MODEL_STR,
    tokenizer=tokenizer,
    device=device,
)

# Classify two example sentences in one call; the cell displays the result.
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

In [14]:
# The same two sentences the pipeline cell classified, now pushed through the
# tokenizer by hand.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# padding=True pads the shorter sentence up to the longest one in the batch
# (the trailing zeros in the second row of the output, with matching zeros in
# the attention mask); return_tensors="pt" returns PyTorch tensors.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [31]:
# Load the checkpoint through the generic AutoModel class; the forward pass in
# the next cell reads `last_hidden_state` from it rather than class logits.
# NOTE: `model` is rebound to an AutoModelForSequenceClassification instance a
# couple of cells further down.
model = AutoModel.from_pretrained(MODEL_STR)

In [32]:
# Forward pass: the tokenizer's dict (input_ids, attention_mask) is unpacked
# into keyword arguments.
outputs = model(**inputs)
# Size of the final hidden states: (batch=2, tokens=16, hidden width=768).
outputs.last_hidden_state.size()

torch.Size([2, 16, 768])

In [33]:
# Reload the same checkpoint through AutoModelForSequenceClassification. This
# rebinds `model`, replacing the AutoModel instance loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_STR)
outputs = model(**inputs)
# The result now carries raw logits (one row per sentence, one column per
# label), not probabilities.
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Normalise the logits along the last (label) axis so that each row becomes a
# probability distribution over the labels.
predictions = outputs.logits.softmax(dim=-1)
predictions

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)

In [10]:
# The model's config holds the index -> label-name mapping needed to read the
# columns of the predictions (displayed below as 0 -> NEGATIVE, 1 -> POSITIVE).
model_config = model.config
model_config.id2label

{
    "0": "NEGATIVE",
    "1": "POSITIVE"
}

## Batching and padding

In [12]:
# Single example sentence used to build model inputs by hand in this section.
sequence = "I've been waiting for a HuggingFace course my whole life."

In [16]:
# Tokenize manually in two steps: split the sentence into tokens, then map each
# token to its vocabulary id. Unlike the direct `tokenizer(...)` call earlier,
# the resulting ids do not include the leading 101 / trailing 102 ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# A flat list of ids gives a 1-D tensor, i.e. a sequence without a batch axis.
input_ids = torch.tensor(ids)
# Left commented out on purpose -- passing the 1-D tensor to the model fails:
# model(input_ids)  <- this would fail:
# `IndexError: too many indices for tensor of dimension 1`

In [18]:
# Add a leading batch axis so the ids form a batch of one: (14,) -> (1, 14).
input_ids = torch.tensor(ids).unsqueeze(0)
input_ids

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])

In [20]:
# With the batch axis in place the forward pass succeeds and returns a single
# row of logits for the one sequence.
output = model(input_ids)
output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [21]:
# Two toy sequences of different lengths, each wrapped as a batch of one.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# Combine them into one rectangular batch: the longer sequence is copied as-is
# and the shorter one is right-padded with the tokenizer's pad token id.
batched_ids = [
    [*sequence1_ids[0]],
    [*sequence2_ids[0], tokenizer.pad_token_id],
]
batched_ids

[
    [
        200,
        200,
        200
    ],
    [
        200,
        200,
        0
    ]
]

In [25]:
# Print the logits for each single-sequence batch and then for the padded
# two-row batch. No attention mask is passed here, so the padded second row
# does not reproduce the logits of the unpadded second sequence.
for id_batch in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(id_batch)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [29]:
# 1 marks a real token, 0 marks the padding slot at the end of the second row.
attention_mask = [[1, 1, 1], [1, 1, 0]]

# Passing the mask alongside the padded batch makes the second row's logits
# match those obtained for the unpadded sequence on its own.
outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
outputs.logits

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)