In [36]:
from transformers import AutoTokenizer, BertModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")

encoded_input = tokenizer("Hello, I'm a single sentence!")
print(encoded_input)

{'input_ids': [101, 8667, 117, 146, 112, 182, 170, 1423, 5650, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [37]:
tokenizer.decode(encoded_input["input_ids"])

"[CLS] Hello, I ' m a single sentence! [SEP]"

In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoded_input = tokenizer("How are you?", "I'm fine, thank you!", return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,  146,  112,  182, 2503,  117, 6243,
         1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


**Padding inputs**

If we ask the tokenizer to pad the inputs, it will make all sentences the same length by adding a special padding token to the sentences that are shorter than the longest one:

In [14]:
encoded_input = tokenizer(["How are you?", "I'm fine, thank you!"],padding=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  136,  102,    0,    0,    0,    0],
        [ 101,  146,  112,  182, 2503,  117, 6243, 1128,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


The tensors might get too big to be processed by the model. For instance, BERT was only pretrained with sequences up to 512 tokens, so it cannot process longer sequences. If you have sequences longer than the model can handle, you’ll need to truncate them with the truncation parameter:

In [18]:
encoded_input = tokenizer(
    "This is a very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very very long sentence.",
    truncation=True,
)
print(encoded_input["input_ids"])

[101, 1188, 1110, 170, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1304, 1263, 5650, 119, 102]


By combining the padding and truncation arguments, you can make sure your tensors have the exact size you need:

In [21]:
encoded_input = tokenizer(
    ["How are you?", "I'm fine, thank you!"],
    padding=True,
    truncation=True,
    max_length=5,
    return_tensors="pt",
)
print(encoded_input)

{'input_ids': tensor([[ 101, 1731, 1132, 1128,  102],
        [ 101,  146,  112,  182,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}


In [22]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

In [25]:
encoded_sequences = [
    [
        101,
        1045,
        1005,
        2310,
        2042,
        3403,
        2005,
        1037
    ],
    [101, 1045, 5223, 2023, 2061, 2172, 999, 102],
]

In [35]:
import torch

model_inputs = torch.tensor(encoded_sequences)
output = model(model_inputs)
print(output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0540,  0.0534,  0.1497,  ..., -0.4506,  0.4883, -0.0358],
         [-0.2622, -0.1387,  0.6599,  ..., -0.3344,  0.6585,  0.0504],
         [-0.1086, -0.3388,  0.6278,  ..., -0.3018,  0.6292,  0.0722],
         ...,
         [-0.2101,  0.2066,  0.2913,  ..., -0.4580,  0.1699,  0.2519],
         [-0.2638,  0.1462,  0.4753,  ..., -0.7680,  0.3537,  0.0579],
         [-0.2006, -0.2844,  0.9313,  ..., -0.5717,  0.7596,  0.3495]],

        [[-0.0725,  0.0540, -0.0037,  ...,  0.1450,  0.2381, -0.0164],
         [-0.1619, -0.3062, -0.2282,  ...,  0.3782, -0.1170,  0.1295],
         [-0.1173, -0.1003,  0.1703,  ...,  0.3227, -0.1996,  0.1646],
         ...,
         [-0.1858,  0.0967,  0.4599,  ...,  0.3619,  0.1154,  0.1746],
         [ 0.0969, -0.2961,  0.0700,  ..., -0.0877, -0.1499,  0.1670],
         [ 0.2359, -0.3709, -0.1987,  ...,  0.6541,  0.7405, -0.4106]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou