## Handling multiple sequences

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
# Turn a raw sentence into the list of vocabulary ids that the model consumes.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the model are both loaded from the same checkpoint name.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two explicit steps: split the text into tokens, then map each token to its id.
sequence = "God is good God is great."
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:
# Show the token ids obtained for `sequence`.
print(ids)

[2643, 2003, 2204, 2643, 2003, 2307, 1012]


In [5]:
# Convert the flat list of ids into a tensor (no outer list, so no batch dimension yet).
input_ids = torch.tensor(ids)

In [6]:
# Feeding the un-batched tensor of ids to the model raises an IndexError.
# The failure is the point of this cell, so catch the expected error and print
# it; that way "Restart & Run All" can continue past this demonstration.
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: ignored

### This fails because the model expects a batch of sequences by default

In [8]:
# Add a leading batch dimension so the single sequence becomes a batch of one.
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDS:", input_ids)

output = model(input_ids=input_ids)
print("logits:", output.logits)

Input IDS: tensor([[2643, 2003, 2204, 2643, 2003, 2307, 1012]])
logits: tensor([[-2.3336,  2.4200]], grad_fn=<AddmmBackward0>)


In [9]:
# A batch holding the same sequence twice (both rows have the same length).
batched_ids = [ids, ids]

In [10]:
# Run the two-row batch through the model; each row gets its own pair of logits.
input_ids1 = torch.tensor(batched_ids)
print("Input IDS:", input_ids1)

output = model(input_ids=input_ids1)
print("logits:", output.logits)

Input IDS: tensor([[2643, 2003, 2204, 2643, 2003, 2307, 1012],
        [2643, 2003, 2204, 2643, 2003, 2307, 1012]])
logits: tensor([[-2.3336,  2.4200],
        [-2.3336,  2.4200]], grad_fn=<AddmmBackward0>)


In [11]:
# Padding: two sequences of different lengths, each as its own batch of one,
# plus a combined batch where the shorter one is extended with the pad token
# so that both rows have the same length.
sent1_ids = [[200, 200, 200]]
sent2_ids = [[200, 200]]
batched = [
    [*sent1_ids[0]],
    [*sent2_ids[0], tokenizer.pad_token_id],
]

In [12]:
# Logits for each sentence on its own, then for the padded batch (no attention mask yet).
for example_ids in (sent1_ids, sent2_ids, batched):
    print(model(torch.tensor(example_ids)).logits)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [13]:
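# The second row of the padded batch gives different logits than the same sentence
# run on its own. This is because the attention layers also take the padding token
# into account unless it is masked out.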

In [14]:
# Attention mask with the same shape as `batched`: 1 marks a real token,
# 0 marks the padding position that should not be attended to.
attention_mask = [[1, 1, 1], [1, 1, 0]]

# Forward pass on the padded batch together with its attention mask.
outputs = model(
    input_ids=torch.tensor(batched),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [16]:
# Try it out: tokenize two sentences of different lengths and map them to ids.
seq_1 = "I’ve been waiting for a HuggingFace course my whole life."
seq_2 = "I hate this so much!"
tokens_1, tokens_2 = (tokenizer.tokenize(text) for text in (seq_1, seq_2))
ids1, ids2 = (tokenizer.convert_tokens_to_ids(toks) for toks in (tokens_1, tokens_2))

In [20]:
# Number of ids in the first sentence.
len(ids1)

14

In [27]:
# Build the batch from the ids computed above instead of re-typing them by hand.
# Each sequence is right-padded with the tokenizer's pad id up to the length of
# the longest one, so all rows have the same length and the batch stays in sync
# with `seq_1` / `seq_2` if the sentences are edited.
pad_id = tokenizer.pad_token_id
longest = max(len(ids1), len(ids2))
batched1 = [seq + [pad_id] * (longest - len(seq)) for seq in (ids1, ids2)]

In [28]:
# Attention mask matching `batched1`: the first row is 14 real tokens,
# the second row is 6 real tokens followed by 8 padding positions.
attention_mask = [
    [1] * 14,
    [1] * 6 + [0] * 8,
]

# Forward pass on the padded two-sentence batch with its attention mask.
outputs = model(
    input_ids=torch.tensor(batched1),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)

tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)
