# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [10]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sqnc = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sqnc)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)

#this will fail
model(input_ids)

IndexError: ignored

In [13]:
tokenized_inputs = tokenizer(sqnc, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
print("Input IDs: ", input_ids)

output = model(input_ids)
print("Logits: ", output.logits)

Input IDs:  tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits:  tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [15]:
batched_ids = [
    [200, 200, 200],
    [200, 200]
]

In [16]:
padding_id = 100

batched_ids = [
    [200, 200, 200],
    [200, 200, padding_id],
]

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [22]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [26]:
sequence = sequence[:max_sequence_length]

NameError: ignored

In [32]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

s1 = "I’ve been waiting for a HuggingFace course my whole life."
s2 = "I hate this so much!"

tokens1 = tokenizer.tokenize(s1)
token_ids_1 = tokenizer.convert_tokens_to_ids(tokens1)

tokens2 = tokenizer.tokenize(s2)
token_ids_2 = tokenizer.convert_tokens_to_ids(tokens2)

# Pad either side respectively with the pad_token_id
batched_token_ids = [
    token_ids_1+[tokenizer.pad_token_id,]*max(0, (len(token_ids_2)-len(token_ids_1))),
    token_ids_2+[tokenizer.pad_token_id,]*max(0, (len(token_ids_1)-len(token_ids_2))),
]

token_attention_mask = [
    [1,]*len(token_ids_1)+[0,]*max(0, (len(token_ids_2)-len(token_ids_1))),
    [1,]*len(token_ids_2)+[0,]*max(0, (len(token_ids_1)-len(token_ids_2))),
]

In [33]:
print("\n\n\n... GENERAL INFO SEQUENCE 1 ...\n")
print(f"\n\tSEQUENCE ONE STRING\n\t\t--> '{s1}'")
print(f"\n\tSEQUENCE ONE TOKENS\n\t\t--> {tokens1}")
print(f"\n\tSEQUENCE ONE INPUT IDS\n\t\t--> {token_ids_1}\n")

print("\n\n... GENERAL INFO SEQUENCE 2 ...\n")
print(f"\n\tSEQUENCE TWO STRING\n\t\t--> '{s2}'")
print(f"\n\tSEQUENCE TWO TOKENS\n\t\t--> {tokens2}")
print(f"\n\tSEQUENCE TWO INPUT IDS\n\t\t--> {token_ids_2}\n")

print("\n\n... GENERAL INFO BATCHED SEQUENCES ...\n")
print(f"\n\tBATCHED SEQUENCES\n\t\t--> {[s1, s2]}")
print(f"\n\tBATCHED TOKENS\n\t\t--> {[tokens1, tokens2]}")
print(f"\n\tBATCHED & PADDED INPUT IDS\n\t\t--> {batched_token_ids}")
print(f"\n\tATTENTION MASK FOR MODEL INPUT\n\t\t--> {token_attention_mask}\n")




... GENERAL INFO SEQUENCE 1 ...


	SEQUENCE ONE STRING
		--> 'I’ve been waiting for a HuggingFace course my whole life.'

	SEQUENCE ONE TOKENS
		--> ['i', '’', 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']

	SEQUENCE ONE INPUT IDS
		--> [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]



... GENERAL INFO SEQUENCE 2 ...


	SEQUENCE TWO STRING
		--> 'I hate this so much!'

	SEQUENCE TWO TOKENS
		--> ['i', 'hate', 'this', 'so', 'much', '!']

	SEQUENCE TWO INPUT IDS
		--> [1045, 5223, 2023, 2061, 2172, 999]



... GENERAL INFO BATCHED SEQUENCES ...


	BATCHED SEQUENCES
		--> ['I’ve been waiting for a HuggingFace course my whole life.', 'I hate this so much!']

	BATCHED TOKENS
		--> [['i', '’', 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.'], ['i', 'hate', 'this', 'so', 'much', '!']]

	BATCHED & PADDED INPUT IDS
		--> [[1045, 1521, 2310, 2042, 3403, 2005

In [35]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
unbatched_output_1 = model(torch.tensor([token_ids_1])).logits
unbatched_output_2 = model(torch.tensor([token_ids_2])).logits

batched_output = model(torch.tensor(batched_token_ids), attention_mask=torch.tensor(token_attention_mask)).logits

print("\n\n... PYTORCH MODEL ...\n")
print(f"\n\tSEQUENCE ONE UNBATCHED - OUTPUT LOGITS\n\t\t--> {unbatched_output_1}")
print(f"\n\tSEQUENCE TWO UNBATCHED - OUTPUT LOGITS\n\t\t--> {unbatched_output_2}")
print(f"\n\tSEQUENCES BATCHED - OUTPUT LOGITS\n\t\t--> {batched_output}\n")



... PYTORCH MODEL ...


	SEQUENCE ONE UNBATCHED - OUTPUT LOGITS
		--> tensor([[-2.5720,  2.6852]], grad_fn=<AddmmBackward0>)

	SEQUENCE TWO UNBATCHED - OUTPUT LOGITS
		--> tensor([[ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)

	SEQUENCES BATCHED - OUTPUT LOGITS
		--> tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)

