<a href="https://colab.research.google.com/github/SSRavipati/LLM-course/blob/main/chapter_2/Handling_Multiple_sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Handling multiple sequences**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The single sentence we want the classifier to score.
sequence = "first gear low speed"

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so the vocabulary ids line up with the model's embeddings.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Do the tokenizer's work by hand: text -> tokens -> vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# A flat list of ids becomes a 1-D tensor: there is no batch dimension.
input_ids = torch.tensor(ids)

# Expected to fail: the model is fed a single 1-D sequence here instead of
# a batch of sequences (compare with the 2-D tensor built later on).
model(input_ids)

The code above fails because the transformer model expects a batch of sequences (a 2D tensor), whereas we passed it a single sequence as a 1D tensor.

In [None]:
# Call the tokenizer directly and ask for PyTorch tensors, so its output
# can be unpacked straight into the model as keyword arguments.
inputs = tokenizer(
    sequence,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

# Left as the last expression so the notebook displays the model output.
model(**inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

This time the code ran without errors. From this we can gather that, along with tokenizing and encoding the text, the tokenizer also adds a batch dimension.

Let’s try again and add a new dimension:


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The sentence to classify.
sequence = "first gear is low speed"

# Tokenizer and model come from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Manual tokenization: text -> tokens -> vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a leading batch dimension so the tensor is 2-D with shape
# (1, number_of_tokens) rather than a flat 1-D vector.
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

# With the batch dimension in place the forward pass succeeds.
output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[2034, 6718, 2003, 2659, 3177]])
Logits: tensor([[-0.6330,  0.6272]], grad_fn=<AddmmBackward0>)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two sentences of different lengths, to be sent through the model as one batch.
sentences = [
    "I’ve been waiting for this my whole life.",
    "I hate this so much!"
]

# Tokenize each sentence by hand and map its tokens to vocabulary ids.
ids = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    for text in sentences
]

# Every row of the batch is padded up to the longest sequence.
max_len = max(map(len, ids))

# Use the tokenizer's own padding id, falling back to 0 if it defines none.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = 0
    print(f"Warning: Tokenizer does not have a defined pad_token_id. Using {pad_token_id} for padding.")

# Right-pad each id list with the padding id until it reaches max_len.
padded_ids = [
    seq + [pad_token_id] * (max_len - len(seq))
    for seq in ids
]

# Matching masks: 1 marks an original token, 0 marks a padding position.
attention_masks = [
    [1] * len(seq) + [0] * (max_len - len(seq))
    for seq in ids
]

# All rows now share the same length, so they stack into 2-D tensors.
input_ids = torch.tensor(padded_ids)
attention_mask = torch.tensor(attention_masks)

print("Input IDs (padded):", input_ids)
print("Attention Mask:", attention_mask)

# Pass the ids together with the mask that flags the padding positions.
output = model(input_ids=input_ids, attention_mask=attention_mask)

print("Logits:", output.logits)

Input IDs (padded): tensor([[1045, 1521, 2310, 2042, 3403, 2005, 2023, 2026, 2878, 2166, 1012],
        [1045, 5223, 2023, 2061, 2172,  999,    0,    0,    0,    0,    0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
Logits: tensor([[-3.2906,  3.4655],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)
