# Error Handling for Sequences of Different Lengths:

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Checkpoint name shared by the tokenizer and the classification model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Two sentences that tokenize to different lengths (11 and 12 tokens).
input = [
    'Virat Kohli is the best cricketer in the world',
    'Rohit sharma has the highest individual score in ODI format cricket',
]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Tokenize each sentence by hand and map its tokens to vocabulary ids,
# without adding any padding.
tokens = [tokenizer.tokenize(seq) for seq in input]
ids = [tokenizer.convert_tokens_to_ids(seq_tokens) for seq_tokens in tokens]

# The id lists are ragged, so building a rectangular tensor from them raises
# "ValueError: expected sequence of length 11 at dim 1 (got 12)".
# That failure is the point of this cell; the prints below are not reached.
input_ids = torch.tensor(ids)
print('tokens:', tokens)
print('ids:', ids)
print('input_ids:', input_ids)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ValueError: expected sequence of length 11 at dim 1 (got 12)

# Handling the above error:
## By Padding:

In [None]:
# Show the ids of each sentence on its own, then the whole (ragged) list.
print('id_1:', ids[0])
print('id_2', ids[1])
print('ids:', ids)

id_1: [6819, 8609, 12849, 27766, 2003, 1996, 2190, 9490, 1999, 1996, 2088]
id_2 [20996, 16584, 14654, 2038, 1996, 3284, 3265, 3556, 1999, 21045, 4289, 4533]
ids: [[6819, 8609, 12849, 27766, 2003, 1996, 2190, 9490, 1999, 1996, 2088], [20996, 16584, 14654, 2038, 1996, 3284, 3265, 3556, 1999, 21045, 4289, 4533]]


In [None]:
# Token ids of the two sentences, copied from the output printed above.
first_seq = [6819, 8609, 12849, 27766, 2003, 1996, 2190, 9490, 1999, 1996, 2088]
second_seq = [20996, 16584, 14654, 2038, 1996, 3284, 3265, 3556, 1999, 21045, 4289, 4533]

# Each sentence on its own, as a batch of size one.
id_1 = torch.tensor([first_seq])
id_2 = torch.tensor([second_seq])

# Both sentences in a single batch: the shorter first row gets one trailing
# pad id so that both rows have the same length.
ids = torch.tensor([first_seq + [tokenizer.pad_token_id], second_seq])

print('id_1:', id_1)
print('id_2', id_2)
print('batch_ids:', ids)

id_1: tensor([[ 6819,  8609, 12849, 27766,  2003,  1996,  2190,  9490,  1999,  1996,
          2088]])
id_2 tensor([[20996, 16584, 14654,  2038,  1996,  3284,  3265,  3556,  1999, 21045,
          4289,  4533]])
batch_ids: tensor([[ 6819,  8609, 12849, 27766,  2003,  1996,  2190,  9490,  1999,  1996,
          2088,     0],
        [20996, 16584, 14654,  2038,  1996,  3284,  3265,  3556,  1999, 21045,
          4289,  4533]])


In [None]:
# Run the model on each sentence alone and then on the padded batch, printing
# the logits of every forward pass. No attention mask is supplied here.
for batch in (id_1, id_2, ids):
    print(model(batch).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[-3.9132,  4.3216]], grad_fn=<AddmmBackward0>)
tensor([[-2.4203,  3.0150]], grad_fn=<AddmmBackward0>)
tensor([[-3.8709,  4.2965],
        [-2.4203,  3.0150]], grad_fn=<AddmmBackward0>)


We can see from the output above that the logits for the sentence we padded (the first row of the batch) are not the same as the logits we got when that sentence was passed through the model on its own.

There’s something wrong with the logits in our batched predictions: the first row should be the same as the logits for the first sentence on its own, but we’ve got different values (`[-3.8709, 4.2965]` instead of `[-3.9132, 4.3216]`)!

This is because the key feature of Transformer models is attention layers that contextualize each token. These will take into account the padding tokens since they attend to all of the tokens of a sequence. To get the same result when passing individual sentences of different lengths through the model or when passing a batch with the same sentences and padding applied, we need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.

## By Attention Masking:

In [None]:
# One mask entry per position of `ids`: 1 = attend to the token, 0 = ignore it.
# Only the last position of the first row is padding, so only it is masked.
attention_mask = torch.tensor([
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
])

# `model` was already loaded from `checkpoint` in an earlier cell, so it is
# reused here instead of being loaded a second time.
print('Logits:', model(ids, attention_mask=attention_mask).logits)

Logits: tensor([[-3.9132,  4.3216],
        [-2.4203,  3.0150]], grad_fn=<AddmmBackward0>)


Now we get the same logits for the first sentence in the batch as when it was passed through the model on its own.

Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).

# Longer Sequences:
With Transformer models, there is a limit to the lengths of the sequences we can pass the models. Most models handle sequences of up to 512 or 1024 tokens, and will crash when asked to process longer sequences. There are two solutions to this problem:

- Use a model with a longer supported sequence length.
- Truncate your sequences.

# Putting It All Together:

## 1.Trying Padding and Truncation:

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name passed to both `from_pretrained` calls below.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Two example sentences of different lengths, used by the cells that follow.
input = [
    'Hi my name is Pavan sai',
    'i love virat kohli and he is my inspiration',
]

# Load the tokenizer and the sequence-classification model for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Pad every sentence up to the length of the longest sentence in the batch.
input_ids = tokenizer(
    input,
    padding='longest',
    return_tensors='pt',
)
output_ids = model(**input_ids)

print(output_ids.logits)

tensor([[-2.4683,  2.7601],
        [-4.1838,  4.5280]], grad_fn=<AddmmBackward0>)


In [None]:
# Pad every sentence up to the maximum length of the model
# (no explicit `max_length` is given here).
input_ids = tokenizer(
    input,
    padding='max_length',
    return_tensors='pt',
)
output_ids = model(**input_ids)
print(output_ids.logits)

tensor([[-2.4683,  2.7601],
        [-4.1838,  4.5280]], grad_fn=<AddmmBackward0>)


In [None]:
# Pad to an explicit `max_length` and enable truncation as well: sequences
# longer than `max_length` (4 here) are cut down to it, shorter ones are
# padded up to it.
input_ids = tokenizer(
    input,
    padding='max_length',
    max_length=4,
    truncation=True,
    return_tensors='pt',
)
output_ids = model(**input_ids)
print(output_ids.logits)

tensor([[-3.2258,  3.4330],
        [-4.3595,  4.7084]], grad_fn=<AddmmBackward0>)


## 2.Trying various tensor formats:

In [None]:
# return_tensors='tf' makes the tokenizer return TensorFlow tensors.
input_ids = tokenizer(
    input,
    padding=True,
    return_tensors='tf',
)
print(input_ids)

{'input_ids': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[  101,  7632,  2026,  2171,  2003,  6643,  6212, 18952,   102,
            0,     0,     0,     0],
       [  101,  1045,  2293,  6819,  8609, 12849, 27766,  1998,  2002,
         2003,  2026,  7780,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [None]:
# return_tensors='pt' makes the tokenizer return PyTorch tensors.
input_ids = tokenizer(
    input,
    padding=True,
    return_tensors='pt',
)
print(input_ids)

{'input_ids': tensor([[  101,  7632,  2026,  2171,  2003,  6643,  6212, 18952,   102,     0,
             0,     0,     0],
        [  101,  1045,  2293,  6819,  8609, 12849, 27766,  1998,  2002,  2003,
          2026,  7780,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# return_tensors='np' makes the tokenizer return NumPy arrays.
input_ids = tokenizer(
    input,
    padding=True,
    return_tensors='np',
)
print(input_ids)

{'input_ids': array([[  101,  7632,  2026,  2171,  2003,  6643,  6212, 18952,   102,
            0,     0,     0,     0],
       [  101,  1045,  2293,  6819,  8609, 12849, 27766,  1998,  2002,
         2003,  2026,  7780,   102]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


## 3.Special Tokens:

In [None]:
# Let the tokenizer handle the whole batch in one call, then decode the
# resulting ids back to text. The decoded strings include the special tokens
# ([CLS], [SEP], [PAD]) that the tokenizer call inserted.
model_inputs = tokenizer(input, padding=True, truncation=True, return_tensors='pt')
print('model_inputs:\n', tokenizer.batch_decode(model_inputs['input_ids']))

# Do the same steps by hand. First split every sentence into tokens ...
tokens = [tokenizer.tokenize(sentence) for sentence in input]
print('\n token_id:', tokens)

# ... then map each token list to its vocabulary ids ...
input_ids = [tokenizer.convert_tokens_to_ids(seq_tokens) for seq_tokens in tokens]
print('\ninput_ids:', input_ids)

# ... and decode every id list back to a string. No special tokens appear
# here because the manual path never added them.
print('\ndecoded text:', [tokenizer.decode(seq_ids) for seq_ids in input_ids])

model_inputs:
 ['[CLS] hi my name is pavan sai [SEP] [PAD] [PAD] [PAD] [PAD]', '[CLS] i love virat kohli and he is my inspiration [SEP]']

 token_id: [['hi', 'my', 'name', 'is', 'pa', '##van', 'sai'], ['i', 'love', 'vi', '##rat', 'ko', '##hli', 'and', 'he', 'is', 'my', 'inspiration']]

input_ids: [[7632, 2026, 2171, 2003, 6643, 6212, 18952], [1045, 2293, 6819, 8609, 12849, 27766, 1998, 2002, 2003, 2026, 7780]]

decoded text: ['hi my name is pavan sai', 'i love virat kohli and he is my inspiration']
