In [1]:
from transformers import AutoTokenizer

In [30]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [31]:
# sequences = ["this is first sentence", "this is second sentence"]

sequences = ["cosentyx", "this is second sentence"]


In [32]:
tokens = tokenizer.tokenize(sequences)
tokens

['co', '##sen', '##ty', '##x', 'this', 'is', 'second', 'sentence']

In [33]:
#From tokens to input IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2522, 5054, 3723, 2595, 2023, 2003, 2117, 6251]

Decoding

In [34]:
#This can be done with the decode() method
decoded_string = tokenizer.decode(ids)
decoded_string

'cosentyx this is second sentence'

Handling multiple sequences

In [43]:
#Transformers models expect multiple sentences by default. 
sentences = [ 'this is first sentence','this is second sentence','this is third sentence','but this is the largest sentence of all']
print(sentences)


['this is first sentence', 'this is second sentence', 'this is third sentence', 'but this is the largest sentence of all']


In [53]:
tokens = [tokenizer.tokenize(sentence,padding="max_length",max_length=10) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

In [54]:
tokens

[['this',
  'is',
  'first',
  'sentence',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['this',
  'is',
  'second',
  'sentence',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['this',
  'is',
  'third',
  'sentence',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['but',
  'this',
  'is',
  'the',
  'largest',
  'sentence',
  'of',
  'all',
  '[PAD]',
  '[PAD]']]

In [55]:
ids

[[2023, 2003, 2034, 6251, 0, 0, 0, 0, 0, 0],
 [2023, 2003, 2117, 6251, 0, 0, 0, 0, 0, 0],
 [2023, 2003, 2353, 6251, 0, 0, 0, 0, 0, 0],
 [2021, 2023, 2003, 1996, 2922, 6251, 1997, 2035, 0, 0]]

In [56]:
import torch
input_ids = torch.tensor([ids])


In [57]:
input_ids

tensor([[[2023, 2003, 2034, 6251,    0,    0,    0,    0,    0,    0],
         [2023, 2003, 2117, 6251,    0,    0,    0,    0,    0,    0],
         [2023, 2003, 2353, 6251,    0,    0,    0,    0,    0,    0],
         [2021, 2023, 2003, 1996, 2922, 6251, 1997, 2035,    0,    0]]])

Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).