# Tokenizers (PyTorch)

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Naive word-level tokenization: break the sentence apart on whitespace.
sentence = "Jim Henson was a puppeteer"
tokenized_text = sentence.split()
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


In [None]:
from transformers import BertTokenizer

# Load the tokenizer of the `bert-base-cased` checkpoint via the BERT-specific class.
bert_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the `bert-base-cased` checkpoint via the generic AutoTokenizer class.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Calling the tokenizer directly on raw text returns input_ids, token_type_ids
# and attention_mask in one step (see the cell output).
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Save the tokenizer into a local directory; the returned tuple lists file paths
# under that directory (see the cell output).
tokenizer.save_pretrained("directory_on_my_computer")

('directory_on_my_computer/tokenizer_config.json',
 'directory_on_my_computer/special_tokens_map.json',
 'directory_on_my_computer/vocab.txt',
 'directory_on_my_computer/added_tokens.json',
 'directory_on_my_computer/tokenizer.json')

In [None]:
from transformers import AutoTokenizer

# Load the `bert-base-cased` tokenizer used for the step-by-step encoding below.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenization step: split the raw text into tokens. In the printed result,
# "Transformer" is broken into the pieces 'Trans' and '##former'.
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [None]:
# Convert each token string to its integer id. Unlike the direct tokenizer call,
# the printed list has no leading 101 / trailing 102 ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [None]:
# Decoding goes the other way: a list of ids is turned back into readable text.
token_ids = [7993, 170, 13809, 23763, 2443, 1110, 3014]
decoded_str = tokenizer.decode(token_ids)
print(decoded_str)

Using a Transformer network is simple


## From the video

In [19]:
from transformers import AutoTokenizer

In [20]:
# Example sentence that the following cells tokenize.
txt = "Let's try to tokenize!"

In [22]:
# One-call pipeline: load the uncased BERT tokenizer, encode the sentence,
# and print only the resulting ids.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(txt)
print(inputs["input_ids"])

[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]


In [23]:
# Tokenization step only: split the text into token strings (no ids yet).
tokens = tokenizer.tokenize(txt )
print(tokens)

['let', "'", 's', 'try', 'to', 'token', '##ize', '!']


In [25]:
# Same sentence through the `albert-base-v1` tokenizer: in the printed result,
# pieces carry a leading "▁" marker rather than BERT's "##" continuation prefix.
albert_checkpoint = "albert-base-v1"
tokenizer = AutoTokenizer.from_pretrained(albert_checkpoint)
tokens = tokenizer.tokenize(txt)
print(tokens)

['▁let', "'", 's', '▁try', '▁to', '▁to', 'ken', 'ize', '!']


In [27]:
# Switch back to the uncased BERT tokenizer and run the first two stages by hand:
# text -> tokens -> vocabulary ids.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokens = tokenizer.tokenize(txt)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[2292, 1005, 1055, 3046, 2000, 19204, 4697, 999]


In [29]:
# Turn the raw ids into model-ready inputs: in the printed result, input_ids gains
# 101 at the start and 102 at the end, and token_type_ids / attention_mask are added.
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs)

{'input_ids': [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [30]:
# Decode the ids produced by the earlier one-call `tokenizer(txt)` encoding;
# the special tokens appear as [CLS] / [SEP] in the decoded text.
decoded_text = tokenizer.decode(inputs["input_ids"])
print(decoded_text)

[CLS] let's try to tokenize! [SEP]
