In [1]:
from transformers import AutoTokenizer 

# Load the pretrained uncased BERT tokenizer, using ./cache as the cache directory.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir="./cache",
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Encode two strings as a single sentence pair: [CLS] first [SEP] second [SEP].
# The second string is passed explicitly through `text_pair` instead of packing
# both into a two-element list, so the call does not depend on how a bare list
# of strings happens to be interpreted by the tokenizer backend.
val = tokenizer.encode(
    "Pragateesh",
    text_pair="IronMan",
    return_tensors="pt",  # "pt" -> PyTorch tensors; use "tf" for TensorFlow
)
# Show the full (1, n) batch and its only row.
val, val[0]

(tensor([[  101, 10975, 16098, 28313,  2232,   102,  3707,  2386,   102]]),
 tensor([  101, 10975, 16098, 28313,  2232,   102,  3707,  2386,   102]))

In [4]:
# Turn the first row of ids back into text; the [CLS]/[SEP] markers are kept.
val2 = tokenizer.decode(val[0], skip_special_tokens=False)
val2

'[CLS] pragateesh [SEP] ironman [SEP]'

In [None]:
# Register two extra special tokens, then read them back through the
# tokenizer's `additional_special_tokens` attribute.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["pragateesh1", "pragateesh2"]}
)
print(tokenizer.additional_special_tokens)

['pragateesh1', 'pragateesh2']


In [7]:
# Break the text into tokens: whole words, punctuation, and sub-word pieces
# (continuation pieces carry a "##" prefix).
tokens = tokenizer.tokenize("Hello, Pragateesh!")
print(tokens)

['hello', ',', 'pr', '##aga', '##tees', '##h', '!']


In [7]:
# Look up the numerical vocabulary id for each token in `tokens`.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7592, 1010, 10975, 16098, 28313, 2232, 999]


In [8]:
# Reverse lookup: map each numerical id in `ids` back to its token string.
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens)

['hello', ',', 'pr', '##aga', '##tees', '##h', '!']


In [9]:
# Collapse the id sequence into one readable string (sub-word pieces are merged).
text = tokenizer.decode(ids)
print(text)


hello, pragateesh!


In [10]:
# Show the mapping from special-token role (unk, sep, pad, ...) to token string.
print(tokenizer.special_tokens_map)

{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['pragateesh1', 'pragateesh2']}


In [11]:
# Register a new pair of additional special tokens and print the resulting list.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<pragateesh1>", "<pragateesh2>"]}
)
print(tokenizer.additional_special_tokens)

['<pragateesh1>', '<pragateesh2>']


In [12]:
# NOTE(review): this assignment does not update the tokenizer — the map printed
# by the next line still lists '<pragateesh1>' / '<pragateesh2>'. Presumably
# `special_tokens_map` hands back a new dict on every access (confirm); use
# `tokenizer.add_special_tokens(...)` to actually change the special tokens.
tokenizer.special_tokens_map['additional_special_tokens'] = ['<pragateesh3>', '<pragateesh4>']
print(tokenizer.special_tokens_map)

{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['<pragateesh1>', '<pragateesh2>']}


In [13]:
# List every special token string, followed by the matching numerical ids.
print(tokenizer.all_special_tokens)
print(tokenizer.all_special_ids)


['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]', '<pragateesh1>', '<pragateesh2>']
[100, 102, 0, 101, 103, 30524, 30525]


In [14]:
# Encode one sentence and pad it out to a fixed length of 10 positions;
# the padded positions get attention_mask 0.
encoded = tokenizer(
    "Hello, world!",
    max_length=10,
    padding="max_length",
)
print(encoded)

{'input_ids': [101, 7592, 1010, 2088, 999, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}


In [15]:
# Encode a sentence that is too long and cut it down to at most 5 ids
# (the special tokens count towards that limit).
encoded = tokenizer(
    "Hello, this is a very long sentence.",
    max_length=5,
    truncation=True,
)
print(encoded)

{'input_ids': [101, 7592, 1010, 2023, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}


In [11]:
# Encode two sentences together; padding=True pads the shorter one up to the
# length of the longest sequence in the batch.
batch = tokenizer(
    ["Hello!", "This is longer"],
    padding=True,
)
print(batch)

{'input_ids': [[101, 7592, 999, 102, 0], [101, 2023, 2003, 2936, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]]}


In [14]:
# Ask for PyTorch tensors ("pt") instead of plain Python lists.
encoded = tokenizer(
    "Hello, world!",
    return_tensors="pt",
)
print(encoded)

{'input_ids': tensor([[ 101, 7592, 1010, 2088,  999,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [18]:
# Encode with the special tokens switched on: the ids are wrapped in
# [CLS] (101) at the start and [SEP] (102) at the end.
encoded = tokenizer(
    "Hello",
    add_special_tokens=True,
)
print(encoded)

{'input_ids': [101, 7592, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}


In [19]:
# Set a new padding token. Going through add_special_tokens (rather than plain
# attribute assignment) also adds "<pad>" to the vocabulary when it is missing,
# so the token has a real id of its own when padding is applied later on.
tokenizer.add_special_tokens({"pad_token": "<pad>"})
print(tokenizer.pad_token)

<pad>


In [20]:
# Set a new unknown token. Going through add_special_tokens (rather than plain
# attribute assignment) also adds "<unk>" to the vocabulary when it is missing,
# so the token has a real id of its own instead of being an unregistered string.
tokenizer.add_special_tokens({"unk_token": "<unk>"})
print(tokenizer.unk_token)

<unk>


In [21]:
# NOTE(review): the loaded tokenizer already lower-cases its input (earlier
# outputs show "Hello" tokenized as 'hello'). Whether assigning this attribute
# after loading changes how text is normalized is not demonstrated here —
# confirm before relying on it.
tokenizer.do_lower_case = True
print(tokenizer.do_lower_case)
# Sets the do_lower_case attribute and prints it back.

True


In [22]:
# Inspect the vocabulary without flooding the output: the full mapping has
# tens of thousands of entries, so print its size and only the ten lowest ids.
vocab = tokenizer.get_vocab()  # token string -> integer id
print(len(vocab))
print(sorted(vocab.items(), key=lambda item: item[1])[:10])



In [23]:
# A single token string (not a list) yields a single integer id.
print(tokenizer.convert_tokens_to_ids("hello"))

7592


In [24]:
# A single integer id yields a single token string; 101 is BERT's [CLS].
print(tokenizer.convert_ids_to_tokens(101))

[CLS]


In [None]:
# Batch-encode two sentences as PyTorch tensors, padding the shorter one up to
# the length of the longest sequence.
encoded = tokenizer(
    ["Hello", "How are you?"],
    return_tensors="pt",
    padding=True,
)
print(encoded)

In [None]:
# Explicitly request the attention mask and print just that field.
encoded = tokenizer(
    "Hello",
    return_attention_mask=True,
)
print(encoded["attention_mask"])

[1, 1, 1]


In [None]:
# Reload the tokenizer from scratch, explicitly asking for the fast
# implementation. The fresh instance carries only the stock special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir="./cache",
    use_fast=True,
)
print(tokenizer)

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [None]:
# Request character offsets: for every token, the (start, end) positions it
# covers in the original string. Special tokens show up as (0, 0).
encoded = tokenizer(
    "Hello world!",
    return_offsets_mapping=True,
)
print(encoded["offset_mapping"])

[(0, 0), (0, 5), (6, 11), (11, 12), (0, 0)]


In [None]:
# The input is already split into words: with is_split_into_words=True the
# list is encoded as ONE sequence of words, not as a batch of two sentences.
encoded = tokenizer(
    ["Hello", "world!"],
    is_split_into_words=True,
)
print(encoded)

{'input_ids': [101, 7592, 2088, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
