In [2]:
# Set the proxy environment variables to a local proxy at 127.0.0.1:7890.
# NOTE(review): machine-specific — adjust or remove when no proxy listens on that port.
%env ALL_PROXY=http://127.0.0.1:7890
%env HTTP_PROXY=http://127.0.0.1:7890
%env HTTPS_PROXY=http://127.0.0.1:7890

env: ALL_PROXY=http://127.0.0.1:7890
env: HTTP_PROXY=http://127.0.0.1:7890
env: HTTPS_PROXY=http://127.0.0.1:7890


In [3]:
# Point the Hugging Face Hub cache at a project-local directory.
# NOTE(review): presumably this has to be set before `transformers` is imported to take effect — confirm.
%env HF_HUB_CACHE=./data/hf_cache

env: HF_HUB_CACHE=./data/hf_cache


# Tokenizer's basic usage

In [1]:
from transformers import AutoTokenizer

In [34]:
# Sample text reused by the cells below.
sentence = "To be or not to be, this is a question."

## Step 01 : Loading and saving

In [5]:
# Load the tokenizer that belongs to a checkpoint by passing its Hugging Face Hub model id.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [6]:
# Write the tokenizer's files into a local directory; the written paths are displayed below.
tokenizer.save_pretrained(save_directory="./outs/temp_tokenizer")

('./outs/temp_tokenizer/tokenizer_config.json',
 './outs/temp_tokenizer/special_tokens_map.json',
 './outs/temp_tokenizer/vocab.txt',
 './outs/temp_tokenizer/added_tokens.json',
 './outs/temp_tokenizer/tokenizer.json')

In [7]:
# Reload the tokenizer from the directory it was just saved to (a path instead of a Hub id).
tokenizer = AutoTokenizer.from_pretrained(
    "./outs/temp_tokenizer"
)
tokenizer

DistilBertTokenizerFast(name_or_path='./outs/temp_tokenizer', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## Step 02 : Tokenizing the sentence

In [13]:
# Split the sentence into token strings; the result below is lower-cased and
# contains no [CLS]/[SEP] special tokens.
tokens = tokenizer.tokenize(sentence)
tokens

['to', 'be', 'or', 'not', 'to', 'be', ',', 'this', 'is', 'a', 'question', '.']

## Step 03 : Show the vocab

In [14]:
# The token -> id mapping as a dict.
# NOTE(review): this displays the whole vocabulary (vocab_size=30522); consider showing only a slice.
tokenizer.vocab

{'inaugural': 7725,
 '##ener': 24454,
 'bells': 10118,
 '[unused939]': 944,
 'orderly': 23589,
 '[unused33]': 34,
 'competed': 3879,
 'dil': 29454,
 'ale': 15669,
 'tai': 13843,
 'deprived': 17676,
 'skinned': 19937,
 'sidewalk': 11996,
 'elsie': 24603,
 'ァ': 1692,
 '##dm': 22117,
 '[unused598]': 603,
 '##imov': 25299,
 'realization': 12393,
 '##sell': 23836,
 'sparkled': 28092,
 'zu': 16950,
 '和': 1796,
 '疒': 1913,
 'deficit': 15074,
 'lyric': 13677,
 'nursery': 13640,
 'span': 8487,
 'citizen': 6926,
 'ং': 1346,
 'contributors': 16884,
 'themselves': 3209,
 'residue': 21755,
 '##int': 18447,
 '##tung': 21847,
 '157': 17403,
 'chords': 18495,
 'exploit': 18077,
 'auditioned': 23008,
 'famously': 18172,
 'strata': 22913,
 'muscular': 13472,
 'reese': 15883,
 'mph': 5601,
 'rectory': 24606,
 'adjunct': 20621,
 'serpent': 16517,
 'diego': 5277,
 '##ggs': 21314,
 'polished': 12853,
 'bidding': 17534,
 'limp': 14401,
 'refurbishment': 24478,
 'privacy': 9394,
 '##yle': 12844,
 'boundaries'

In [15]:
# Size of the vocabulary.
tokenizer.vocab_size

30522

## Step 04 : Converting between tokens, ids and strings

In [17]:
# Convert the token to token id
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[2000, 2022, 2030, 2025, 2000, 2022, 1010, 2023, 2003, 1037, 3160, 1012]

In [18]:
# Convert the token id to token
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens

['to', 'be', 'or', 'not', 'to', 'be', ',', 'this', 'is', 'a', 'question', '.']

In [19]:
# Convert the token to string
str_sentence = tokenizer.convert_tokens_to_string(tokens)
str_sentence

'to be or not to be, this is a question.'

A better way: `encode` and `decode` do the conversions above in a single call

In [20]:
# Convert the string to token ids
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 2000,
 2022,
 2030,
 2025,
 2000,
 2022,
 1010,
 2023,
 2003,
 1037,
 3160,
 1012,
 102]

In [21]:
# Convert the token ids to string
str_sentence = tokenizer.decode(token_ids)
str_sentence

'[CLS] to be or not to be, this is a question. [SEP]'

## Step 05 : Padding and truncation

In [25]:
# Padding
token_ids = tokenizer.encode(sentence, padding='max_length', max_length=24)
token_ids

[101,
 2000,
 2022,
 2030,
 2025,
 2000,
 2022,
 1010,
 2023,
 2003,
 1037,
 3160,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [26]:
# Truncation
token_ids = tokenizer.encode(sentence, truncation=True, max_length=8)
token_ids

[101, 2000, 2022, 2030, 2025, 2000, 2022, 102]

## Step 06 : Other inputs

In [27]:
# Build the auxiliary model inputs by hand for the current `token_ids`.
# Compare against the tokenizer's own pad id instead of a literal 0, so the mask
# stays correct for tokenizers whose padding token is not id 0 (here [PAD] is 0).
pad_token_id = tokenizer.pad_token_id
attention_mask = [0 if token_id == pad_token_id else 1 for token_id in token_ids]
# Single-sentence input: every position is assigned to segment 0.
token_type_ids = [0] * len(token_ids)
token_ids, attention_mask, token_type_ids

([101, 2000, 2022, 2030, 2025, 2000, 2022, 102],
 [1, 1, 1, 1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0])

## Step 07 : The quick way (calling the tokenizer directly)

In [28]:
# encode_plus returns input_ids and the matching attention_mask together in one call
inputs = tokenizer.encode_plus(sentence, padding='max_length', max_length=24)
inputs

{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2023, 2003, 1037, 3160, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [29]:
# Calling the tokenizer object directly gives the same result as encode_plus in the previous cell
inputs = tokenizer(sentence, padding='max_length', max_length=24)
inputs

{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2023, 2003, 1037, 3160, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

## Step 08 : Dealing with batch data

In [30]:
# A batch is a list of strings; each row of the result is padded to the same length (24).
sentences = [
    'To be or not to be, this is a question.',
    'This or this',
    'Who are you ?',
]
padding_options = {'padding': 'max_length', 'max_length': 24}

res = tokenizer(sentences, **padding_options)
res

{'input_ids': [[101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2023, 2003, 1037, 3160, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2023, 2030, 2023, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2040, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [38]:
%%time
# One tokenizer call per sentence, repeated 1,000 times
for _ in range(1_000):
    tokenizer(sentence)

CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 90.5 ms


In [44]:
%%time
# A single tokenizer call on a batch of 1,000 sentences
_ = tokenizer([sentence] * 1_000)

CPU times: user 137 ms, sys: 11.3 ms, total: 148 ms
Wall time: 17.4 ms


# Fast / Slow Tokenizer

In [45]:
# Redefines the same sample sentence used in the first part of the notebook.
sentence = "To be or not to be, this is a question."

In [46]:
# The fast tokenizer is what AutoTokenizer returns by default, so use_fast=True is optional here.
fast_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english", use_fast=True)
fast_tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [47]:
# use_fast=False requests the slow tokenizer class (is_fast=False in the repr below).
slow_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english", use_fast=False)
slow_tokenizer

DistilBertTokenizer(name_or_path='distilbert/distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [48]:
%%time
# Fast tokenizer, one call per sentence, 1,000 times
for _ in range(1_000):
    fast_tokenizer(sentence)

CPU times: user 95.8 ms, sys: 3.52 ms, total: 99.4 ms
Wall time: 97.8 ms


In [49]:
%%time
# Slow tokenizer, one call per sentence, 1,000 times
for _ in range(1_000):
    slow_tokenizer(sentence)

CPU times: user 258 ms, sys: 3.66 ms, total: 262 ms
Wall time: 261 ms


In [50]:
%%time
# Fast tokenizer, a single call on a batch of 1,000 sentences
_ = fast_tokenizer([sentence] * 1_000)

CPU times: user 151 ms, sys: 2.61 ms, total: 154 ms
Wall time: 16.7 ms


In [51]:
%%time
# Slow tokenizer, a single call on a batch of 1,000 sentences
_ = slow_tokenizer([sentence] * 1_000)

CPU times: user 240 ms, sys: 295 μs, total: 241 ms
Wall time: 239 ms


## Offset mapping

In [52]:
# return_offsets_mapping=True adds an 'offset_mapping' entry: one (start, end) character
# span into `sentence` per token; the special tokens are reported as (0, 0).
inputs = fast_tokenizer(sentence, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2023, 2003, 1037, 3160, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 2), (3, 5), (6, 8), (9, 12), (13, 15), (16, 18), (18, 19), (20, 24), (25, 27), (28, 29), (30, 38), (38, 39), (0, 0)]}

In [53]:
# Word index for every token position; the special tokens at both ends map to None.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, None]

In [54]:
# Offset mappings are not implemented by the slow (pure Python) tokenizer, so this
# call is expected to fail. Catch the error and show it, so the demonstration does
# not abort a "Restart & Run All" before the remaining sections execute.
try:
    inputs = slow_tokenizer(sentence, return_offsets_mapping=True)
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

# Loading a special tokenizer (one that requires custom code)

In [55]:
from transformers import AutoTokenizer

In [58]:
# This repository ships its own tokenizer implementation (tokenization_chatglm.py), so
# trust_remote_code=True is passed, which allows that downloaded code to be executed.
# NOTE(review): no `revision` is pinned; the download notice below recommends pinning one.
tokenizer = AutoTokenizer.from_pretrained('thu-coai/ShieldLM-6B-chatglm3', trust_remote_code=True)
tokenizer

A new version of the following files was downloaded from https://huggingface.co/thu-coai/ShieldLM-6B-chatglm3:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ChatGLMTokenizer(name_or_path='thu-coai/ShieldLM-6B-chatglm3', vocab_size=64798, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	
}
)

In [59]:
# Save the custom tokenizer to a local directory; the written paths are displayed below.
tokenizer.save_pretrained('./outs/special_tokenizer')

('./outs/special_tokenizer/tokenizer_config.json',
 './outs/special_tokenizer/special_tokens_map.json',
 './outs/special_tokenizer/tokenizer.model',
 './outs/special_tokenizer/added_tokens.json')

In [60]:
# Loading the saved custom tokenizer without trust_remote_code=True is expected to be
# refused. Catch the error and show it, so the demonstration does not abort a
# "Restart & Run All" before the final cell executes.
try:
    tokenizer = AutoTokenizer.from_pretrained('./outs/special_tokenizer')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The repository ./outs/special_tokenizer contains custom code which must be executed to correctly load the model. You can inspect the repository content at /home/ubuntu/MyFiles/GitHub/demos/transformers/outs/special_tokenizer .
 You can inspect the repository content at https://hf.co/./outs/special_tokenizer.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.

In [61]:
# The local copy also needs trust_remote_code=True, because it still relies on custom code.
tokenizer = AutoTokenizer.from_pretrained('./outs/special_tokenizer', trust_remote_code=True)