### Install required library

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer

In [None]:
# Sample sentence to tokenize; it contains an apostrophe, which the tokenizer
# splits into separate tokens (see the decoded output further down)
sentence = "It's a good day"

In [None]:
# Load the pretrained tokenizer registered under the name "bert-base-cased";
# its files (tokenizer config, model config, vocabulary) are downloaded, as the cell output shows
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize the sentence; `.input_ids` on the result holds the integer token ids
token_ids = tokenizer(sentence).input_ids

In [None]:
# Print the raw ids; the first and last ids decode to the [CLS] and [SEP] tokens
print(token_ids)

[101, 1135, 112, 188, 170, 1363, 1285, 102]


To map each token ID to its corresponding token, we will use the `decode` method of the tokenizer.

In [None]:
# Decode one id at a time so that each token is printed on its own line.
# The loop variable is named `token_id` rather than `id` so the builtin `id()`
# is not shadowed in the notebook's global namespace after this cell runs.
for token_id in token_ids:
    print(tokenizer.decode(token_id))

[CLS]
It
'
s
a
good
day
[SEP]


## Visualizing Tokenization

In this section, you'll wrap the code of the previous section in the function `show_tokens_color`. The function takes a text and a tokenizer (model) name, and prints the vocabulary length of the tokenizer followed by a colored list of the tokens.

In [None]:
# Background colors, each an "R;G;B" string, cycled through so that
# neighbouring tokens are visually distinguishable
colors = [
    '102;194;165',
    '252;141;98',
    '141;160;203',
    '231;138;195',
    '166;216;84',
    '255;217;47',
]

def show_tokens_color(sentence: str, tokenizer_name: str):
    """Print a tokenizer's vocabulary length, then the tokens of `sentence` on colored backgrounds.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Name handed to `AutoTokenizer.from_pretrained`.

    Returns:
        None. Everything is written to standard output; the tokens are
        separated by single spaces and no trailing newline is printed.
    """
    # Fetch the tokenizer and turn the text into token ids
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    encoded_ids = tok(sentence).input_ids

    # Report the vocabulary length
    print(f"Vocab length: {len(tok)}")

    # Escape sequence that restores the default terminal style after each token
    reset = '\x1b[0m'

    for position, token_id in enumerate(encoded_ids):
        # Wrap around the palette once every color has been used
        rgb = colors[position % len(colors)]
        # ANSI SGR sequence: black foreground (30) on a 24-bit background (48;2;R;G;B)
        highlight = f'\x1b[0;30;48;2;{rgb}m'
        print(highlight + tok.decode(token_id) + reset, end=' ')

#### Tokens without color

In [None]:
def show_tokens_without_color(sentence: str, tokenizer_name: str):
    """Print a tokenizer's vocabulary length, then the tokens of `sentence` as plain text.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Name handed to `AutoTokenizer.from_pretrained`.

    Returns:
        None. Everything is written to standard output; the tokens are
        separated by single spaces and no trailing newline is printed.
    """
    # Fetch the tokenizer and turn the text into token ids
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    encoded_ids = tok(sentence).input_ids

    # Report the vocabulary length
    print(f"Vocab length: {len(tok)}")

    # Decode lazily, one id at a time, printing each token followed by a space
    for piece in map(tok.decode, encoded_ids):
        print(piece, end=' ')


Here's the text that you'll use to explore the different tokenization strategies of each model.

In [None]:
# Sample text used to compare tokenizers: mixed-case English, an emoji and a CJK
# character, Python-like keywords and operators, quoted whitespace runs, and an
# arithmetic expression
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""

We will use the tokenizer of `bert-base-cased` and compare its tokenization strategy to that of other models.

**bert-base-cased**

In [None]:
# Tokenize `text` with bert-base-cased and print its tokens on colored backgrounds
show_tokens_color(text, "bert-base-cased")

Vocab length: 28996
[0;30;48;2;102;194;165m[CLS][0m [0;30;48;2;252;141;98mEnglish[0m [0;30;48;2;141;160;203mand[0m [0;30;48;2;231;138;195mCA[0m [0;30;48;2;166;216;84m##PI[0m [0;30;48;2;255;217;47m##TA[0m [0;30;48;2;102;194;165m##L[0m [0;30;48;2;252;141;98m##I[0m [0;30;48;2;141;160;203m##Z[0m [0;30;48;2;231;138;195m##AT[0m [0;30;48;2;166;216;84m##ION[0m [0;30;48;2;255;217;47m[UNK][0m [0;30;48;2;102;194;165m[UNK][0m [0;30;48;2;252;141;98mshow[0m [0;30;48;2;141;160;203m_[0m [0;30;48;2;231;138;195mtoken[0m [0;30;48;2;166;216;84m##s[0m [0;30;48;2;255;217;47mF[0m [0;30;48;2;102;194;165m##als[0m [0;30;48;2;252;141;98m##e[0m [0;30;48;2;141;160;203mNone[0m [0;30;48;2;231;138;195mel[0m [0;30;48;2;166;216;84m##if[0m [0;30;48;2;255;217;47m=[0m [0;30;48;2;102;194;165m=[0m [0;30;48;2;252;141;98m>[0m [0;30;48;2;141;160;203m=[0m [0;30;48;2;231;138;195melse[0m [0;30;48;2;166;216;84m:[0m [0;30;48;2;255;217;47mtwo[0m [0;30;48;2;102;194;165mta[

In [None]:
# Same tokenizer, plain-text output (no ANSI color codes)
show_tokens_without_color(text, "bert-base-cased")

Vocab length: 28996
[CLS] English and CA ##PI ##TA ##L ##I ##Z ##AT ##ION [UNK] [UNK] show _ token ##s F ##als ##e None el ##if = = > = else : two ta ##bs : " " Three ta ##bs : " " 12 . 0 * 50 = 600 [SEP] 

**bert-base-uncased**

In [None]:
# Tokenize `text` with bert-base-uncased, for comparison with the cased tokenizer
show_tokens_color(text, "bert-base-uncased")

**GPT-4**

In [None]:
# Tokenize `text` with the tokenizer published under the name "Xenova/gpt-4"
show_tokens_color(text, "Xenova/gpt-4")

**GPT-2**

In [None]:
# Tokenize `text` with the tokenizer published under the name "gpt2"
show_tokens_color(text, "gpt2")

**Flan-T5-small**

In [None]:
# Tokenize `text` with the tokenizer published under the name "google/flan-t5-small"
show_tokens_color(text, "google/flan-t5-small")

**Starcoder 2-15b**

In [None]:
# Tokenize `text` with the tokenizer published under the name "bigcode/starcoder2-15b"
show_tokens_color(text, "bigcode/starcoder2-15b")

**Phi-3**

In [None]:
# Tokenize `text` with the tokenizer published under the name "microsoft/Phi-3-mini-4k-instruct"
show_tokens_color(text, "microsoft/Phi-3-mini-4k-instruct")

**Qwen2**

In [None]:
# Tokenize `text` with the tokenizer published under the name "Qwen/Qwen2-VL-7B-Instruct"
show_tokens_color(text, "Qwen/Qwen2-VL-7B-Instruct")