# Tokenizers



You can run this notebook on a free CPU, or locally on your box if you prefer.


In [4]:
from transformers import AutoTokenizer

# Sign in to Hugging Face


In [26]:
import os
from huggingface_hub import login

# Read the access token from the environment; the value is None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")

# Authenticate with the Hugging Face Hub and also hand the token to the
# git credential store.
login(hf_token, add_to_git_credential=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Accessing Llama 3.1 from Meta

In order to use the fantastic Llama 3.1, Meta does require you to sign their terms of service.

Visit their model instructions page in Hugging Face:
https://huggingface.co/meta-llama/Meta-Llama-3.1-8B

At the top of the page are instructions on how to agree to their terms. If possible, use the same email address as your Hugging Face account.

In my experience approval comes in a couple of minutes. Once you've been approved for any 3.1 model, it applies to the whole family of models.

If the next cell gives you an error, then please check:  
1. Are you logged in to Hugging Face? Try running `login()` to check that your key works.
2. Did you set up your API key with full read and write permissions?
3. If you visit the Llama 3.1 page with the link above, does it show near the top that you have access to the model?


In [28]:
# Hub repository id (a plain string, not a loaded model) of the non-Instruct
# Llama 3.1 8B checkpoint; the next cell loads its tokenizer.
model = 'meta-llama/Meta-Llama-3.1-8B'

In [29]:
# Load the tokenizer that belongs to `model`. The repo is gated, so this needs
# the login and the access approval described above.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)

In [None]:
# Sample sentence that the following cells encode and decode.
text = "I am excited to show Tokenizers in action to my LLM engineers"

In [11]:
# Calling the tokenizer directly returns both `input_ids` and an
# `attention_mask` (one mask entry per token).
tokenizer(text)

{'input_ids': [128000, 40, 1097, 12304, 311, 1501, 9857, 12509, 304, 1957, 311, 856, 445, 11237, 25175], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:

# encode() returns just the list of token ids. The leading 128000 is the
# <|begin_of_text|> special token that the tokenizer prepends.
tokens = tokenizer.encode(text)
tokens

[128000,
 40,
 1097,
 12304,
 311,
 1501,
 9857,
 12509,
 304,
 1957,
 311,
 856,
 445,
 11237,
 25175]

In [7]:
# Token count for the sentence, including the prepended begin-of-text token.
len(tokens)

15

In [8]:
# decode() joins the whole id list back into one string, special tokens included.
tokenizer.decode(tokens)

'<|begin_of_text|>I am excited to show Tokenizers in action to my LLM engineers'

In [9]:
# batch_decode() works on a batch of sequences. Given a flat list of ids it
# decodes every id on its own, which shows how the text was split into tokens.
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'I',
 ' am',
 ' excited',
 ' to',
 ' show',
 ' Token',
 'izers',
 ' in',
 ' action',
 ' to',
 ' my',
 ' L',
 'LM',
 ' engineers']

In [10]:
# `tokenizer.vocab` is the way to inspect the whole vocabulary; get_added_vocab()
# lists only the tokens added on top of it (here the <|...|> special tokens,
# ids 128000 and up).
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

In [19]:
# Two sentences of different length.
data = [
  "I like cats.",
  "Do you like cats too?",
]

# Reuse the end-of-text token as the pad token so that padding=True has a
# token to pad with.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): encode() does not batch. As the output shows, the two sentences
# come back as ONE flat id list (two <|begin_of_text|> markers, no padding, no
# per-sentence rows). For one padded row per sentence, call
# tokenizer(data, padding=True, truncation=True) instead, as the batch section
# later in this notebook does. The code is left as-is because the next cells
# decode this flat list.
model_inputs = tokenizer.encode(data, padding=True, truncation=True)

model_inputs

[128000, 40, 1093, 19987, 13, 128000, 5519, 499, 1093, 19987, 2288, 30]

In [20]:
# `model_inputs` is a single flat id list, so decode() yields one string that
# contains both sentences.
tokenizer.decode(model_inputs)

'<|begin_of_text|>I like cats.<|begin_of_text|>Do you like cats too?'

In [23]:
# Decode every id separately to see the individual tokens.
tokenizer.batch_decode(model_inputs)

['<|begin_of_text|>',
 'I',
 ' like',
 ' cats',
 '.',
 '<|begin_of_text|>',
 'Do',
 ' you',
 ' like',
 ' cats',
 ' too',
 '?']

In [22]:
# Same per-token view with special tokens skipped; each <|begin_of_text|>
# therefore shows up as an empty string.
tokenizer.batch_decode(model_inputs, skip_special_tokens=True)

['',
 'I',
 ' like',
 ' cats',
 '.',
 '',
 'Do',
 ' you',
 ' like',
 ' cats',
 ' too',
 '?']

# Creating and comparing three different tokenizers from Hugging Face

In [30]:
# ============================
#  2) Imports
# ============================
from transformers import AutoTokenizer
import torch

# Hub ids of the three tokenizers compared in the cells below. Swap in any
# other repositories (custom or private ones included) as needed.
model_names = [
    "meta-llama/Meta-Llama-3.1-8B",
    "Qwen/Qwen2-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
]



In [31]:
# ============================
#  3) Create Tokenizers
# ============================
# Maps each Hub id in `model_names` to its loaded tokenizer so that later
# cells can look tokenizers up by name.
tokenizers = dict()
for name in model_names:
    print("Loading tokenizer for: {}".format(name))
    # trust_remote_code=True tells the library to trust custom code found in
    # the model repo; it is only needed when a repo ships its own tokenizer or
    # architecture code.
    tokenizers[name] = AutoTokenizer.from_pretrained(
        name,
        trust_remote_code=True,
    )


Loading tokenizer for: meta-llama/Meta-Llama-3.1-8B
Loading tokenizer for: Qwen/Qwen2-7B-Instruct


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Loading tokenizer for: deepseek-ai/DeepSeek-R1-Distill-Llama-8B


tokenizer_config.json:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

In [41]:
# ============================
#  4) Single Encode/Decode (Display up to 15 added-vocab entries)
# ============================
test_sentence = "Hello world! This is a test sentence."

# Upper bound on how many added-vocabulary entries are printed per tokenizer.
# Kept in one place so the slice and the printed message cannot drift apart.
MAX_ADDED_VOCAB_SHOWN = 15

# Token ids of `test_sentence`, keyed by model name.
encoded_data = {}
for name in model_names:
    tk = tokenizers[name]

    print(f"--- {name} ---")

    # Tokens that were added on top of the tokenizer's base vocabulary
    added_vocab = tk.get_added_vocab()

    if added_vocab:
        # A dict cannot be sliced, so turn its items into a list first
        added_vocab_list = list(added_vocab.items())[:MAX_ADDED_VOCAB_SHOWN]
        print(f"Added vocabulary (showing up to {MAX_ADDED_VOCAB_SHOWN} entries): {added_vocab_list}")
    else:
        print("No added vocabulary entries.")

    # Encode the test sentence and keep the ids for later comparison
    token_ids = tk.encode(test_sentence)
    encoded_data[name] = token_ids
    print(f"Token IDs: {token_ids}")

    # Decode back to text (special tokens are kept in the output)
    decoded_text = tk.decode(token_ids)
    print(f"Decoded Text: {decoded_text}")
    print()  # Blank line for readability


--- meta-llama/Meta-Llama-3.1-8B ---
Added vocabulary (showing up to 15 entries): [('<|begin_of_text|>', 128000), ('<|end_of_text|>', 128001), ('<|reserved_special_token_0|>', 128002), ('<|reserved_special_token_1|>', 128003), ('<|finetune_right_pad_id|>', 128004), ('<|reserved_special_token_2|>', 128005), ('<|start_header_id|>', 128006), ('<|end_header_id|>', 128007), ('<|eom_id|>', 128008), ('<|eot_id|>', 128009), ('<|python_tag|>', 128010), ('<|reserved_special_token_3|>', 128011), ('<|reserved_special_token_4|>', 128012), ('<|reserved_special_token_5|>', 128013), ('<|reserved_special_token_6|>', 128014)]
Token IDs: [128000, 9906, 1917, 0, 1115, 374, 264, 1296, 11914, 13]
Decoded Text: <|begin_of_text|>Hello world! This is a test sentence.

--- Qwen/Qwen2-7B-Instruct ---
Added vocabulary (showing up to 15 entries): [('<|endoftext|>', 151643), ('<|im_start|>', 151644), ('<|im_end|>', 151645)]
Token IDs: [9707, 1879, 0, 1096, 374, 264, 1273, 11652, 13]
Decoded Text: Hello world! This 

# Batch Encoding & Decoding

In [40]:
# ============================
#  5) Batch Encoding & Decoding
# ============================
# Sentences of different lengths (and scripts) that are processed as one batch.
sentences = [
    "Hello world!",
    "นกน้อยบินกลับไปยังรัง",
    "Testing batch tokenization.",
    "Short sentence."
]

# Padded input-id tensors, keyed by model name.
batch_encoded_data = {}
for name in model_names:
    tk = tokenizers[name]

    # padding=True needs a pad token; fall back to EOS when the tokenizer has none
    if tk.pad_token is None:
        tk.pad_token = tk.eos_token  # or tk.add_special_tokens({'pad_token': '[PAD]'})

    # Calling the tokenizer on a list encodes the whole batch at once.
    # padding=True pads every row to the longest sentence in the batch and
    # return_tensors="pt" returns PyTorch tensors.
    batch_results = tk(
        sentences,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    # Store the input IDs for later decoding
    batch_encoded_data[name] = batch_results["input_ids"]

    print(f"--- {name} Batch Encoding ---")
    print(f"Input IDs shape: {batch_results['input_ids'].shape}")
    print(f"First Sentence Input IDs: {batch_results['input_ids'][0].tolist()}")
    print()

# Now we batch decode the same IDs. The pad token was already set in the loop
# above and is not needed for decoding, so it is not checked again here.
for name in model_names:
    tk = tokenizers[name]

    # batch_decode() takes a list (or tensor) of sequences of token IDs
    batch_decoded = tk.batch_decode(batch_encoded_data[name])

    print(f"--- {name} Batch Decoded ---")
    # Deliberately not named `text`: that would overwrite the notebook-level
    # `text` sentence defined near the top of the notebook.
    for sentence_number, decoded_sentence in enumerate(batch_decoded, start=1):
        print(f"Sentence {sentence_number}: {decoded_sentence}")
    print()


--- meta-llama/Meta-Llama-3.1-8B Batch Encoding ---
Input IDs shape: torch.Size([4, 12])
First Sentence Input IDs: [128000, 9906, 1917, 0, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001]

--- Qwen/Qwen2-7B-Instruct Batch Encoding ---
Input IDs shape: torch.Size([4, 13])
First Sentence Input IDs: [9707, 1879, 0, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643]

--- deepseek-ai/DeepSeek-R1-Distill-Llama-8B Batch Encoding ---
Input IDs shape: torch.Size([4, 12])
First Sentence Input IDs: [128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128000, 9906, 1917, 0]

--- meta-llama/Meta-Llama-3.1-8B Batch Decoded ---
Sentence 1: <|begin_of_text|>Hello world!<|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|>
Sentence 2: <|begin_of_text|>นกน้อยบินกลับไปยังรัง
Sentence 3: <|begin_of_text|>Testing batch tokenization.<|end_of_text|><|end_of_text|><|end_of_text|><|end_of_

# Instruct variants of models

Many models have a variant that has been trained for use in chats.  
These are typically labelled with the word "Instruct" at the end.  
They have been trained to expect prompts with a particular format that includes system, user and assistant prompts.  

There is a utility method `apply_chat_template` that will convert from the messages list format we are familiar with, into the right input prompt for this model.

In [None]:
# Rebind `tokenizer` to the Instruct variant's tokenizer (earlier cells used
# the non-Instruct checkpoint); the chat template applied in the next cell
# comes from this tokenizer.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct', trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:

# One system turn and one user turn in the role/content message format.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"},
]

# tokenize=False returns the formatted prompt as text rather than token ids;
# add_generation_prompt=True ends the prompt with the assistant header so the
# model's reply comes next.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for a room of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>




### Example: applying the chat template across multiple models

In [51]:
from transformers import AutoTokenizer

# Models whose chat templates are compared below.
model_names = [
    "meta-llama/Meta-Llama-3.1-8B",
    "Qwen/Qwen2-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
]

# One system turn and one user turn in the role/content message format.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"},
]

# NOTE: this loop rebinds the notebook-level `tokenizer` and `prompt` on every
# iteration; after the cell they belong to the last model in `model_names`.
for model_name in model_names:
    print(f"\n=== Testing {model_name} ===")
    try:
        # Step 1: load this model's tokenizer (trust_remote_code=True lets
        # repos that ship custom tokenizer code load as well)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
        )

        # Step 2: render the messages with the tokenizer's own chat template
        prompt = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )

        # Step 3: show the rendered prompt
        print("Prompt:\n", prompt)
    except Exception as err:
        # A tokenizer without a chat template raises here; report the problem
        # and carry on with the next model.
        print(f"Error for model '{model_name}': {err}")



=== Testing meta-llama/Meta-Llama-3.1-8B ===
Error for model 'meta-llama/Meta-Llama-3.1-8B': Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

=== Testing Qwen/Qwen2-7B-Instruct ===
Prompt:
 <|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-hearted joke for a room of Data Scientists<|im_end|>
<|im_start|>assistant


=== Testing deepseek-ai/DeepSeek-R1-Distill-Llama-8B ===
Prompt:
 <｜begin▁of▁sentence｜>You are a helpful assistant<｜User｜>Tell a light-hearted joke for a room of Data Scientists<｜Assistant｜>


## Encode and decode the chat prompt

In [53]:
from transformers import AutoTokenizer

# Models whose templated prompts are encoded and decoded below.
model_names = [
    "meta-llama/Meta-Llama-3.1-8B",
    "Qwen/Qwen2-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
]

# One system turn and one user turn in the role/content message format.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
]

for model_name in model_names:
    print(f"\n=== Testing {model_name} ===")
    try:
        # 1) Load the tokenizer (trust_remote_code=True lets repos that ship
        #    custom tokenizer code load as well)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # 2) Apply the chat template (creates a single string prompt)
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        print("Prompt:")
        print(prompt)

        # 3) Encode the prompt. The chat template has already written the
        #    special tokens (e.g. the begin-of-text marker) into the text, so
        #    add_special_tokens=False stops the tokenizer from adding them a
        #    second time.
        encoded = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
        print("Encoded (input_ids):")
        print(encoded["input_ids"])

        # 4) Decode back to text, keeping special tokens so the round trip
        #    can be compared with the prompt
        decoded = tokenizer.batch_decode(encoded["input_ids"], skip_special_tokens=False)
        print()
        print("Decoded Text:")
        print(decoded[0])

    except Exception as e:
        # A tokenizer without a chat template raises here; report the problem
        # and carry on with the next model.
        print(f"Error for model '{model_name}': {e}")



=== Testing meta-llama/Meta-Llama-3.1-8B ===
Error for model 'meta-llama/Meta-Llama-3.1-8B': Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

=== Testing Qwen/Qwen2-7B-Instruct ===
Prompt:
<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-hearted joke for a room of Data Scientists<|im_end|>
<|im_start|>assistant

Encoded (input_ids):
tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847, 151645,
            198, 151644,    872,    198,  40451,    264,   3100,  69295,  21646,
            369,    264,   3054,    315,   2885,  56016, 151645,    198, 151644,
          77091,    198]])

Decoded Text:
<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light