In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login
import os
from dotenv import load_dotenv

#analysis of the Hugging Face tokenizer configuration for the selected models

#authenticate against the Hugging Face Hub:
#key.env is loaded into the process environment first, then HF_TOKEN is read from it
#(if HF_TOKEN is not set, login() receives None)
load_dotenv(dotenv_path="key.env")
login(token=os.environ.get("HF_TOKEN"))

#model whose tokenizer is analysed by the cells below;
#swap in one of these ids to inspect another model:
#   "mistralai/Mistral-7B-v0.1"
#   "meta-llama/Llama-3.1-8B"
#   "google/gemma-2-9b"
#   "deepseek-ai/DeepSeek-R1"
#   "bigscience/bloom"
#   "microsoft/phi"
model_name = "mistralai/Mistral-7B-v0.1"

In [None]:
#fetch the tokenizer published with the model selected in `model_name`
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
#try to extract the fast tokenizer backend (if available);
#`backend` stays None when the tokenizer exposes no `backend_tokenizer`,
#and the cells below must check for that before using it
backend = getattr(tokenizer, "backend_tokenizer", None)
if backend is not None:
    #record only the component classes here; their full repr is printed in a later cell
    model_type = type(backend.model)
    pretokenizer_type = type(backend.pre_tokenizer)
    postprocessor_type = type(backend.post_processor)
else:
    model_type = pretokenizer_type = postprocessor_type = None

#general info
print(f"Model: {model_name}")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Tokenizer type: {type(tokenizer).__name__}")
#a tokenizer without a `do_lower_case` attribute is reported as case-sensitive
print(f"Case-sensitive: {not getattr(tokenizer, 'do_lower_case', False)}")

#backend tokenizer details (all None when no backend was found)
print("Tokenizer backend:")
print(f"Model: {model_type}")
print(f"Pretokenizer: {pretokenizer_type}")
print(f"Postprocessor: {postprocessor_type}")

#special tokens
print("special tokens:")
for name, token in tokenizer.special_tokens_map.items():
    print(f"  {name}: {token}")

In [None]:
#dig deeper into the pretokenizer, tokenizer and normalizer:
#prints the full repr of each stage of the backend pipeline.
#`backend` comes from the previous cell and is None when no fast backend exists,
#so report that instead of failing with an AttributeError on None.
if backend is None:
    print("No fast tokenizer backend available: normalizer, pretokenizer, postprocessor and model cannot be inspected.")
else:
    #Normalizer:
    normalizer = backend.normalizer
    print("Normalizer:")
    print(repr(normalizer), "\n")

    #Pretokenizer:
    pretokenizer = backend.pre_tokenizer
    print("PreTokenizer:")
    print(repr(pretokenizer), "\n")

    #Postprocessor:
    postprocessor = backend.post_processor
    print("PostProcessor:")
    print(repr(postprocessor), "\n")

    #Model:
    model = backend.model
    print("Model (BPE, WordPiece, etc):")
    print(repr(model), "\n")

In [None]:
#dump the vocabulary to disk for inspection:

#mapping {token: id} as reported by the tokenizer
vocab_dict = tokenizer.get_vocab()

#(token, id) pairs ordered by ascending id
vocab_list = list(vocab_dict.items())
vocab_list.sort(key=lambda entry: entry[1])

#write one "token<TAB>id" entry per line to vocab.txt
with open("vocab.txt", "w", encoding="utf-8") as f:
    for token, token_id in vocab_list:
        print(token, token_id, sep="\t", file=f)

print(f"Vocabulary extracted: {len(vocab_list)} tokens saved in vocab.txt")

In [None]:
#compare two tokenizers (to check if different versions of the model share the same tokenizer):
#every test sentence is tokenized once by each tokenizer; per-sentence results are
#printed and the overall verdict is accumulated in `all_equal` during the same pass.

#load the two tokenizers
tokenizer1 = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
tokenizer2 = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3")

#list of test sentences
test_sentences = [
    "Hello world!",
    "This is a test.",
    "Tokenization can be tricky...",
    "Let's compare two tokenizers.",
    "😊 Unicode and symbols #hashtag",
    "Don't stop believing.",
    "test@example.com.",
    "C'est la vie — that’s life.",
    "¿Cómo estás? ¡Muy bien!",
    "中文分词测试",
    "I have 2 dogs and 3 cats.",
    "Newlines\nshould\nalso\nbe\ntested.",
    "He said, 'Hello!' and left.",
    "Hyphenated-words can be tricky.",
    "1234567890 numbers test",
    "a" * 300,  # very long word
    "Mix of CAPS and lowercase.",
    "Emojis 🤔🔥🚀",
    "Some_math_symbols + − × ÷ = ≠",
    "URLs like https://huggingface.co",
    "file_name_with_underscores.py"
]

#compare tokenization results, remembering whether any sentence differed
#(avoids tokenizing the whole list a second time just to compute the verdict)
all_equal = True
for sentence in test_sentences:
    tokens1 = tokenizer1.tokenize(sentence)
    tokens2 = tokenizer2.tokenize(sentence)
    if tokens1 != tokens2:
        all_equal = False
        print(f"Difference found in: '{sentence}'")
        print(f"Tokenizer 1: {tokens1}")
        print(f"Tokenizer 2: {tokens2}\n")
    else:
        print(f"Same result for: '{sentence}'")

#overall verdict: True only if every sentence produced identical token lists
print("\nAre all results the same?:", all_equal)