In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-0.6B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)
print(model)


In [None]:
from transformers import AutoTokenizer
import re

# 1) Load the official Qwen-3.0-0.6B tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

# 2) Grab its vocabulary dict: { token_string → token_id }
vocab = tokenizer.get_vocab()


Qwen-3.0-0.6B vocab size: 151669
  → Hindi tokens:    0
  → Gujarati tokens: 0

Sample Hindi tokens: []
Sample Gujarati tokens: []


In [9]:
devanagari = re.compile(r"[\u0900-\u097F]")  # Devanagari (Hindi)
gujarati  = re.compile(r"[\u0A80-\u0AFF]")  # Gujarati

# 3. Filter tokens
hindi_tokens    = [(tok, vocab[tok]) for tok in vocab if devanagari.search(tok)]
gujarati_tokens = [(tok, vocab[tok]) for tok in vocab if gujarati.search(tok)]

# 4. Print results
print(f"Hindi tokens: {len(hindi_tokens)}\n")
for tok, idx in hindi_tokens:
    # drop the GPT2 whitespace marker (Ġ, U+0120) for readability
    display = tok.lstrip("Ġ")
    print(f"{display:12} → {idx}")

print("\n" + "="*40 + "\n")

print(f"Gujarati tokens: {len(gujarati_tokens)}\n")
for tok, idx in gujarati_tokens:
    display = tok.lstrip("Ġ")
    print(f"{display:12} → {idx}")




Hindi tokens: 0



Gujarati tokens: 0



In [13]:
import json
from pathlib import Path

vocab_path = Path("vocab.json")
with open(vocab_path, "r", encoding="utf-8") as f:
    vocab = json.load(f)

for k, v in vocab.items():
    print(k)
    

# print number of tokens that contain "ગુજરાતી"

Ġdining
-input
Ġcertifications
_positions
æµ°
membership
Ġbehavioral
.Protocol
Ġfila
Ġpurge
Õ³
_components
åı¯è¾¾
_pro
ĠBrexit
]>=
Ð·Ð½
KR
Ã¤nder
èĻ¢
.SystemColors
ðŁĵĨ
ĠReduce
Ġcarrying
loops
Ï¬
push
æĥ¦
hints
clubs
Ġimagining
.ModelForm
cpu
ĠStage
å¤§éĹ¨
ĠNetworks
á¾Ĺ
ĠDetect
ÐµÑģÑĤÐ¸
ĠHEAD
Ġperman
ĠÐ±Ð»
Ð´ÐµÑģÑı
Daily
_suspend
æĸĩæľ¬
à¸ķà¸²à¸¡
ç±ģ
åħ¼èģĮ
ï¿½ï¿½
Ġcatering
ë´ĩ
.ci
.Models
ĠControls
_att
ching
Ġngá»ĵi
åıĭè°Ĭ
ĠCarla
UIKit
Machine
,password
_chunks
ortho
?key
è´¸æĺĵæĪĺ
Ì
mph
%/
really
èµŀåĬ©
Ġ\$
Ġcuent
ìĬ¤íĥĢ
ĠFlexible
æĺĤ
ĠiÃ§er
.sc
SAM
Ġcrear
=add
å¯¹æĪĳä»¬
ÑİÑīÐ°Ñı
è¿Ļä»¶
.proc
ains
unist
Ġsendo
ĉattr
bourne
/storage
ecz
èŀł
ðĿļĲ
äº§éĶĢ
rest
ĠCRS
ĠGenius
Ġraise
ĠsÃ´ng
å±łæĿĢ
ĠPanc
Ġslightest
Ġbiblical
Ġthrilling
ĠForbes
Ġcanceled
éĴ¼
Ġliability
ĠÃ¼rÃ¼n
Toast
Ġarbit
Ġ×¢×ľ×Ļ×ķ
Ġë²Ī
ÙħØ´Ø§
ä¸Ģä¸ªå¤ļ
Liverpool
åķĬ
å¤ĦçĲĨåĻ¨
çĽ¸å¤Ħ
ĠBrushes
æľĢåĲİ
Ġmacros
ĠĠĠĠĉĉĉ
tod
ĠHIT
/socket
ĠQMessageBox
Ġevolution
SOC
ribbon
,ĊĊĊĊ
éĢ¡
ìĿ·
ĩ
à¸Ħà¸£à¸µ
ĠKapoor
Season
checkbox
ðĿĲļ
ãĤīã

In [2]:
from transformers import AutoTokenizer

def analyze_tokenizer_language_support(tokenizer_name: str, languages: dict):
    """
    Analyzes how well a Hugging Face tokenizer supports specific languages.
    
    This works by tokenizing sample sentences and inspecting the resulting tokens.
    For languages tokenized into many individual bytes, support is poor.
    For languages tokenized into meaningful subwords, support is good.
    """
    print(f"--- Analyzing tokenizer: {tokenizer_name} ---\n")
    try:
        # Qwen's tokenizer requires trusting remote code to load correctly
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
    except Exception as e:
        print(f"Failed to load tokenizer. Error: {e}")
        return

    for lang, sentence in languages.items():
        print(f"--- Language: {lang} ---")
        print(f"Sample sentence: '{sentence}'")

        # 1. Tokenize the sentence to get input IDs
        input_ids = tokenizer.encode(sentence)
        
        # 2. Convert the IDs back to human-readable token strings
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        # 3. Perform analysis
        num_chars = len(sentence)
        num_tokens = len(tokens)
        
        # A high ratio indicates byte-level fallback, which is inefficient.
        # e.g., a 3-byte UTF-8 char often becomes 3 tokens -> ratio of 3.0
        # A ratio near 1.0 means good, subword tokenization.
        compression_ratio = num_tokens / (num_chars + 1e-6) # Add epsilon for safety

        print(f"Generated tokens: {tokens}")
        print(f"Number of characters: {num_chars}")
        print(f"Number of tokens:     {num_tokens}")
        print(f"Compression ratio (tokens/char): {compression_ratio:.2f}")

        if compression_ratio > 1.5:
             print("Result: POOR support. The tokenizer is falling back to individual bytes.")
        else:
             print("Result: GOOD support. The tokenizer uses meaningful subwords.")
        print("-" * (len(lang) + 16) + "\n")


if __name__ == '__main__':
    # Define the tokenizer and the languages/sentences to test
    QWEN_TOKENIZER = "sarvamai/sarvam-m"
    
    LANGUAGES_TO_TEST = {
        "English": "The quick brown fox jumps over the lazy dog.",
        "Hindi": "भारत एक विशाल और विविधतापूर्ण देश है।", # "India is a vast and diverse country."
        "Gujarati": "ગુજરાત ભારતના પશ્ચિમ કિનારે આવેલું રાજ્ય છે.", # "Gujarat is a state on the west coast of India."
    }

    analyze_tokenizer_language_support(QWEN_TOKENIZER, LANGUAGES_TO_TEST)


--- Analyzing tokenizer: sarvamai/sarvam-m ---



tokenizer_config.json:   0%|          | 0.00/201k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/21.3k [00:00<?, ?B/s]

--- Language: English ---
Sample sentence: 'The quick brown fox jumps over the lazy dog.'
Generated tokens: ['<s>', 'The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Number of characters: 44
Number of tokens:     11
Compression ratio (tokens/char): 0.25
Result: GOOD support. The tokenizer uses meaningful subwords.
-----------------------

--- Language: Hindi ---
Sample sentence: 'भारत एक विशाल और विविधतापूर्ण देश है।'
Generated tokens: ['<s>', 'à¤Ń', 'à¤¾à¤°à¤¤', 'Ġà¤ıà¤ķ', 'Ġà¤µà¤¿à¤¶', 'à¤¾à¤²', 'Ġà¤Ķà¤°', 'Ġà¤µà¤¿à¤µ', 'à¤¿à¤§', 'à¤¤', 'à¤¾à¤ª', 'à¥Ĥ', 'à¤°à¥įà¤£', 'Ġà¤¦à¥ĩà¤¶', 'Ġà¤¹à¥Ī', 'à¥¤']
Number of characters: 37
Number of tokens:     16
Compression ratio (tokens/char): 0.43
Result: GOOD support. The tokenizer uses meaningful subwords.
---------------------

--- Language: Gujarati ---
Sample sentence: 'ગુજરાત ભારતના પશ્ચિમ કિનારે આવેલું રાજ્ય છે.'
Generated tokens: ['<s>', 'àªĹ', 'à«ģàª', 'ľ', 'àª°', 'àª¾àª¤', 'ĠàªŃ', 'àª¾àª°', 'àª¤', 'àª¨àª

In [None]:
# sarvamai/sarvam-1