In [6]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Pull the variables defined in the .env file into the environment
load_dotenv()

# Look up the Hugging Face access token under the HF_TOKEN key
hf_token = os.environ.get("HF_TOKEN")

# Authenticate against the Hugging Face Hub with that token
login(token=hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/sagdesai/.cache/huggingface/token
Login successful


In [7]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer vocabulary is inspected in the cells below
model_id = "google/gemma-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
# Token string -> token ID mapping for the whole tokenizer
vocab = tokenizer.get_vocab()

# The number of entries in that mapping is the vocabulary size
total_vocab_size = len(vocab)
print(f"Total number of tokens: {total_vocab_size}")


Total number of tokens: 256000


#### Print the first 500 and last 500 tokens from the tokenizer

In [9]:
# Materialise the (token, token ID) pairs and order them by ascending ID
sorted_vocab = list(vocab.items())
sorted_vocab.sort(key=lambda pair: pair[1])

# Show the 500 entries with the lowest token IDs
print("First 500 tokens and their token IDs:")
for token, token_id in sorted_vocab[:500]:
    print(f"Token: {token}, Token ID: {token_id}")

First 500 tokens and their token IDs:
Token: <pad>, Token ID: 0
Token: <eos>, Token ID: 1
Token: <bos>, Token ID: 2
Token: <unk>, Token ID: 3
Token: <mask>, Token ID: 4
Token: <2mass>, Token ID: 5
Token: [@BOS@], Token ID: 6
Token: <unused0>, Token ID: 7
Token: <unused1>, Token ID: 8
Token: <unused2>, Token ID: 9
Token: <unused3>, Token ID: 10
Token: <unused4>, Token ID: 11
Token: <unused5>, Token ID: 12
Token: <unused6>, Token ID: 13
Token: <unused7>, Token ID: 14
Token: <unused8>, Token ID: 15
Token: <unused9>, Token ID: 16
Token: <unused10>, Token ID: 17
Token: <unused11>, Token ID: 18
Token: <unused12>, Token ID: 19
Token: <unused13>, Token ID: 20
Token: <unused14>, Token ID: 21
Token: <unused15>, Token ID: 22
Token: <unused16>, Token ID: 23
Token: <unused17>, Token ID: 24
Token: <unused18>, Token ID: 25
Token: <unused19>, Token ID: 26
Token: <unused20>, Token ID: 27
Token: <unused21>, Token ID: 28
Token: <unused22>, Token ID: 29
Token: <unused23>, Token ID: 30
Token: <unused24>, T

In [10]:
# Show the 500 entries with the highest token IDs (still in ascending order)
print("Last 500 tokens and their token IDs:")
for token, token_id in sorted_vocab[-500:]:
    print(f"Token: {token}, Token ID: {token_id}")

Last 500 tokens and their token IDs:
Token: Ìëè, Token ID: 255500
Token: Ìëò, Token ID: 255501
Token: ÓÇç, Token ID: 255502
Token: ÓÇø, Token ID: 255503
Token: ÓÉä, Token ID: 255504
Token: ÓÉ∏, Token ID: 255505
Token: ÓÜí, Token ID: 255506
Token: Óãû, Token ID: 255507
Token: Óã£, Token ID: 255508
Token: Óåà, Token ID: 255509
Token: Óîû, Token ID: 255510
Token: Óòé, Token ID: 255511
Token: Ó°Å, Token ID: 255512
Token: ÔÖ°, Token ID: 255513
Token: Ô¶æ, Token ID: 255514
Token: Ô∫Ä, Token ID: 255515
Token: ÔªÄ, Token ID: 255516
Token: ëÑù, Token ID: 255517
Token: ìÑπ, Token ID: 255518
Token: ìÜè, Token ID: 255519
Token: ñ°º, Token ID: 255520
Token: ùî®, Token ID: 255521
Token: ùï´, Token ID: 255522
Token: ùñ¢, Token ID: 255523
Token: ùñø, Token ID: 255524
Token: ùúì, Token ID: 255525
Token: üÖ§, Token ID: 255526
Token: üÖ∫, Token ID: 255527
Token: ü¶£, Token ID: 255528
Token: üßÜ, Token ID: 255529
Token: »é, Token ID: 255530
Token: ÕΩ, Token ID: 255531
Token: œ´, Token ID: 255

## Telugu Tokenizer Exploration

https://en.wikipedia.org/wiki/Telugu_(Unicode_block)

In [11]:
import unicodedata

# Bounds of the Telugu Unicode block (see the Wikipedia page linked above)
telugu_start = 0x0C00
telugu_end = 0x0C7F

# Keep every (token, token ID) pair in which at least one character either
# falls inside the Telugu block or has "TELUGU" in its Unicode name.
# Characters without a Unicode name are given an empty name, so they never
# match on the name test.
telugu_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        telugu_start <= ord(ch) <= telugu_end
        or "TELUGU" in unicodedata.name(ch, "")
        for ch in token
    )
]

total_telugu_tokens = len(telugu_tokens)

# List the matching tokens first, then the summary figures
print("Telugu tokens and their token IDs:")
for token, token_id in telugu_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Telugu characters: {total_telugu_tokens}")

# Share of the full vocabulary taken up by the matching tokens
telugu_percentage = (total_telugu_tokens / total_vocab_size) * 100
print(f"Percentage of Telugu tokens in the vocabulary: {telugu_percentage:.2f}%")

Telugu tokens and their token IDs:
Token: ‡±ç‡∞∞, Token ID: 34970
Token: ‡∞ó, Token ID: 237467
Token: ‚ñÅ‡∞¶, Token ID: 89331
Token: ‡∞®‡±Å, Token ID: 56508
Token: ‚ñÅ‡∞¨, Token ID: 85990
Token: ‡∞Ç‡∞ï, Token ID: 191046
Token: ‡∞§, Token ID: 236930
Token: ‚ñÅ‡∞µ‡∞ø, Token ID: 91918
Token: ‡∞ô, Token ID: 254013
Token: ‚ñÅ‡∞§‡±Ü, Token ID: 232117
Token: ‡∞†, Token ID: 244359
Token: ‡∞µ‡∞ø, Token ID: 97356
Token: ‡∞æ‡∞Ø, Token ID: 90966
Token: ‡∞ï‡∞∞‡±ç‡∞Ø, Token ID: 197972
Token: ‚ñÅ‡∞∏‡±å‡∞ï‡∞∞‡±ç‡∞Ø, Token ID: 198656
Token: ‡∞µ, Token ID: 236936
Token: ‡±Å, Token ID: 236336
Token: ‡∞®‡∞ø, Token ID: 37546
Token: ‡∞∏‡±Å, Token ID: 149344
Token: ‡∞è, Token ID: 241913
Token: ‡∞°‡±Å, Token ID: 83296
Token: ‡∞∞, Token ID: 236442
Token: ‚ñÅ‡∞∂, Token ID: 126865
Token: ‡∞¶‡±Å, Token ID: 140322
Token: ‡±´, Token ID: 249546
Token: ‡∞à, Token ID: 241557
Token: ‡∞ü, Token ID: 237499
Token: ‡∞™‡±ç‡∞™, Token ID: 221332
Token: ‡±®, Token ID: 249050
Token: ‡∞Ç‡∞ó‡∞æ, Token ID: 151061
Token: ‡±Ç, Token

In [12]:
# Sanity check: look up the Unicode name of a plain Latin letter
unicodedata.name("A")

'LATIN CAPITAL LETTER A'

In [13]:
# U+0C07. The literal must be exactly one code point: unicodedata.name()
# rejects longer strings, which is what the mis-decoded literal was.
unicodedata.name('ఇ')

'TELUGU LETTER I'

In [14]:
# Walk a Telugu word code point by code point and print each Unicode name.
# The literal is restored from a mis-decoded (mojibake) form so that it
# consists of the six Telugu code points U+0C38 U+0C4D U+0C24 U+0C4D U+0C30 U+0C40.
test_token = "స్త్రీ"  # Woman - Sthree
for char in test_token:
    print(char)
    print(unicodedata.name(char))

‡∞∏
TELUGU LETTER SA
‡±ç
TELUGU SIGN VIRAMA
‡∞§
TELUGU LETTER TA
‡±ç
TELUGU SIGN VIRAMA
‡∞∞
TELUGU LETTER RA
‡±Ä
TELUGU VOWEL SIGN II


## Hindi (Devanagari) Tokenizer Exploration

https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)

In [15]:
import unicodedata

# Bounds of the Devanagari Unicode block
devanagari_start = 0x0900
devanagari_end = 0x097F

# Keep every (token, token ID) pair in which at least one character either
# falls inside the Devanagari block or has "DEVANAGARI" in its Unicode name.
# Characters without a Unicode name are given an empty name, so they never
# match on the name test.
devanagari_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        devanagari_start <= ord(ch) <= devanagari_end
        or "DEVANAGARI" in unicodedata.name(ch, "")
        for ch in token
    )
]

total_devanagari_tokens = len(devanagari_tokens)

# List the matching tokens first, then the summary figures
print("Hindi (Devanagari) tokens and their token IDs:")
for token, token_id in devanagari_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Hindi (Devanagari) characters: {total_devanagari_tokens}")

# Share of the full vocabulary taken up by the matching tokens
devanagari_percentage = (total_devanagari_tokens / total_vocab_size) * 100
print(f"Percentage of Devanagari tokens in the vocabulary: {devanagari_percentage:.2f}%")

Hindi (Devanagari) tokens and their token IDs:
Token: ‡§¶‡•ç‡§ß, Token ID: 54736
Token: ‡§ï‡•É, Token ID: 78890
Token: ‡§ï‡§∞‡•ç, Token ID: 185301
Token: ‚ñÅ‡§´‡§≤, Token ID: 193553
Token: ‡§ö‡•ç, Token ID: 43856
Token: ‡•ã‡§°‡§º, Token ID: 153934
Token: ‚ñÅ‡§ó‡§ø‡§∞, Token ID: 228929
Token: ‚ñÅ‡§ï‡•ç‡§∞, Token ID: 82046
Token: ‡§ï‡§°, Token ID: 168322
Token: ‡•ç‡§π, Token ID: 202298
Token: ‡§Ç‡§°, Token ID: 42330
Token: ‚ñÅ‡§™‡§π‡•Å‡§Ç‡§ö, Token ID: 165395
Token: ‚ñÅ‡§Ö‡§™‡§®‡•Ä, Token ID: 86875
Token: ‚ñÅ‡§ï‡•â, Token ID: 152273
Token: ‡§∞‡•ç‡§ö, Token ID: 198274
Token: ‡§¨‡•Ä, Token ID: 91803
Token: ‡•í, Token ID: 246969
Token: ‡•á‡§§‡•ç‡§∞, Token ID: 85977
Token: ‚ñÅ‡§™‡•ç‡§∞‡§ï‡§æ‡§∞, Token ID: 107850
Token: ‚ñÅ‡§ï‡§∞, Token ID: 10494
Token: ‡§∑‡•ç‡§ü, Token ID: 52115
Token: ‡§ø‡§≤‡•ç‡§Æ, Token ID: 150305
Token: ‚ñÅ‡§¨‡•à, Token ID: 100581
Token: ‡§¢‡§º, Token ID: 48240
Token: ‡•≤, Token ID: 250047
Token: ‡§æ‡§®‡•Ä, Token ID: 59607
Token: ‚ñÅ‡§∞‡§æ‡§∑‡•ç‡§ü, Token ID: 165826
Token

In [None]:
# U+0906. The literal must be exactly one code point: unicodedata.name()
# rejects longer strings, which is what the mis-decoded literal was.
unicodedata.name('आ')

'DEVANAGARI LETTER AA'

In [None]:
# Walk a Devanagari word code point by code point and print each Unicode name.
# The literal is restored from a mis-decoded (mojibake) form so that it
# consists of the five code points U+0930 U+093E U+091C U+094D U+092F.
test_token = "राज्य"  # Raajy - State
for char in test_token:
    print(char)
    print(unicodedata.name(char))

‡§∞
DEVANAGARI LETTER RA
‡§æ
DEVANAGARI VOWEL SIGN AA
‡§ú
DEVANAGARI LETTER JA
‡•ç
DEVANAGARI SIGN VIRAMA
‡§Ø
DEVANAGARI LETTER YA


## Tamil Tokenizer Exploration

https://en.wikipedia.org/wiki/Tamil_(Unicode_block)

In [None]:
import unicodedata

# Bounds of the Tamil Unicode block
tamil_start = 0x0B80
tamil_end = 0x0BFF

# Keep every (token, token ID) pair in which at least one character either
# falls inside the Tamil block or has "TAMIL" in its Unicode name.
# Characters without a Unicode name are given an empty name, so they never
# match on the name test.
tamil_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        tamil_start <= ord(ch) <= tamil_end
        or "TAMIL" in unicodedata.name(ch, "")
        for ch in token
    )
]

total_tamil_tokens = len(tamil_tokens)

# List the matching tokens first, then the summary figures
print("Tamil tokens and their token IDs:")
for token, token_id in tamil_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Tamil characters: {total_tamil_tokens}")

# Share of the full vocabulary taken up by the matching tokens.
# The label previously read "Devanagari" (copy-paste slip); it reports Tamil.
tamil_percentage = (total_tamil_tokens / total_vocab_size) * 100
print(f"Percentage of Tamil tokens in the vocabulary: {tamil_percentage:.2f}%")

Tamil tokens and their token IDs:
Token: ‡Øá‡Æ©‡Øç, Token ID: 218728
Token: ‡Æµ‡Øç, Token ID: 209767
Token: ‚ñÅ‡Æ∞, Token ID: 150297
Token: ‚ñÅ‡Æ™‡Øá, Token ID: 182188
Token: ‡Ø™, Token ID: 250066
Token: ‡ÆÆ‡Øà‡ÆØ, Token ID: 223887
Token: ‚ñÅ‡Æâ, Token ID: 48645
Token: ‚ñÅ‡Æï‡Øá, Token ID: 224494
Token: ‡Æø‡Æï‡Æ≥‡Øç, Token ID: 178507
Token: ‡Æü‡Øç‡Æü‡ØÅ, Token ID: 88862
Token: ‡Æø‡Æï‡Øç‡Æï, Token ID: 231128
Token: ‡ÆÆ‡Ææ, Token ID: 130871
Token: ‡Æµ‡ØÅ‡ÆÆ‡Øç, Token ID: 155722
Token: ‡Æ™‡ØÜ, Token ID: 218001
Token: ‡Æ£‡Øà, Token ID: 185623
Token: ‚ñÅ‡Æ™‡ØÅ, Token ID: 131746
Token: ‚ñÅ‡Æö‡ØÜ‡ÆØ, Token ID: 161268
Token: ‚ñÅ‡Æö‡ØÜ, Token ID: 73797
Token: ‡Æ§‡Øç‡Æ§‡ØÅ, Token ID: 70241
Token: ‡Øç‡Æï‡Øç‡Æï, Token ID: 132724
Token: ‡Æ®‡Øç‡Æ§, Token ID: 29835
Token: ‚ñÅ‡Æ§‡Æø‡Æ∞‡ØÅ, Token ID: 206193
Token: ‡Æü‡Øç‡Æö, Token ID: 194142
Token: ‚ñÅ‡ÆÆ‡Ææ, Token ID: 152837
Token: ‚ñÅ‡Æ®‡ØÄ, Token ID: 154024
Token: ‡Ææ‡Æµ, Token ID: 104251
Token: ‡ØÄ, Token ID: 238411
Token: ‚ñÅ‡Æú, Token ID: 185205


In [None]:
# U+0B87. The literal must be exactly one code point: unicodedata.name()
# rejects longer strings, which is what the mis-decoded literal was.
unicodedata.name('இ')

'TAMIL LETTER I'

## Chinese, Japanese, Korean (CJK) Tokenizer Exploration

https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

In [None]:
import unicodedata

# Bounds of the CJK Unified Ideographs Unicode block
cjk_start = 0x4E00
cjk_end = 0x9FFF

# Keep every (token, token ID) pair in which at least one character either
# falls inside that block or has "CJK" in its Unicode name; the name test
# also matches characters outside the 0x4E00-0x9FFF range.
# Characters without a Unicode name are given an empty name, so they never
# match on the name test.
cjk_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        cjk_start <= ord(ch) <= cjk_end
        or "CJK" in unicodedata.name(ch, "")
        for ch in token
    )
]

total_cjk_tokens = len(cjk_tokens)

# List the matching tokens first, then the summary figures
print("Chinese CJK tokens and their token IDs:")
for token, token_id in cjk_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Chinese CJK characters: {total_cjk_tokens}")

# Share of the full vocabulary taken up by the matching tokens.
# The label previously read "Devanagari" (copy-paste slip); it reports CJK.
cjk_percentage = (total_cjk_tokens / total_vocab_size) * 100
print(f"Percentage of Chinese CJK tokens in the vocabulary: {cjk_percentage:.2f}%")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Token: ‰∏çËÉΩ‰∏∫Á©∫, Token ID: 125191
Token: ÁöÑÊÉÖÊ≥Å, Token ID: 184650
Token: ‰∏ÄÂ£∞, Token ID: 47905
Token: ‚ñÅÈÇÑ, Token ID: 200101
Token: Á¨¨, Token ID: 235692
Token: ÂÖ∂‰∏≠, Token ID: 28133
Token: Áç∞, Token ID: 248543
Token: Êë∂, Token ID: 255183
Token: Èù¢Á©ç, Token ID: 141638
Token: Âπ¥„ÇÇ, Token ID: 176874
Token: ÊâÄ‰ª•, Token ID: 11875
Token: Èòü‰ºç, Token ID: 96177
Token: Á®ø, Token ID: 237142
Token: ‚ñÅÊñ∞, Token ID: 12885
Token: ‚ñÅÂπ≥Âè∞, Token ID: 155636
Token: Êïç, Token ID: 248254
Token: ÂØÜÁ†Å, Token ID: 41603
Token: Ë≥û, Token ID: 237514
Token: Èµ∫, Token ID: 249676
Token: Ô®Å, Token ID: 254459
Token: ÂõûÊÜ∂, Token ID: 219568
Token: ÁîüÁêÜ, Token ID: 129031
Token: ‚ñÅÂàÜ, Token ID: 20343
Token: Â©¥ÂÑø, Token ID: 104189
Token: Ëµü, Token ID: 249226
Token: ÈÅãË°å, Token ID: 166802
Token: Áé©ÂÆ∂, Token ID: 34673
Token: Âç©, Token ID: 254123
Token: Ë≤≥, Token ID: 243706
Token: Á°º, Token ID: 245620
Token: 

In [None]:
# U+6C49. The literal must be exactly one code point: unicodedata.name()
# rejects longer strings, which is what the mis-decoded literal was.
unicodedata.name('汉')

'CJK UNIFIED IDEOGRAPH-6C49'