In [6]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Pull the variables declared in the .env file into the environment
load_dotenv()

# Read the Hugging Face token; this is None when HF_TOKEN is not set
hf_token = os.environ.get("HF_TOKEN")

# Authenticate with Hugging Face using that token
login(token=hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/sagdesai/.cache/huggingface/token
Login successful


In [7]:
from transformers import AutoTokenizer
# Load the tokenizer registered under the "google/gemma-7b" model id.
# NOTE(review): presumably relies on the Hugging Face login done in the earlier cell — confirm.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

In [8]:
# Token -> token ID mapping exposed by the tokenizer
vocab = tokenizer.get_vocab()

# Number of entries in that mapping
total_vocab_size = len(vocab)

# Report the vocabulary size
print(f"Total number of tokens: {total_vocab_size}")


Total number of tokens: 256000


#### Print first 500 and last 500 tokens from the tokenizer

In [9]:
# Materialise the (token, token ID) pairs, then order them by ascending token ID
sorted_vocab = list(vocab.items())
sorted_vocab.sort(key=lambda pair: pair[1])

# Show the 500 entries with the lowest token IDs
print("First 500 tokens and their token IDs:")
for token, token_id in sorted_vocab[:500]:
    print(f"Token: {token}, Token ID: {token_id}")

First 500 tokens and their token IDs:
Token: <pad>, Token ID: 0
Token: <eos>, Token ID: 1
Token: <bos>, Token ID: 2
Token: <unk>, Token ID: 3
Token: <mask>, Token ID: 4
Token: <2mass>, Token ID: 5
Token: [@BOS@], Token ID: 6
Token: <unused0>, Token ID: 7
Token: <unused1>, Token ID: 8
Token: <unused2>, Token ID: 9
Token: <unused3>, Token ID: 10
Token: <unused4>, Token ID: 11
Token: <unused5>, Token ID: 12
Token: <unused6>, Token ID: 13
Token: <unused7>, Token ID: 14
Token: <unused8>, Token ID: 15
Token: <unused9>, Token ID: 16
Token: <unused10>, Token ID: 17
Token: <unused11>, Token ID: 18
Token: <unused12>, Token ID: 19
Token: <unused13>, Token ID: 20
Token: <unused14>, Token ID: 21
Token: <unused15>, Token ID: 22
Token: <unused16>, Token ID: 23
Token: <unused17>, Token ID: 24
Token: <unused18>, Token ID: 25
Token: <unused19>, Token ID: 26
Token: <unused20>, Token ID: 27
Token: <unused21>, Token ID: 28
Token: <unused22>, Token ID: 29
Token: <unused23>, Token ID: 30
Token: <unused24>, T

In [10]:
# Show the 500 entries with the highest token IDs (sorted_vocab is ordered by ID)
print("Last 500 tokens and their token IDs:")
for token, token_id in sorted_vocab[-500:]:
    print(f"Token: {token}, Token ID: {token_id}")

Last 500 tokens and their token IDs:
Token: 푏, Token ID: 255500
Token: 푘, Token ID: 255501
Token: , Token ID: 255502
Token: , Token ID: 255503
Token: , Token ID: 255504
Token: , Token ID: 255505
Token: , Token ID: 255506
Token: , Token ID: 255507
Token: , Token ID: 255508
Token: , Token ID: 255509
Token: , Token ID: 255510
Token: , Token ID: 255511
Token: , Token ID: 255512
Token: , Token ID: 255513
Token: 料, Token ID: 255514
Token: ﺀ, Token ID: 255515
Token: ﻀ, Token ID: 255516
Token: 𑄝, Token ID: 255517
Token: 𓄹, Token ID: 255518
Token: 𓆏, Token ID: 255519
Token: 𖡼, Token ID: 255520
Token: 𝔨, Token ID: 255521
Token: 𝕫, Token ID: 255522
Token: 𝖢, Token ID: 255523
Token: 𝖿, Token ID: 255524
Token: 𝜓, Token ID: 255525
Token: 🅤, Token ID: 255526
Token: 🅺, Token ID: 255527
Token: 🦣, Token ID: 255528
Token: 🧆, Token ID: 255529
Token: Ȏ, Token ID: 255530
Token: ͽ, Token ID: 255531
Token: ϫ, Token ID: 255532
Token: Ϻ, Token ID: 255533
Token: ԍ, Token ID: 255534
Token: ۠, Token ID

## Telugu Tokenizer Exploration

https://en.wikipedia.org/wiki/Telugu_(Unicode_block)

In [11]:
import unicodedata

# Bounds of the Telugu Unicode block (see the Wikipedia page linked above).
telugu_start = 0x0C00
telugu_end = 0x0C7F

# A token qualifies when at least one of its characters either lies inside the
# Telugu block or carries "TELUGU" in its Unicode name. unicodedata.name() is
# given an empty-string default so code points without a name simply fail the
# name test instead of raising ValueError.
telugu_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        telugu_start <= ord(char) <= telugu_end
        or "TELUGU" in unicodedata.name(char, "")
        for char in token
    )
]

# Number of vocabulary entries that matched
total_telugu_tokens = len(telugu_tokens)

# List every matching token with its ID, then the overall count
print("Telugu tokens and their token IDs:")
for token, token_id in telugu_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Telugu characters: {total_telugu_tokens}")

# Share of the whole vocabulary, as a percentage
telugu_percentage = (total_telugu_tokens / total_vocab_size) * 100
print(f"Percentage of Telugu tokens in the vocabulary: {telugu_percentage:.2f}%")

Telugu tokens and their token IDs:
Token: ్ర, Token ID: 34970
Token: గ, Token ID: 237467
Token: ▁ద, Token ID: 89331
Token: ను, Token ID: 56508
Token: ▁బ, Token ID: 85990
Token: ంక, Token ID: 191046
Token: త, Token ID: 236930
Token: ▁వి, Token ID: 91918
Token: ఙ, Token ID: 254013
Token: ▁తె, Token ID: 232117
Token: ఠ, Token ID: 244359
Token: వి, Token ID: 97356
Token: ాయ, Token ID: 90966
Token: కర్య, Token ID: 197972
Token: ▁సౌకర్య, Token ID: 198656
Token: వ, Token ID: 236936
Token: ు, Token ID: 236336
Token: ని, Token ID: 37546
Token: సు, Token ID: 149344
Token: ఏ, Token ID: 241913
Token: డు, Token ID: 83296
Token: ర, Token ID: 236442
Token: ▁శ, Token ID: 126865
Token: దు, Token ID: 140322
Token: ౫, Token ID: 249546
Token: ఈ, Token ID: 241557
Token: ట, Token ID: 237499
Token: ప్ప, Token ID: 221332
Token: ౨, Token ID: 249050
Token: ంగా, Token ID: 151061
Token: ూ, Token ID: 238529
Token: ఓ, Token ID: 243752
Token: ఱ, Token ID: 247792
Token: ▁య, Token ID: 144782
Token: ము, Token ID: 84022

In [12]:
# Sanity check: look up the Unicode name of a Latin letter.
unicodedata.name('A')

'LATIN CAPITAL LETTER A'

In [13]:
# Look up the Unicode name of a Telugu letter; the name contains "TELUGU".
unicodedata.name('ఇ')

'TELUGU LETTER I'

In [14]:
# Break a Telugu word into its individual code points and show each one
# followed by its Unicode name.
test_token = "స్త్రీ"  # Woman - Sthree
for char in test_token:
    print(char, unicodedata.name(char), sep="\n")

స
TELUGU LETTER SA
్
TELUGU SIGN VIRAMA
త
TELUGU LETTER TA
్
TELUGU SIGN VIRAMA
ర
TELUGU LETTER RA
ీ
TELUGU VOWEL SIGN II


## Hindi (Devanagari) Tokenizer Exploration

https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)

In [15]:
import unicodedata

# Bounds of the Devanagari Unicode block
devanagari_start = 0x0900
devanagari_end = 0x097F

# A token qualifies when at least one of its characters either lies inside the
# Devanagari block or carries "DEVANAGARI" in its Unicode name.
# unicodedata.name() is given an empty-string default so code points without a
# name simply fail the name test instead of raising ValueError.
devanagari_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        devanagari_start <= ord(char) <= devanagari_end
        or "DEVANAGARI" in unicodedata.name(char, "")
        for char in token
    )
]

# Number of vocabulary entries that matched
total_devanagari_tokens = len(devanagari_tokens)

# List every matching token with its ID, then the overall count
print("Hindi (Devanagari) tokens and their token IDs:")
for token, token_id in devanagari_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Hindi (Devanagari) characters: {total_devanagari_tokens}")

# Share of the whole vocabulary, as a percentage
devanagari_percentage = (total_devanagari_tokens / total_vocab_size) * 100
print(f"Percentage of Devanagari tokens in the vocabulary: {devanagari_percentage:.2f}%")

Hindi (Devanagari) tokens and their token IDs:
Token: द्ध, Token ID: 54736
Token: कृ, Token ID: 78890
Token: कर्, Token ID: 185301
Token: ▁फल, Token ID: 193553
Token: च्, Token ID: 43856
Token: ोड़, Token ID: 153934
Token: ▁गिर, Token ID: 228929
Token: ▁क्र, Token ID: 82046
Token: कड, Token ID: 168322
Token: ्ह, Token ID: 202298
Token: ंड, Token ID: 42330
Token: ▁पहुंच, Token ID: 165395
Token: ▁अपनी, Token ID: 86875
Token: ▁कॉ, Token ID: 152273
Token: र्च, Token ID: 198274
Token: बी, Token ID: 91803
Token: ॒, Token ID: 246969
Token: ेत्र, Token ID: 85977
Token: ▁प्रकार, Token ID: 107850
Token: ▁कर, Token ID: 10494
Token: ष्ट, Token ID: 52115
Token: िल्म, Token ID: 150305
Token: ▁बै, Token ID: 100581
Token: ढ़, Token ID: 48240
Token: ॲ, Token ID: 250047
Token: ानी, Token ID: 59607
Token: ▁राष्ट, Token ID: 165826
Token: ोड, Token ID: 62331
Token: ▁खु, Token ID: 99919
Token: ृत, Token ID: 187537
Token: ंत्र, Token ID: 79209
Token: ष्ण, Token ID: 226458
Token: री, Token ID: 19105
Token: ▁द

In [None]:
# Look up the Unicode name of a Devanagari letter; the name contains "DEVANAGARI".
unicodedata.name('आ')

'DEVANAGARI LETTER AA'

In [None]:
# Break a Hindi word into its individual code points and show each one
# followed by its Unicode name.
test_token = "राज्य"  # Raajy - State
for char in test_token:
    print(char, unicodedata.name(char), sep="\n")

र
DEVANAGARI LETTER RA
ा
DEVANAGARI VOWEL SIGN AA
ज
DEVANAGARI LETTER JA
्
DEVANAGARI SIGN VIRAMA
य
DEVANAGARI LETTER YA


## Tamil Tokenizer Exploration

https://en.wikipedia.org/wiki/Tamil_(Unicode_block)

In [None]:
import unicodedata

# Bounds of the Tamil Unicode block
tamil_start = 0x0B80
tamil_end = 0x0BFF

# A token qualifies when at least one of its characters either lies inside the
# Tamil block or carries "TAMIL" in its Unicode name. unicodedata.name() is
# given an empty-string default so code points without a name simply fail the
# name test instead of raising ValueError.
tamil_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        tamil_start <= ord(char) <= tamil_end
        or "TAMIL" in unicodedata.name(char, "")
        for char in token
    )
]

# Number of vocabulary entries that matched
total_tamil_tokens = len(tamil_tokens)

# List every matching token with its ID, then the overall count
print("Tamil tokens and their token IDs:")
for token, token_id in tamil_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Tamil characters: {total_tamil_tokens}")

# Share of the whole vocabulary, as a percentage.
# The label previously said "Devanagari" (copy-paste slip); it reports Tamil.
tamil_percentage = (total_tamil_tokens / total_vocab_size) * 100
print(f"Percentage of Tamil tokens in the vocabulary: {tamil_percentage:.2f}%")

Tamil tokens and their token IDs:
Token: ேன், Token ID: 218728
Token: வ், Token ID: 209767
Token: ▁ர, Token ID: 150297
Token: ▁பே, Token ID: 182188
Token: ௪, Token ID: 250066
Token: மைய, Token ID: 223887
Token: ▁உ, Token ID: 48645
Token: ▁கே, Token ID: 224494
Token: ிகள், Token ID: 178507
Token: ட்டு, Token ID: 88862
Token: ிக்க, Token ID: 231128
Token: மா, Token ID: 130871
Token: வும், Token ID: 155722
Token: பெ, Token ID: 218001
Token: ணை, Token ID: 185623
Token: ▁பு, Token ID: 131746
Token: ▁செய, Token ID: 161268
Token: ▁செ, Token ID: 73797
Token: த்து, Token ID: 70241
Token: ்க்க, Token ID: 132724
Token: ந்த, Token ID: 29835
Token: ▁திரு, Token ID: 206193
Token: ட்ச, Token ID: 194142
Token: ▁மா, Token ID: 152837
Token: ▁நீ, Token ID: 154024
Token: ாவ, Token ID: 104251
Token: ீ, Token ID: 238411
Token: ▁ஜ, Token ID: 185205
Token: மை, Token ID: 100228
Token: ப், Token ID: 21484
Token: வர், Token ID: 131100
Token: ▁என்று, Token ID: 194913
Token: சிய, Token ID: 221937
Token: ்கள, Token

In [None]:
# Look up the Unicode name of a Tamil letter; the name contains "TAMIL".
unicodedata.name('இ')

'TAMIL LETTER I'

## Chinese, Japanese, Korean (CJK) Unicode Block

https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

In [None]:
import unicodedata

# Bounds of the CJK Unified Ideographs Unicode block
cjk_start = 0x4E00
cjk_end = 0x9FFF

# A token qualifies when at least one of its characters either lies inside the
# CJK Unified Ideographs block or carries "CJK" in its Unicode name.
# unicodedata.name() is given an empty-string default so code points without a
# name simply fail the name test instead of raising ValueError.
cjk_tokens = [
    (token, token_id)
    for token, token_id in vocab.items()
    if any(
        cjk_start <= ord(char) <= cjk_end
        or "CJK" in unicodedata.name(char, "")
        for char in token
    )
]

# Number of vocabulary entries that matched
total_cjk_tokens = len(cjk_tokens)

# List every matching token with its ID, then the overall count
print("Chinese CJK tokens and their token IDs:")
for token, token_id in cjk_tokens:
    print(f"Token: {token}, Token ID: {token_id}")

print(f"Total number of tokens with Chinese CJK characters: {total_cjk_tokens}")

# Share of the whole vocabulary, as a percentage.
# The label previously said "Devanagari" (copy-paste slip); it reports CJK.
cjk_percentage = (total_cjk_tokens / total_vocab_size) * 100
print(f"Percentage of Chinese CJK tokens in the vocabulary: {cjk_percentage:.2f}%")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Token: 不能为空, Token ID: 125191
Token: 的情況, Token ID: 184650
Token: 一声, Token ID: 47905
Token: ▁還, Token ID: 200101
Token: 第, Token ID: 235692
Token: 其中, Token ID: 28133
Token: 獰, Token ID: 248543
Token: 摶, Token ID: 255183
Token: 面積, Token ID: 141638
Token: 年も, Token ID: 176874
Token: 所以, Token ID: 11875
Token: 队伍, Token ID: 96177
Token: 稿, Token ID: 237142
Token: ▁新, Token ID: 12885
Token: ▁平台, Token ID: 155636
Token: 敍, Token ID: 248254
Token: 密码, Token ID: 41603
Token: 賞, Token ID: 237514
Token: 鵺, Token ID: 249676
Token: 度, Token ID: 254459
Token: 回憶, Token ID: 219568
Token: 生理, Token ID: 129031
Token: ▁分, Token ID: 20343
Token: 婴儿, Token ID: 104189
Token: 赟, Token ID: 249226
Token: 運行, Token ID: 166802
Token: 玩家, Token ID: 34673
Token: 卩, Token ID: 254123
Token: 貳, Token ID: 243706
Token: 硼, Token ID: 245620
Token: 殺人, Token ID: 147709
Token: 烁, Token ID: 241468
Token: 的确, Token ID: 103294
Token: 獨, Token ID: 238813
T

In [None]:
# Look up the Unicode name of a Chinese character; the name contains "CJK".
unicodedata.name('汉')

'CJK UNIFIED IDEOGRAPH-6C49'