In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the slow (SentencePiece-backed) IndicBART tokenizer with the options
# used on the IndicBART model card, passed explicitly rather than relying on
# tokenizer defaults:
#   - do_lower_case=False keeps the input's original casing.
#   - keep_accents=True stops the preprocessing step from NFKD-normalising
#     the text and dropping combining marks (e.g. the virama and nukta),
#     which would corrupt Indic-script input before it is tokenized.
# The language is not chosen through a `src_lang` argument; IndicBART marks
# it with tags such as "<2te>" that are added to the text itself.
tokenizer = AutoTokenizer.from_pretrained(
    "ai4bharat/IndicBART",
    do_lower_case=False,
    use_fast=False,
    keep_accents=True,
)

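The IndicBART tokenizer is a SentencePiece subword model. According to the IndicBART model card, every supported language except English is represented in Devanagari script, so Telugu text is expected to be transliterated to Devanagari before it is tokenized. As a consequence, the vocabulary contains only a few dozen Telugu-script entries (printed below), and all of them are single Unicode characters. Telugu-script text passed in directly is therefore split at the character level rather than into standard subword units like BPE or WordPiece pieces, where each token is often a: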
- Standalone consonant (క, గ, త, etc.)
- Vowel sign or diacritic (ా, ి, ీ, etc.)
- Word boundary marker (▁ for whitespace)

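Character-level tokens are sometimes argued to suit Indic scripts because:

- Many Indic languages (Telugu in particular) are highly agglutinative, so subword segmentation can be noisy.
- Modelling individual characters and character + diacritic units avoids depending on full words or arbitrary subwords. Note, however, that the Telugu tokens in this vocabulary are single Unicode code points, not complete aksharas (syllables).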

In [9]:
# Vocabulary mapping of the tokenizer; its keys are the token strings.
vocab = tokenizer.get_vocab()

# Collect every vocabulary entry that holds at least one code point from the
# Telugu Unicode block, U+0C00 through U+0C7F inclusive.
telugu_tokens = [
    token
    for token in vocab
    if any(0x0C00 <= ord(char) <= 0x0C7F for char in token)
]

In [11]:
# Display every vocabulary entry that contains a Telugu-block character.
print(telugu_tokens)

['్', 'ు', 'ల', 'ర', 'ి', 'స', 'ా', 'ె', '౼', 'త', 'గ', 'క', 'ట', 'న', 'ం', 'ప', 'య', 'ో', 'మ', 'ద', 'బ', 'ీ', 'డ', 'ూ', 'వ', 'జ', 'ొ', 'హ', 'ై', 'చ', 'ే', 'ఒ', '౹', 'అ', 'ష', 'ఆ', 'ఎ', 'ఫ', 'శ', 'ఉ', 'ఇ', 'ణ', '౾']
