In [3]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", src_lang="te_IN", use_fast=False)

# vocab = tokenizer.get_vocab()
# telugu_tokens = [tok for tok in vocab.keys() if any('\u0C00' <= ch <= '\u0C7F' for ch in tok)]

# print(telugu_tokens)

The IndicBART tokenizer is based on SentencePiece, but instead of standard subword units like BPE or WordPiece, it's trained at the Unicode character level, where each token is often a:
- Standalone consonant (క, గ, త, etc.)
- Vowel sign or diacritic (ా, ి, ీ, etc.)
- Word boundary marker (▁ for whitespace)

This is intentional for Indic scripts because:

- Indic languages are highly agglutinative, and subword segmentation can be noisy.
- It's better to model individual aksharas (syllables) or character+diacritic units instead of full words or arbitrary subwords.

In [None]:
import pandas as pd
import torch

from transformers import MT5Tokenizer, MT5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

In [9]:
model_name = "google/mt5-small"  # or "t5-base", "t5-large"
tokenizer = MT5Tokenizer.from_pretrained(model_name) # loads the tokenizer from hugging face
model = MT5ForConditionalGeneration.from_pretrained(model_name) # load the weights of the model from hugging face

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
data = pd.read_csv(r"./Datasets/SamantarDatasetWithDirectConsonantSubstitutions.csv")
print(data.columns)

Index(['OriginalText', 'AverageErrorsPerWord_0.15%', '15%_ErrorInducedText',
       'AverageErrorsPerWord_0.25%', '25%_ErrorInducedText',
       'AverageErrorsPerWord_0.35%', '35%_ErrorInducedText',
       'AverageErrorsPerWord_0.5%', '50%_ErrorInducedText'],
      dtype='object')


In [2]:
data = data[['OriginalText', '15%_ErrorInducedText']].rename(columns={'OriginalText': 'input', '15%_ErrorInducedText': 'target'})
data = data.to_dict(orient='records')

In [20]:
def preprocess(text):
    input_enc = tokenizer(text["input"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    target_enc = tokenizer(text["target"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_enc["labels"] = target_enc["input_ids"]
    return input_enc

tokenized_data = [preprocess(item) for item in data]

In [None]:
class T5Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    # 
    def __getitem__(self, idx):
        item = {k: v.squeeze() for k, v in self.data[idx].items()}
        return item

    def __len__(self):
        return len(self.data)

train_dataset = T5Dataset(tokenized_data)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model.train()
for epoch in range(1):  # choose your epoch count
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item():.4f}")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Loss: 58.3369
Loss: 62.2945
Loss: 58.1168
Loss: 58.3877
Loss: 56.9749
Loss: 60.8225
Loss: 53.8074
Loss: 61.1008
Loss: 62.5057
Loss: 56.7682
Loss: 57.3463
Loss: 54.6995
Loss: 51.3685


KeyboardInterrupt: 