# Setup

In [1]:
!pip install transformers datasets torch

import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch

Defaulting to user installation because normal site-packages is not writeable


# Dataset Loading

In [2]:
# Download the Banglish <-> Bengali transliteration corpus from the Hub.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Carve a reproducible 90/10 train/validation partition out of the 'train'
# split (fixed seed so re-running the notebook yields the same partition).
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict(
    train=train_test_split['train'],
    validation=train_test_split['test'],
)

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 4505
    })
    validation: Dataset({
        features: ['bn', 'rm'],
        num_rows: 501
    })
})


# Preprocessing

In [3]:
# Load the tokenizer that ships with the "csebuetnlp/banglat5" checkpoint.
# The same tokenizer object is reused for preprocessing, batching and decoding
# in the cells below.
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglat5")

# Define a preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of transliteration pairs for seq2seq training.

    Args:
        examples: A batch (dict of lists) with the romanized source text under
            ``'rm'`` and the Bengali target text under ``'bn'``.

    Returns:
        The tokenizer output for the source text (``input_ids`` and
        ``attention_mask``, padded/truncated to 128 tokens) with an added
        ``'labels'`` entry holding the target token ids, also of length 128.
        Padded label positions are set to ``-100`` so the loss ignores them.
    """
    inputs = examples['rm']
    targets = examples['bn']
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        padding='max_length',
        truncation=True
    )
    labels = tokenizer(
        targets,
        max_length=128,
        padding='max_length',
        truncation=True
    )
    # Padding positions must not contribute to the loss: replace them with
    # -100 (the index ignored by the cross-entropy loss). The attention mask
    # marks exactly which label positions are real tokens.
    model_inputs['labels'] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(labels['input_ids'], labels['attention_mask'])
    ]
    return model_inputs

# Run the preprocessing over both splits, one batch of examples at a time.
tokenized_datasets = dataset_dict.map(function=preprocess_function, batched=True)

# Show one processed training example as a sanity check.
first_train_example = tokenized_datasets['train'][0]
print(first_train_example)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/501 [00:00<?, ? examples/s]

{'bn': 'আপনার এফবি আইডি নেম বিশাল আর এইখানে মামুন কেন ?', 'rm': 'Apnar fb id name Bishal ar Ekhane mamun keno ?', 'input_ids': [27902, 22909, 3580, 1363, 20, 3069, 13374, 1613, 7873, 1155, 10278, 2012, 28368, 806, 4653, 15645, 29392, 245, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [171, 4598, 262,

# Model

In [4]:
# Load the pretrained sequence-to-sequence model from the same
# "csebuetnlp/banglat5" checkpoint the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Training Setup

In [5]:
# Collator that pads every field of a batch (inputs and labels) to a common
# length; passing the model lets it also prepare decoder inputs from labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is deprecated in favour of `eval_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    # Checkpoints are still written every 500 optimizer steps. The former
    # `eval_steps=500` was dropped: it has no effect when evaluation is
    # scheduled per epoch.
    save_steps=500,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    # `tokenizer=` is deprecated on the Trainer; `processing_class` is its
    # replacement and avoids the FutureWarning emitted by this cell.
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


# Train

In [None]:
# Fine-tune the model with the trainer built in the previous cell
# (num_train_epochs=3, evaluation at the end of every epoch).
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


# Evaluation

In [None]:
# Evaluate on the validation split that was passed to the trainer as
# `eval_dataset`, then print whatever results the trainer returns.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Inference

In [None]:
def transliterate(text, max_length=128):
    """Transliterate romanized Bengali ("Banglish") text into Bengali script.

    Args:
        text: The romanized input string (or a list of strings; only the
            first generated sequence is decoded and returned).
        max_length: Maximum number of tokens kept from the input and maximum
            length of the generated sequence. Defaults to 128, matching the
            length used during preprocessing.

    Returns:
        The decoded model output with special tokens stripped.
    """
    # Pad only to the longest sequence in the call instead of to a fixed
    # length; for a single string this adds no padding at all.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        padding=True,
        truncation=True,
    )
    # After training the model may live on a GPU; the input tensors must be
    # on the same device or generation fails with a device-mismatch error.
    inputs = inputs.to(model.device)
    # Pass an explicit length cap so longer sentences are not cut off by the
    # library's short default generation length.
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test: push one romanized sentence through the fine-tuned model and
# show the input next to the generated Bengali text.
sample_text = "ami tomake bhalobashi"
transliterated_text = transliterate(sample_text)
print(
    f"Banglish: {sample_text}",
    f"Bengali: {transliterated_text}",
    sep="\n",
)