1. Load the Dataset

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset from the Hugging Face Hub
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Hold out 20% of the 'train' split for validation. The fixed seed keeps the
# partition identical across kernel restarts, so evaluation results from
# different runs are measured on the same examples.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

2. Data Preprocessing

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer

# The tokenizer must come from the same checkpoint as the model so that token
# ids line up with the model's embedding table. The model cell loads
# "google/byt5-small", so load that checkpoint's tokenizer here as well.
# NOTE(review): ByT5 tokenizes at the byte level, so sequence lengths are
# counted in UTF-8 bytes (Bengali characters take several bytes each) —
# confirm that the max_length used during tokenization is large enough.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")


# Tokenize the datasets
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with an 'rm' column (source strings) and a
            'bn' column (target strings), as supplied by
            ``Dataset.map(..., batched=True)``.
        max_length: Length, in tokens, to which both sides are truncated and
            padded. Defaults to 128.

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100.
    """
    model_inputs = tokenizer(
        examples['rm'],
        max_length=max_length,
        truncation=True,
        padding="max_length",  # Ensure padding to the max length
    )
    # `text_target` routes the targets through the tokenizer's target-side
    # processing, which matters for tokenizers whose source and target
    # vocabularies differ.
    labels = tokenizer(
        text_target=examples['bn'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    # Replace label padding with -100, the index the Transformers seq2seq
    # losses ignore; otherwise the model is also trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the batched tokenizer over both splits (training first, then validation)
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

3. Select a Model

In [None]:
# MBartForConditionalGeneration stays imported in case other cells reference it.
from transformers import AutoModelForSeq2SeqLM, MBartForConditionalGeneration

# Load the pre-trained ByT5 model. "google/byt5-small" is a T5-architecture
# checkpoint, so it must not be loaded through the mBART class; the Auto class
# reads the checkpoint's config and instantiates the matching architecture.
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")

4. Train the Model

In [None]:
import inspect

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Two keywords used below were renamed across Transformers releases:
# `evaluation_strategy` -> `eval_strategy` (training arguments) and
# `tokenizer` -> `processing_class` (trainer). The library version is not
# pinned in this notebook, so use whichever spelling the installed release
# accepts instead of failing with an unexpected-keyword error.
_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir='./logs',
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    **{_tokenizer_key: tokenizer},
)

# Train the model
trainer.train()