In [1]:
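# 1. Install Necessary Libraries
# Note: Skip this step if the packages are already installed.
# protobuf is included because loading the mBART-50 tokenizer/model raises an
# ImportError ("requires the protobuf library") when it is missing.
# !pip install datasets transformers sentencepiece protobuf torch --upgrade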

# 2. Configure the PyTorch CUDA allocator (optional)
# Limiting the split size helps avoid memory fragmentation. The variable is
# written here, ahead of the `import torch` further down.
import os
os.environ.update(PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb:128')

# 3. Import Libraries
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
import gc

# 4. Define a Function to Clear GPU Memory
def clear_memory():
    """Free unreferenced objects and cached CUDA memory, then print GPU usage.

    Garbage collection runs *before* the CUDA cache is emptied: tensors that
    are only kept alive by reference cycles still own their memory blocks
    until they are collected, so emptying the cache first could not release
    those blocks.

    Prints the memory allocated and reserved on GPU 0 in GB. When CUDA is not
    available both figures are reported as 0.00 without touching the CUDA API.
    """
    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        allocated_bytes = torch.cuda.memory_allocated(0)
        reserved_bytes = torch.cuda.memory_reserved(0)
    else:
        allocated_bytes = reserved_bytes = 0

    print(f"GPU Memory Allocated: {allocated_bytes/1024**3:.2f} GB")
    print(f"GPU Memory Reserved: {reserved_bytes/1024**3:.2f} GB")

# 5. Start from a clean memory state
clear_memory()

# 6. Download / load the dataset
# Make sure "SKNahin/bengali-transliteration-data" is the intended dataset identifier
dataset = load_dataset("SKNahin/bengali-transliteration-data")
full_train = dataset['train']

# Show what was loaded: overall structure, column types and one raw row
print("Dataset Structure:", dataset)
print("Features:", full_train.features)
print("First Example:", full_train[0])

# 7. Carve out training and validation subsets
# 80/20 split with a fixed seed so the partition is reproducible
train_test_split = full_train.train_test_split(test_size=0.2, seed=42)

# Cap the subset sizes to match the available resources:
# at most 1000 rows for training and 200 for validation
train_size = min(1000, train_test_split['train'].num_rows)
val_size = min(200, train_test_split['test'].num_rows)

train_dataset = train_test_split['train'].select(range(train_size))
val_dataset = train_test_split['test'].select(range(val_size))

# 8. Filter Out Rows with Empty Inputs or Outputs
def filter_empty_examples(example):
    """Return True only when both text fields of a row are non-blank.

    Args:
        example: A dataset row with an "rm" (Banglish) and a "bn" (Bengali)
            entry.

    Returns:
        bool: False if either field is None, empty, or whitespace only;
        True otherwise. None is treated as empty so that rows with missing
        values are filtered out instead of raising AttributeError.
    """
    return all((example[field] or "").strip() for field in ("rm", "bn"))

train_dataset = train_dataset.filter(function=filter_empty_examples)
val_dataset = val_dataset.filter(function=filter_empty_examples)

# Report how many rows survived the filter
print(f"Train dataset size after filtering: {train_dataset.num_rows}")
print(f"Validation dataset size after filtering: {val_dataset.num_rows}")

# 9. Load the Tokenizer and Model
# mBART-50 is used for its multilingual coverage; tokenizer and model are
# loaded from the same checkpoint.
checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)
model = MBartForConditionalGeneration.from_pretrained(checkpoint)

# 10. Enable Gradient Checkpointing (Optional)
# Trades compute for memory: some activations are recomputed during the
# backward pass instead of being stored.
model.gradient_checkpointing_enable()

# 11. Set Source and Target Language Codes
# mBART-50 only understands its own language codes. Bengali is "bn_IN";
# "ben_XX" is not part of the mBART-50 code list.
source_lang = "en_XX"  # Since Banglish is similar to English script
target_lang = "bn_IN"  # Bengali

tokenizer.src_lang = source_lang
tokenizer.tgt_lang = target_lang  # used when tokenizing with text_target=

# 12. Data Preprocessing Function
def preprocess_function(examples):
    """Tokenize a batch of Banglish/Bengali pairs for seq2seq training.

    Args:
        examples: A batch (dict of lists) with "rm" (Banglish inputs) and
            "bn" (Bengali targets).

    Returns:
        The tokenized inputs with an added "labels" entry holding the
        target token ids.

    Notes:
        * Targets are tokenized through ``text_target=`` so they are encoded
          in target mode and carry the ``tokenizer.tgt_lang`` code rather
          than the source-language code.
        * No padding is applied here. Padding inside ``map`` would pad every
          row to the longest row of the whole map batch; the data collator
          passed to the trainer pads each training batch instead and fills
          label padding with -100.
    """
    # Same task prefix that translate_text() prepends at inference time
    inputs = [f"translate en to ben: {text}" for text in examples["rm"]]

    # Tokenize the inputs (Banglish); sequences longer than 64 tokens are cut
    model_inputs = tokenizer(
        inputs,
        max_length=64,
        truncation=True,
    )

    # Tokenize the targets (Bengali) in target-language mode
    labels = tokenizer(
        text_target=examples["bn"],
        max_length=64,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 13. Apply the Preprocessing to the Datasets
# The raw text columns are dropped so only model-ready fields remain.
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["rm", "bn"])
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=["rm", "bn"])

# 14. Verify Preprocessed Samples
# Inspect a few preprocessed samples to ensure correctness.
# The count is capped by the dataset size so a small (or heavily filtered)
# dataset does not raise IndexError.
print("\nSample Preprocessed Training Examples:")
for i in range(min(3, len(train_dataset))):
    sample = train_dataset[i]  # fetch the row once instead of once per field
    print(f"\nSample {i+1}:")
    print("Input IDs:", sample['input_ids'])
    print("Labels:", sample['labels'])
    print("Decoded Input:", tokenizer.decode(sample['input_ids'], skip_special_tokens=True))
    # -100 marks ignored label positions and is not a real token id
    decoded_labels = [label for label in sample['labels'] if label != -100]
    print("Decoded Label:", tokenizer.decode(decoded_labels, skip_special_tokens=True))

# 15. Create a Data Collator
# Pads inputs and labels per batch at training/evaluation time; label padding
# uses -100 so those positions are ignored by the loss.
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100, padding=True)

# 16. Configure Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",              # Evaluate at the end of every epoch
    save_strategy="epoch",              # Must match eval_strategy for load_best_model_at_end
    learning_rate=3e-5,                 # A typical starting point
    per_device_train_batch_size=2,      # Small batches to fit in GPU memory
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),     # Mixed precision only when a CUDA device is present
    logging_dir='./logs',
    logging_steps=100,                  # Log every 100 steps
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none",                   # Disable external experiment reporting
    gradient_accumulation_steps=8,      # 2 x 8 = effective batch size of 16 per device
)

# 17. Initialize the Trainer
# Gather the trainer's inputs in one place, then build it.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # pads each batch dynamically
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# 18. Clear Memory Before Training
clear_memory()

# 19. Train the Model
trainer.train()

# 20. Save the Model and Tokenizer
# Both are written to the same directory so they can be reloaded together.
model_output_dir = "./banglish_to_bangla_model"
trainer.save_model(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

# 21. Define the Translation Function
def translate_text(input_text, model, tokenizer, target_lang="bn_IN"):
    """Convert one Banglish sentence to Bengali with an mBART-50 model.

    Args:
        input_text: The Banglish (romanized Bengali) sentence.
        model: Seq2seq model used for generation; it is moved to the GPU when
            one is available and switched to eval mode.
        tokenizer: Tokenizer matching ``model``.
        target_lang: mBART-50 language code forced as the first generated
            token. Defaults to "bn_IN" (Bengali).

    Returns:
        str: The decoded output with special tokens removed.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    # Same task prefix that was prepended to the training inputs
    prompt = f"translate en to ben: {input_text}"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=64,          # Match the max_length used in preprocessing
        truncation=True,
        padding=True
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # mBART-50 selects the output language through the first
            # generated token, so it has to be forced explicitly.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
            max_length=64,
            num_beams=5,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 22. Try the model on a handful of hand-written Banglish sentences
sample_sentences = [
    "ami bhalo achi",
    "tumi kemon acho",
    "tara bazar jachhe",
    "amar naam Iqbal",
    "ami school jabo",
    "tader bari kothay?",
    "aaj amar birthday",
    "ami ekta book porchi",
    "shey bhai bole",
    "amader desh sundor",
    "tumi ki khaccho?",
    "ami cinema dekhte jabo",
    "ajker din ta boro kothin",
    "ami tomake bhalobashi",
    "amar bhai doctor",
]

# Run every sentence through the model and show input and output side by side
print("\nSample Translations:")
for sentence in sample_sentences:
    translated = translate_text(input_text=sentence, model=model, tokenizer=tokenizer)
    print(f"Banglish: {sentence}\nBengali: {translated}\n")


GPU Memory Allocated: 0.00 GB
GPU Memory Reserved: 0.00 GB


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Dataset Structure: DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})
Features: {'bn': Value(dtype='string', id=None), 'rm': Value(dtype='string', id=None)}
First Example: {'bn': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'rm': 'scroll kore 20/30 second er video pann nai???'}


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Train dataset size after filtering: 1000
Validation dataset size after filtering: 200


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
