In [1]:

!pip install transformers datasets evaluate




# Import Libraries

In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from datasets import load_dataset, DatasetDict
import evaluate
import numpy as np
import random


# Load and Explore the Dataset

In [3]:
dataset = load_dataset("SKNahin/bengali-transliteration-data")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


In [5]:
print(dataset.keys())

dict_keys(['train'])


In [6]:
if 'train' in dataset:
    split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'validation': split_dataset['test']
    })
    print(dataset)
else:
    raise ValueError("Dataset does not have a 'train' split.")

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 4004
    })
    validation: Dataset({
        features: ['bn', 'rm'],
        num_rows: 1002
    })
})


# Data Preprocessing

### Inspect dataset

In [7]:
print("Training Example:")
print(f"Bangla (bn): {dataset['train'][0]['bn']}")
print(f"Banglish (rm): {dataset['train'][0]['rm']}")

Training Example:
Bangla (bn): এটা কোনো পোস্ট হলো মিয়া আবাল
Banglish (rm): eta kono post holo mia abal


In [8]:
print("\nValidation Example:")
print(f"Bangla (bn): {dataset['validation'][0]['bn']}")
print(f"Banglish (rm): {dataset['validation'][0]['rm']}")


Validation Example:
Bangla (bn): ভালো করে ট্রাই করেন পাবেন..
Banglish (rm): valo kore trai koren paben..


### Clean data

In [9]:
def filter_examples(example, min_length=5, max_length=100):
    return (
        min_length <= len(example['rm'].split()) <= max_length and
        min_length <= len(example['bn'].split()) <= max_length
    )

dataset = dataset.filter(filter_examples)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 2651
    })
    validation: Dataset({
        features: ['bn', 'rm'],
        num_rows: 666
    })
})


### Tokenization

In [10]:
source_lang = "en_XX"  # Represents Banglish (similar to English)
target_lang = "bn_IN"  # Bengali

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")


In [11]:
def preprocess_function(examples):
    inputs = [f"{source_lang} {text}" for text in examples['rm']]
    targets = [text for text in examples['bn']]
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [12]:
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)


In [13]:
print(tokenized_dataset['train'][0])

{'input_ids': tensor([250004, 250004,    522,  80819,   1305, 136346,   8132,     10,   3571,
             2,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,   

# Model Selection

**Choosing the Model: mBART**

Justification:

- **Performance:** mBART is a multilingual sequence-to-sequence model pre-trained for various translation tasks, making it suitable for transliteration.
- **Suitability for Low-Resource Languages:** It supports Bengali (bn_BN) and has been trained on multiple languages, facilitating better generalization.
- **Efficiency:** Balances performance with computational requirements, making it feasible to train on platforms like Google Colab.


In [14]:

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")


model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id


# Training the Model

In [15]:
# Define training arguments with reduced batch size and gradient accumulation
training_args = Seq2SeqTrainingArguments(
    output_dir="./banglish-to-bangla",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,  # Further reduced batch size
    per_device_eval_batch_size=2,   # Further reduced batch size
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True,  # Use mixed precision if GPU supports
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
)




In [16]:
# Load BLEU metric using the evaluate library
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Split into words
    decoded_preds = [pred.split() for pred in decoded_preds]
    decoded_labels = [[label.split()] for label in decoded_labels]

    # Compute BLEU score
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}


In [17]:
# Initialize the Trainer with compute_metrics
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [None]:

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbsse1307[0m ([33mbsse1307-university-of-dhaka[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


In [1]:
# =============================
# Step 1: Setup and Installation
# =============================

# Suppress TensorFlow warnings (optional, since we're using PyTorch)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings

# Install necessary libraries
!pip install --upgrade transformers datasets evaluate

# Import libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from datasets import load_dataset, DatasetDict
import evaluate
import numpy as np
import random

# =============================
# Step 2: Load and Explore the Dataset
# =============================

# Load the dataset from Hugging Face
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Inspect the dataset
print("Dataset Structure:")
print(dataset)

# Check available splits
print("\nAvailable Splits:")
print(dataset.keys())

# Split the dataset into training and validation if only 'train' exists
if 'train' in dataset:
    # **Optimization 1: Limit the dataset size for faster training**
    # Adjust `num_train_samples` and `num_val_samples` based on your requirements and hardware
    num_train_samples = 100  # Reduced to 100 samples for training
    num_val_samples = 25     # Reduced to 25 samples for validation
    total_samples = num_train_samples + num_val_samples

    # Shuffle and select the required number of samples
    dataset['train'] = dataset['train'].shuffle(seed=42).select(range(total_samples))

    # Split into training and validation
    split_dataset = dataset['train'].train_test_split(test_size=num_val_samples, seed=42)

    # Create a DatasetDict with train and validation
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'validation': split_dataset['test']
    })
    print(f"\nAfter Splitting and Limiting to {total_samples} samples:")
    print(dataset)
else:
    raise ValueError("Dataset does not have a 'train' split.")

# Inspect sample data
print("\nSample Training Example:")
print(f"Bangla (bn): {dataset['train'][0]['bn']}")
print(f"Banglish (rm): {dataset['train'][0]['rm']}")

print("\nSample Validation Example:")
print(f"Bangla (bn): {dataset['validation'][0]['bn']}")
print(f"Banglish (rm): {dataset['validation'][0]['rm']}")

# =============================
# Step 3: Data Preprocessing
# =============================

# Define a filtering function to remove too short or too long examples
def filter_examples(example, min_length=5, max_length=100):
    return (
        min_length <= len(example['rm'].split()) <= max_length and
        min_length <= len(example['bn'].split()) <= max_length
    )

# Apply filtering to both training and validation sets
dataset = dataset.filter(filter_examples)
print("\nAfter Filtering:")
print(dataset)

# Define source and target language codes for mBART
source_lang = "en_XX"  # Represents Banglish (similar to English)
target_lang = "bn_IN"  # Correct language code for Bengali

# Load the T5-small tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Verify available language codes (not applicable for T5, but keeping for consistency)
print("\nAvailable Language Codes:")
print(tokenizer.lang_code_to_id.keys()) if hasattr(tokenizer, 'lang_code_to_id') else print("No language codes available for T5.")

# Add language codes to the source texts and tokenize
def preprocess_function(examples):
    # T5 uses task prefixes; we'll use "transliterate: " as the prefix
    inputs = [f"transliterate: {text}" for text in examples['rm']]
    targets = [text for text in examples['bn']]
    model_inputs = tokenizer(
        inputs,
        max_length=32,  # Reduced from 64 to save memory
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        targets,
        max_length=32,  # Reduced from 64 to save memory
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names  # Remove original columns to save memory
)

# Set the format for PyTorch tensors
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)

# Inspect a tokenized training example
print("\nSample Tokenized Training Example:")
print(tokenized_dataset['train'][0])

# =============================
# Step 4: Model Selection
# =============================

# Load the pre-trained T5-small model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set decoder start token, EOS token, and pad token if applicable (T5 uses eos_token_id and pad_token_id)
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# =============================
# Step 5: Training the Model
# =============================

# Clear CUDA cache before training
torch.cuda.empty_cache()

# **Removed Freezing of Model Layers**
# Allow all layers to be trainable to avoid issues with frozen parameters

# Define training arguments with gradient accumulation
training_args = Seq2SeqTrainingArguments(
    output_dir="./banglish-to-bangla",
    eval_strategy="epoch",  # Updated from 'evaluation_strategy' to 'eval_strategy'
    per_device_train_batch_size=1,  # **Optimization A: Reduced batch size to 1**
    per_device_eval_batch_size=1,   # **Optimization A: Reduced eval batch size to 1**
    gradient_accumulation_steps=1,  # **Optimization B: Gradient accumulation steps set to 1**
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,             # **Optimization C: Limit to 1 checkpoint**
    num_train_epochs=1,             # **Optimization D: Reduced number of epochs to 1**
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=50,               # **Optimization E: Logging every 50 steps**
    fp16=torch.cuda.is_available(),  # **Optimization F: Use mixed precision if GPU supports**
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
)

# Load BLEU metric using the evaluate library
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Prepare references for BLEU (do not split into words)
    decoded_labels = [[label] for label in decoded_labels]

    # Compute BLEU score
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}

# Initialize the Trainer with compute_metrics
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,  # This will trigger a deprecation warning in future versions
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# =============================
# Step 6: Evaluation and Inference
# =============================

# Evaluate the model
results = trainer.evaluate()
print("\nEvaluation Results:")
print(results)

# Define a function for transliteration
def transliterate(text):
    model.eval()
    inputs = tokenizer(
        f"transliterate: {text}",
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=32  # Consistent with training max_length
    ).to(device)  # Move inputs to the same device as the model
    with torch.no_grad():
        generated_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=32,
            num_beams=4,
            early_stopping=True
        )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Example usage
banglish_text = "ami tomake bhalobashi"
bangla_text = transliterate(banglish_text)
print(f"\nBanglish: {banglish_text}")
print(f"Bangla: {bangla_text}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})

Available Splits:
dict_keys(['train'])

After Splitting and Limiting to 125 samples:
DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['bn', 'rm'],
        num_rows: 25
    })
})

Sample Training Example:
Bangla (bn): এডিট করা শিখাইতেন। তখন থেকে আপনাকে ফলো করতেছি।এখনো করছি।
Banglish (rm): edit kora shikhaiten. Tokhon theke apnake follow korteci.ekhono korci.

Sample Validation Example:
Bangla (bn): হতে পারে বাট আমি ভিডমেট ইউজার
Banglish (rm): Hote pare but ami vidmate user 


Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25 [00:00<?, ? examples/s]


After Filtering:
DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 71
    })
    validation: Dataset({
        features: ['bn', 'rm'],
        num_rows: 16
    })
})


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]


Available Language Codes:
No language codes available for T5.


Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]


Sample Tokenized Training Example:
{'input_ids': tensor([ 3017,  9842,   342,    10,  4777,     3,  5543,     9,     3,  5605,
        18276,   155,    35,     5,   304, 28569,    29,     8,  1050,     3,
            9,   102,    29,     9,  1050,  1130,     3,  5543,  5822,    23,
            5,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])}


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbsse1307[0m ([33mbsse1307-university-of-dhaka[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Bleu
1,3.3197,0.744348,0.0


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Evaluation Results:
{'eval_loss': 0.744348406791687, 'eval_bleu': 0.0, 'eval_runtime': 4.5487, 'eval_samples_per_second': 3.518, 'eval_steps_per_second': 3.518, 'epoch': 1.0}

Banglish: ami tomake bhalobashi
Bangla: 
