In [14]:
import torch
from datasets import load_dataset
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    TrainerCallback
)
from sklearn.model_selection import train_test_split
import os
import numpy as np
os.environ["WANDB_DISABLED"] = "true"

In [15]:

def normalize_text(text):
    """Normalize Banglish text for consistency."""
    # Example normalization rules
    text = text.lower()
    # text = text.replace("ki", "‡¶ï‡¶ø").replace("tumi", "‡¶§‡ßÅ‡¶Æ‡¶ø")  # Add more mappings
    return text


# Custom callback to track epoch-wise losses
class LossCallback(TrainerCallback):
    def __init__(self):
        self.training_losses = []
        self.eval_losses = []
        self.current_epoch_losses = []
        
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            # print(logs)
            if 'train_loss' in logs:
                self.current_epoch_losses.append(logs['train_loss'])
            if 'eval_loss' in logs:
                self.eval_losses.append(logs['eval_loss'])
                
    def on_epoch_end(self, args, state, control, **kwargs):
        if self.current_epoch_losses:
            avg_loss = np.mean(self.current_epoch_losses)
            self.training_losses.append(avg_loss)
            print(f"\nEpoch {state.epoch}: Average Training Loss = {avg_loss:.4f}")
            if self.eval_losses:
                print(f"Epoch {state.epoch}: Validation Loss = {self.eval_losses[-1]:.4f}")
            self.current_epoch_losses = []


In [16]:

# Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
ds = load_dataset("SKNahin/bengali-transliteration-data")
train_val_data = ds['train']

# Take a smaller subset for initial testing
MAX_SAMPLES = 10000  # Adjust this number based on your GPU memory
banglish_texts = train_val_data['rm'][:MAX_SAMPLES]
bangla_texts = train_val_data['bn'][:MAX_SAMPLES]

banglish_texts = [normalize_text(text) for text in banglish_texts]


# Split data
train_banglish, val_banglish, train_bangla, val_bangla = train_test_split(
    banglish_texts, bangla_texts, test_size=0.1, random_state=42
)

Using device: cuda


In [17]:

# Initialize tokenizer and model
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)

def preprocess_data(banglish_texts, bangla_texts):
    inputs = tokenizer(
        banglish_texts, 
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="pt"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            bangla_texts,
            padding=True,
            truncation=True,
            max_length=64,
            return_tensors="pt"
        )

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels["input_ids"]
    }

# Create train and validation datasets
train_encodings = preprocess_data(train_banglish, train_bangla)
val_encodings = preprocess_data(val_banglish, val_bangla)

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

# Create datasets
train_dataset = SimpleDataset(train_encodings)
val_dataset = SimpleDataset(val_encodings)

# Initialize model
model = BartForConditionalGeneration.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./simple-banglish-translator",
    evaluation_strategy="epoch",  # Changed to epoch to get per-epoch evaluation
    learning_rate=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=32,
    num_train_epochs=15,
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
    save_strategy="epoch"  # Changed to epoch
)

# Initialize loss callback
loss_callback = LossCallback()

# Initialize trainer with callback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    callbacks=[loss_callback]  # Add the callback
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [18]:

# Training
print("Starting training...")
trainer.train()

# Print final loss summary
print("\nTraining Summary:")
for epoch, (train_loss, eval_loss) in enumerate(zip(loss_callback.training_losses, loss_callback.eval_losses), 1):
    print(f"Epoch {epoch}:")
    print(f"  Average Training Loss: {train_loss:.4f}")
    print(f"  Validation Loss: {eval_loss:.4f}")

# Save the model
model.save_pretrained("./simple-banglish-translator-final")
tokenizer.save_pretrained("./simple-banglish-translator-final")

Starting training...


Epoch,Training Loss,Validation Loss
0,No log,28.049725
1,No log,12.393654
2,No log,15.387169
3,No log,13.63045
4,No log,8.822666
5,No log,8.025673
6,No log,6.051314
7,No log,5.253895
8,No log,4.699875
9,No log,3.940466


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams


Training Summary:


('./simple-banglish-translator-final/tokenizer_config.json',
 './simple-banglish-translator-final/special_tokens_map.json',
 './simple-banglish-translator-final/vocab.json',
 './simple-banglish-translator-final/merges.txt',
 './simple-banglish-translator-final/added_tokens.json')

In [19]:
def translate_banglish_to_bengali(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=64, num_beams=2)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


test_text = "ami tomake bhalobashi"
translated = translate_banglish_to_bengali(test_text)
print(f"\nTest Translation:")
print(f"Input: {test_text}")
print(f"Translation: {translated}")


Test Translation:
Input: ami tomake bhalobashi
Translation: ÔøΩÔøΩÔøΩÔøΩ‡¶ßÔøΩÔøΩÔøΩÔøΩÔøΩ ÔøΩ‡¶ßÔøΩÔøΩÔøΩ ‡¶æÔøΩ‡¶æ‡¶®ÔøΩ‡¶®‡¶áÔøΩ‡¶áÔøΩÔøΩ‡¶ß‡¶ßÔøΩÔøΩ


In [21]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import numpy as np
from tqdm import tqdm
import json
from sklearn.model_selection import train_test_split

# Download required NLTK data
try:
    nltk.download('punkt')
except:
    pass


def load_model_and_tokenizer(model_path="./simple-banglish-translator-final"):
    """Load the saved model and tokenizer"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    try:
        tokenizer = BartTokenizer.from_pretrained(model_path)
        model = BartForConditionalGeneration.from_pretrained(model_path)
    except:
        print("Saved model not found, loading base model...")
        tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
        model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    
    model.to(device)
    return model, tokenizer, device

def translate_text(text, model, tokenizer, device):
    """Translate a single text from Banglish to Bengali"""
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=64, num_beams=2)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def calculate_character_accuracy(pred, target):
    """Calculate character-level accuracy"""
    correct = sum(1 for p, t in zip(pred, target) if p == t)
    total = max(len(pred), len(target))
    return correct / total if total > 0 else 0

def evaluate_model(model, tokenizer, device, test_banglish, test_bangla, num_samples=None):
    """Evaluate the model on test data"""
    bleu_scores = []
    char_accuracies = []
    predictions = []
    references = []
    
    # Initialize BLEU smoothing function
    smoothie = SmoothingFunction().method1
    
    # Process only num_samples if specified
    test_range = range(min(len(test_banglish), num_samples if num_samples else len(test_banglish)))
    
    for i in tqdm(test_range, desc="Evaluating"):
        banglish_text = test_banglish[i]
        bengali_text = test_bangla[i]
        
        # Get model prediction
        pred_bengali = translate_text(banglish_text, model, tokenizer, device)
        
        # Calculate BLEU score
        bleu = sentence_bleu(
            [bengali_text.split()],
            pred_bengali.split(),
            smoothing_function=smoothie
        )
        
        # Calculate character accuracy
        char_acc = calculate_character_accuracy(pred_bengali, bengali_text)
        
        bleu_scores.append(bleu)
        char_accuracies.append(char_acc)
        predictions.append(pred_bengali)
        references.append(bengali_text)
        
    # Calculate average scores
    avg_bleu = np.mean(bleu_scores)
    avg_char_acc = np.mean(char_accuracies)
    
    # Store some example translations
    examples = []
    for i in range(min(5, len(predictions))):
        examples.append({
            'banglish': test_banglish[i],
            'predicted': predictions[i],
            'reference': references[i],
            'bleu': bleu_scores[i],
            'char_acc': char_accuracies[i]
        })
    
    results = {
        'average_bleu': float(avg_bleu),
        'average_char_accuracy': float(avg_char_acc),
        'num_samples': len(test_range),
        'example_translations': examples
    }
    
    return results


print("Loading dataset...")

# 1. Load and Split Dataset
ds = load_dataset("SKNahin/bengali-transliteration-data")
train_val_data = ds['train']  # The dataset comes with only a train split

# Convert to list of dictionaries for easier splitting
data_dict = train_val_data.to_dict()
banglish_texts = data_dict['rm']
bangla_texts = data_dict['bn']

# Split into train and validation sets (90-10 split)
train_banglish, val_banglish, train_bangla, val_bangla = train_test_split(
    banglish_texts, bangla_texts, test_size=0.1, random_state=42
)

# Load model and tokenizer
print("Loading model...")
model, tokenizer, device = load_model_and_tokenizer()

# Evaluate
print("Starting evaluation...")
results = evaluate_model(model, tokenizer, device, val_banglish, val_bangla, num_samples=100)

# Print results
print("\nEvaluation Results:")
print(f"Number of samples evaluated: {results['num_samples']}")
print(f"Average BLEU score: {results['average_bleu']:.4f}")
print(f"Average Character Accuracy: {results['average_char_accuracy']:.4f}")

print("\nExample Translations:")
for i, example in enumerate(results['example_translations'], 1):
    print(f"\nExample {i}:")
    print(f"Banglish: {example['banglish']}")
    print(f"Predicted: {example['predicted']}")
    print(f"Reference: {example['reference']}")
    print(f"BLEU Score: {example['bleu']:.4f}")
    print(f"Character Accuracy: {example['char_acc']:.4f}")

# Save results to file
with open('evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("\nResults have been saved to 'evaluation_results.json'")



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Loading dataset...
Loading model...
Starting evaluation...


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:41<00:00,  2.42it/s]


Evaluation Results:
Number of samples evaluated: 100
Average BLEU score: 0.0000
Average Character Accuracy: 0.0153

Example Translations:

Example 1:
Banglish: hoi na keno
Predicted: ÔøΩÔøΩÔøΩÔøΩ‡¶ßÔøΩÔøΩÔøΩÔøΩÔøΩ ÔøΩ‡¶æÔøΩ‡¶®ÔøΩ‡¶áÔøΩÔøΩÔøΩ‡¶ßÔøΩÔøΩÔøΩ ‡¶æ‡¶®‡¶áÔøΩ‡¶ß‡¶ßÔøΩ‡¶ßÔøΩ‡¶ß 
Reference: ‡¶π‡ßü ‡¶®‡¶æ ‡¶ï‡ßá‡¶®
BLEU Score: 0.0000
Character Accuracy: 0.0000

Example 2:
Banglish: 15k budget a 635 moteo jay na
Predicted: ÔøΩÔøΩÔøΩÔøΩ‡¶ßÔøΩÔøΩÔøΩÔøΩÔøΩ ÔøΩ‡¶ßÔøΩÔøΩÔøΩ ‡¶æÔøΩ‡¶æ‡¶®ÔøΩ‡¶®‡¶áÔøΩ‡¶áÔøΩÔøΩ‡¶ß‡¶ßÔøΩÔøΩ
Reference: ‡ßß‡ß´‡¶ï ‡¶¨‡¶æ‡¶ú‡ßá‡¶ü ‡¶è ‡ß¨‡ß©‡ß´ ‡¶Æ‡ßã‡¶ü‡ßá‡¶ì ‡¶Ø‡¶æ‡ßü ‡¶®‡¶æ
BLEU Score: 0.0000
Character Accuracy: 0.0000

Example 3:
Banglish: Sorry vai dite vule gesi
Predicted: ÔøΩÔøΩÔøΩÔøΩ‡¶ß‡¶ßÔøΩ‡¶ßÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩ ÔøΩÔøΩ ‡¶æÔøΩ‡¶æ‡¶®ÔøΩ‡¶®‡¶áÔøΩ‡¶áÔøΩÔøΩÔøΩ‡¶ßÔøΩ‡¶ßÔøΩ‡¶ß
Reference: ‡¶∏‡¶∞‡¶ø ‡¶≠‡¶æ‡¶á ‡¶¶‡¶ø‡¶§‡ßá ‡¶≠‡ßÅ‡¶≤‡ßá ‡¶ó‡ßá‡¶õ‡¶ø
BLEU Score: 0.0000
Character Accuracy: 0.0286

Example 4:
Banglish: fast kaj kore ki?
Predicted: ÔøΩÔøΩÔøΩÔøΩ‡¶ßÔøΩÔ


