-- https://huggingface.co/docs/transformers/en/model_doc/bert
-- https://huggingface.co/csebuetnlp/banglabert
-- based on the pre-trained models above, create an English-to-Bengali translation model
-- Follow the transformer architecture
-- use bert in encoder
-- use banglabert in decoder
-- use pytorch
-- put all hyperparameters in a config map, e.g. file path for english_to_bangla.csv, number of sentences, max sentence length, vocab size, d_model, learning rate, dff, etc.
-- input file is a CSV with two columns named en, bn
-- fine-tune the model with the input file
-- add a few examples to test the translation
-- do not put comment


In [None]:
from google.colab import files
import io

# Open the Colab upload dialog and keep whatever it returns in `uploaded`
# (presumably a mapping keyed by uploaded file name - confirm against the
# google.colab documentation).
uploaded = files.upload()

# Assuming only one file is uploaded
# NOTE(review): file_name is not referenced by the unzip command below, which
# names the archive explicitly; the two only agree if the uploaded file has
# exactly that name.
file_name = list(uploaded.keys())[0]
# IPython shell escape (not plain Python): unpack the archive with `unzip`.
!unzip 'English to Bengali For Machine Translation Pre-Train.zip'


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Single map holding every path and hyperparameter used by the data loading,
# model construction, training and translation code in this file.
config = dict(
    # CSV with 'en' and 'bn' columns read by load_data().
    data_path='english_to_bangla.csv',
    # Tokenizer padding/truncation length; also sizes the decoder's
    # positional-encoding table.
    max_sentence_length=128,
    # NOTE(review): not read by the code visible here; the decoder takes its
    # vocabulary size from the Bengali tokenizer instead.
    vocab_size=30522,
    # Decoder width. The decoder cross-attends to the encoder's hidden states,
    # so this has to equal the encoder's hidden size.
    d_model=768,
    learning_rate=2e-5,
    batch_size=16,
    # Upper bound only; early stopping normally ends training sooner.
    num_epochs=1000,
    # Feed-forward width, head count and depth of the decoder stack.
    dff=2048,
    num_heads=12,
    num_layers=6,
    dropout=0.1,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Early stopping: epochs without sufficient improvement that are tolerated,
    # and the minimum validation-loss decrease that counts as improvement.
    early_stop_patience=8,
    early_stop_min_delta=0.001,
    # Keep only the first N rows of the CSV; a value <= 0 keeps all rows.
    max_translation_pairs=39050,
)

class TranslationDataset(Dataset):
    """Parallel English/Bengali sentences, tokenized lazily on item access.

    Args:
        english_texts: Indexable collection of source sentences.
        bengali_texts: Indexable collection of target sentences, aligned with
            ``english_texts`` by position.
        en_tokenizer: Callable tokenizer applied to the English text.
        bn_tokenizer: Callable tokenizer applied to the Bengali text.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, english_texts, bengali_texts, en_tokenizer, bn_tokenizer, max_length):
        self.english_texts = english_texts
        self.bengali_texts = bengali_texts
        self.en_tokenizer = en_tokenizer
        self.bn_tokenizer = bn_tokenizer
        self.max_length = max_length

    def __len__(self):
        # The dataset length follows the English side.
        return len(self.english_texts)

    def _encode(self, tokenizer, text):
        """Tokenize ``text`` to a fixed-length PyTorch encoding."""
        return tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return flattened ids and attention masks for the pair at ``idx``."""
        # str() lets non-string cells (e.g. numbers) pass through the tokenizer.
        source_text = str(self.english_texts[idx])
        target_text = str(self.bengali_texts[idx])

        source = self._encode(self.en_tokenizer, source_text)
        target = self._encode(self.bn_tokenizer, target_text)

        # flatten() collapses each returned tensor to 1-D so the DataLoader
        # can stack items along a new batch dimension.
        item = {}
        item['en_input_ids'] = source['input_ids'].flatten()
        item['en_attention_mask'] = source['attention_mask'].flatten()
        item['bn_input_ids'] = target['input_ids'].flatten()
        item['bn_attention_mask'] = target['attention_mask'].flatten()
        return item

class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. It returns
    ``True`` once ``patience`` consecutive calls have failed to beat the best
    loss seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and return whether training should stop."""
        improved = val_loss < (self.best_loss - self.min_delta)

        if improved:
            # A sufficiently better loss resets the patience window.
            self.best_loss = val_loss
            self.counter = 0
            self.early_stop = False
            return False

        self.counter += 1
        print(f"EarlyStopping counter: {self.counter}/{self.patience}")

        if self.counter < self.patience:
            return False

        self.early_stop = True
        return True

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first tensors.

    A ``(max_len, 1, d_model)`` table is precomputed and stored as the buffer
    ``pe``: even feature columns hold sines, odd columns hold cosines.

    Args:
        d_model: Feature dimension of the inputs. Both even and odd values
            are supported.
        max_len: Longest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine half uses only the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Singleton middle axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(1)

        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to ``x``.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension is ``d_model`` (the table has a singleton
                middle axis that broadcasts over the remaining one).

        Returns:
            ``x`` plus the encodings of its first ``x.size(0)`` positions.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            # Fail with a clear message instead of an opaque broadcast error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(0)}"
            )
        return x + self.pe[:seq_len, :]

class TransformerDecoder(nn.Module):
    """Token embedding + positional encoding + ``nn.TransformerDecoder`` + vocabulary projection.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and output logits).
        d_model: Model width; must equal the feature size of ``memory``.
        num_heads: Attention heads per decoder layer.
        num_layers: Number of stacked decoder layers.
        dff: Hidden width of each layer's feed-forward sub-block.
        max_length: Number of positions in the positional-encoding table.
        dropout: Dropout probability used in the layers and on the embeddings.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, dff, max_length, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_length)

        layer_template = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=dff,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(layer_template, num_layers)

        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """Return vocabulary logits for every target position.

        Args:
            tgt: Batch-first tensor of target token ids.
            memory: Encoder states to attend over.
            tgt_mask: Optional self-attention mask; a causal mask is built
                when omitted.
            memory_mask: Passed to the decoder stack as
                ``memory_key_padding_mask`` (despite the parameter name).
        """
        # Ids outside the embedding table are clamped into range rather than raising.
        token_ids = torch.clamp(tgt, 0, self.vocab_size - 1)
        steps = token_ids.size(1)

        embedded = self.embedding(token_ids) * np.sqrt(self.d_model)
        # The positional encoding is sequence-first, so swap axes around the call.
        embedded = self.pos_encoding(embedded.transpose(0, 1)).transpose(0, 1)
        embedded = self.dropout(embedded)

        if tgt_mask is None:
            tgt_mask = self.generate_square_subsequent_mask(steps).to(embedded.device)

        hidden = self.transformer_decoder(
            embedded,
            memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_mask
        )
        return self.fc_out(hidden)

    def generate_square_subsequent_mask(self, sz):
        """Return an ``sz x sz`` float mask: ``-inf`` above the diagonal, 0 elsewhere."""
        upper = torch.ones(sz, sz).triu(diagonal=1)
        return upper.masked_fill(upper == 1, float('-inf'))

class EnglishToBengaliTranslator(nn.Module):
    """Encoder-decoder translator: frozen ``bert-base-uncased`` encoder, trainable decoder.

    The decoder's vocabulary size is taken from the ``csebuetnlp/banglabert``
    tokenizer; the remaining decoder hyperparameters come from ``config``
    (``d_model``, ``num_heads``, ``num_layers``, ``dff``,
    ``max_sentence_length``, ``dropout``).
    """

    def __init__(self, config):
        super().__init__()

        self.bert_encoder = BertModel.from_pretrained('bert-base-uncased')
        # Freeze the encoder: none of its parameters receive gradients.
        self.bert_encoder.requires_grad_(False)

        # The tokenizer is loaded here only to read its vocabulary size.
        self.bn_vocab_size = AutoTokenizer.from_pretrained('csebuetnlp/banglabert').vocab_size

        self.decoder = TransformerDecoder(
            vocab_size=self.bn_vocab_size,
            d_model=config['d_model'],
            num_heads=config['num_heads'],
            num_layers=config['num_layers'],
            dff=config['dff'],
            max_length=config['max_sentence_length'],
            dropout=config['dropout']
        )

    def forward(self, en_input_ids, en_attention_mask, bn_input_ids, bn_attention_mask):
        """Return decoder logits for teacher-forced Bengali inputs.

        The decoder is fed ``bn_input_ids`` without its last position, so the
        output has one fewer step than ``bn_input_ids``.
        ``bn_attention_mask`` is accepted but not used.
        """
        encoded = self.bert_encoder(
            input_ids=en_input_ids,
            attention_mask=en_attention_mask
        )

        # True where the English input is padding (attention mask == 0).
        source_padding = en_attention_mask.bool().logical_not()

        # Drop the final position and clamp ids into the decoder vocabulary.
        shifted_targets = bn_input_ids[:, :-1].clamp(0, self.bn_vocab_size - 1)

        return self.decoder(
            tgt=shifted_targets,
            memory=encoded.last_hidden_state,
            memory_mask=source_padding
        )

def create_sample_data(path='english_to_bangla.csv'):
    """Write a ten-pair English/Bengali sample corpus to ``path`` and return it.

    Args:
        path: Destination CSV file. Defaults to ``'english_to_bangla.csv'``.

    Returns:
        The sample ``DataFrame`` with columns ``en`` and ``bn``.
    """
    sample_data = {
        'en': [
            "Hello, how are you?",
            "I love you.",
            "What is your name?",
            "Good morning.",
            "Thank you very much.",
            "The weather is nice today.",
            "I am fine.",
            "Where are you from?",
            "How old are you?",
            "Nice to meet you."
        ],
        'bn': [
            "হ্যালো, আপনি কেমন আছেন?",
            "আমি তোমাকে ভালোবাসি।",
            "আপনার নাম কি?",
            "সুপ্রভাত।",
            "আপনাকে অনেক ধন্যবাদ।",
            "আজ আবহাওয়া সুন্দর।",
            "আমি ভালো আছি।",
            "আপনি কোথা থেকে এসেছেন?",
            "আপনার বয়স কত?",
            "আপনার সাথে দেখা করে ভালো লাগলো।"
        ]
    }

    df = pd.DataFrame(sample_data)
    df.to_csv(path, index=False)
    return df

def load_data(config):
    """Load the translation pairs named by ``config['data_path']``.

    If the file does not exist, the built-in sample corpus is written to that
    same path and used instead. Rows with a missing ``en`` or ``bn`` value are
    dropped (they would otherwise be tokenized as the literal string "nan"),
    and the result is truncated to ``config['max_translation_pairs']`` rows
    when that value is positive.

    Args:
        config: Mapping with ``data_path`` and ``max_translation_pairs``.

    Returns:
        A ``DataFrame`` with at least the columns ``en`` and ``bn``.

    Raises:
        ValueError: If the CSV lacks an ``en`` or ``bn`` column.
    """
    data_path = config['data_path']

    if not os.path.exists(data_path):
        print(f"Creating sample data at {config['data_path']}")
        # Write the sample where it was looked for, so the next run finds it.
        df = create_sample_data(data_path)
    else:
        df = pd.read_csv(data_path)

    missing_columns = [name for name in ('en', 'bn') if name not in df.columns]
    if missing_columns:
        raise ValueError(f"{data_path} is missing required column(s): {missing_columns}")

    # Drop incomplete pairs before truncating so the row limit counts usable pairs.
    df = df.dropna(subset=['en', 'bn'])

    if config['max_translation_pairs'] > 0:
        df = df.head(config['max_translation_pairs'])
        print(f"Loaded {len(df)} translation pairs (limited by max_translation_pairs)")
    else:
        print(f"Loaded {len(df)} translation pairs")

    return df

def validate_model(model, val_loader, criterion, config):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Targets are ``bn_input_ids`` shifted left by one position. A batch that
    raises is reported and skipped; if no batch succeeds, ``inf`` is returned.

    Args:
        model: Callable taking (en ids, en mask, bn ids, bn mask) and
            returning logits whose last dimension is the vocabulary.
        val_loader: Iterable of batches keyed like ``TranslationDataset`` items.
        criterion: Loss applied to flattened logits and targets.
        config: Mapping providing ``device``.
    """
    model.eval()
    batch_losses = []
    tensor_keys = ('en_input_ids', 'en_attention_mask', 'bn_input_ids', 'bn_attention_mask')

    with torch.no_grad():
        for batch in val_loader:
            try:
                moved = {key: batch[key].to(config['device']) for key in tensor_keys}

                logits = model(
                    moved['en_input_ids'],
                    moved['en_attention_mask'],
                    moved['bn_input_ids'],
                    moved['bn_attention_mask']
                )
                vocab = logits.size(-1)

                # Next-token targets, clamped into the logit range.
                targets = moved['bn_input_ids'][:, 1:].contiguous()
                targets = torch.clamp(targets, 0, vocab - 1)

                loss = criterion(logits.contiguous().view(-1, vocab), targets.view(-1))
                batch_losses.append(loss.item())
            except Exception as e:
                print(f"Error in validation batch: {e}")
                continue

    if not batch_losses:
        print("Warning: No valid validation batches processed")
        return float('inf')

    return sum(batch_losses) / len(batch_losses)

def train_model(model, train_loader, val_loader, config):
    """Train ``model`` with Adam, validating each epoch, with early stopping.

    Each batch is teacher-forced: the model receives ``bn_input_ids`` and the
    loss compares its logits with ``bn_input_ids`` shifted left by one.
    Target id 0 is excluded from the loss (presumably the padding id - confirm
    against the Bengali tokenizer). Gradients are clipped to a norm of 1.0.
    A batch that raises is reported and skipped.

    The weights of the epoch with the lowest validation loss are snapshotted
    and loaded back into the model before returning.

    Args:
        model: The translator to train (modified in place).
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches, passed to ``validate_model``.
        config: Mapping providing ``learning_rate``, ``num_epochs``, ``device``,
            ``early_stop_patience`` and ``early_stop_min_delta``.

    Returns:
        The same ``model`` object, carrying the best weights found (or the
        final weights if no epoch produced a validation loss below infinity).
    """
    import copy

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    early_stopping = EarlyStopping(
        patience=config['early_stop_patience'],
        min_delta=config['early_stop_min_delta']
    )

    best_model_state = None
    best_val_loss = float('inf')

    print(f"Starting training with early stopping (patience={config['early_stop_patience']}, min_delta={config['early_stop_min_delta']})")

    for epoch in range(config['num_epochs']):
        model.train()
        total_loss = 0
        valid_batches = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{config["num_epochs"]}')

        for batch in progress_bar:
            try:
                en_input_ids = batch['en_input_ids'].to(config['device'])
                en_attention_mask = batch['en_attention_mask'].to(config['device'])
                bn_input_ids = batch['bn_input_ids'].to(config['device'])
                bn_attention_mask = batch['bn_attention_mask'].to(config['device'])

                optimizer.zero_grad()

                outputs = model(en_input_ids, en_attention_mask, bn_input_ids, bn_attention_mask)

                # Next-token targets, clamped into the logit range.
                targets = bn_input_ids[:, 1:].contiguous()
                targets = torch.clamp(targets, 0, outputs.size(-1) - 1)
                outputs = outputs.contiguous()

                loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                total_loss += loss.item()
                valid_batches += 1
                progress_bar.set_postfix({'loss': loss.item()})
            except Exception as e:
                print(f"Error in training batch: {e}")
                continue

        if valid_batches == 0:
            print("Warning: No valid training batches processed")
            continue

        avg_train_loss = total_loss / valid_batches
        avg_val_loss = validate_model(model, val_loader, criterion, config)

        print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Best Val Losss: {best_val_loss:.4f}')

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back tensors that share storage with the live
            # parameters, and dict.copy() is shallow, so a plain copy would be
            # silently overwritten by later optimizer steps. Deep-copy to
            # freeze the snapshot.
            best_model_state = copy.deepcopy(model.state_dict())
            print(f"New best model saved with validation loss: {best_val_loss:.4f}")

        if early_stopping(avg_val_loss):
            print(f'Early stopping triggered at epoch {epoch+1}')
            print(f'Training stopped early. Best validation loss: {early_stopping.best_loss:.4f}')
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f'Loaded best model with validation loss: {best_val_loss:.4f}')
    else:
        print("Warning: No best model state saved")

    return model

def translate_text(model, text, en_tokenizer, bn_tokenizer, config, max_length=50):
    """Greedily decode a Bengali translation of ``text``.

    The English sentence is encoded once; the decoder is then re-run on the
    growing output, appending the arg-max token each step until the Bengali
    tokenizer's SEP id is produced or ``max_length`` tokens were generated.

    Args:
        model: Translator exposing ``bert_encoder``, ``decoder`` and ``bn_vocab_size``.
        text: English sentence to translate.
        en_tokenizer: Tokenizer for the source text.
        bn_tokenizer: Tokenizer providing ``cls_token_id``, ``sep_token_id`` and ``decode``.
        config: Mapping providing ``max_sentence_length`` and ``device``.
        max_length: Maximum number of generated tokens.

    Returns:
        The decoded string, or ``"Translation failed: <text>"`` if any step raised.
    """
    model.eval()

    with torch.no_grad():
        try:
            encoded = en_tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=config['max_sentence_length'],
                return_tensors='pt'
            )
            source_ids = encoded['input_ids'].to(config['device'])
            source_mask = encoded['attention_mask'].to(config['device'])

            memory = model.bert_encoder(
                input_ids=source_ids,
                attention_mask=source_mask
            ).last_hidden_state
            # True where the source is padding.
            source_padding = ~source_mask.bool()

            # Start from the CLS id, falling back to 0 if the tokenizer has none.
            start_id = bn_tokenizer.cls_token_id
            if start_id is None:
                start_id = 0
            generated = torch.tensor([[start_id]], device=config['device'])

            for _ in range(max_length):
                logits = model.decoder(
                    tgt=torch.clamp(generated, 0, model.bn_vocab_size - 1),
                    memory=memory,
                    memory_mask=source_padding
                )

                # Only the prediction at the newest position matters.
                next_token = torch.argmax(logits[:, -1, :], dim=-1)
                generated = torch.cat([generated, next_token.unsqueeze(1)], dim=1)

                if next_token.item() == bn_tokenizer.sep_token_id:
                    break

            token_list = generated.squeeze().tolist()
            return bn_tokenizer.decode(token_list, skip_special_tokens=True)
        except Exception as e:
            print(f"Error translating text: {e}")
            return f"Translation failed: {text}"



In [None]:

# Notebook driver: load the corpus, build loaders, train, and save the weights.
print("Loading data...")
df = load_data(config)

print("Initializing tokenizers...")
en_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bn_tokenizer = AutoTokenizer.from_pretrained('csebuetnlp/banglabert')

print("Preparing datasets...")
# Fixed seed so the 80/20 split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Both splits are wrapped the same way; only the source frame differs.
train_dataset, val_dataset = (
    TranslationDataset(
        split['en'].tolist(),
        split['bn'].tolist(),
        en_tokenizer,
        bn_tokenizer,
        config['max_sentence_length']
    )
    for split in (train_df, val_df)
)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=config['batch_size'])
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=config['batch_size'])

print("Initializing model...")
model = EnglishToBengaliTranslator(config)
model = model.to(config['device'])

print("Training model...")
model = train_model(model, train_loader, val_loader, config)

print("Saving model...")
torch.save(model.state_dict(), 'en_bn_translator.pth')



Loading data...
Loaded 39050 translation pairs (limited by max_translation_pairs)
Initializing tokenizers...
Preparing datasets...
Initializing model...
Training model...
Starting training with early stopping (patience=8, min_delta=0.001)


Epoch 1/1000:   5%|▌         | 104/1953 [00:42<12:20,  2.50it/s, loss=7.27]

In [None]:

# Notebook demo: translate a fixed list of English sentences with the trained model.
print("\nTesting translations...")
test_sentences = [
    "a child in a pink dress is climbing up a set of stairs in an entry way .",
    "a girl going into a wooden building .",
    "a dog is running in the snow",
    "a dog running",
    "Hello, how are you?",
    "a man in an orange hat starring at something .",
    "I love you.",
    "a little girl climbing into a wooden playhouse .",
    "What is your name?",
    "two dogs of different breeds looking at each other on the road .",
    "Good morning.",
    "Thank you very much.",
    "Hello, how are you?",
    "I love you.",
    "What is your name?",
    "Good morning.",
    "Thank you very much.",
    "The weather is nice today."
]


def _report_translation(sentence):
    """Print the sentence and its translation (or the error), then a divider."""
    try:
        translation = translate_text(model, sentence, en_tokenizer, bn_tokenizer, config)
        print(f"English: {sentence}")
        print(f"Bengali: {translation}")
    except Exception as e:
        print(f"Error translating '{sentence}': {e}")
    print("-" * 50)


for sentence in test_sentences:
    _report_translation(sentence)