In [1]:
from google.colab import files
import io

uploaded = files.upload()

# Assuming only one file is uploaded
file_name = list(uploaded.keys())[0]
!unzip 'English to Bengali For Machine Translation Pre-Train.zip'


Saving English to Bengali For Machine Translation Pre-Train.zip to English to Bengali For Machine Translation Pre-Train.zip
Archive:  English to Bengali For Machine Translation Pre-Train.zip
  inflating: english_to_bangla.csv   
  inflating: EBook_of_The_Bhagavad-Gita_Bengali.txt  
  inflating: EBook_of_The_Bhagavad-Gita_English.txt  


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Central settings read by the data, model and training code in this notebook.
# Key order is kept stable in case anything iterates over the dict.
config = dict(
    data_path='english_to_bangla.csv',   # CSV with 'en' and 'bn' columns
    max_sentence_length=128,             # tokenizer pad/truncate length; also sizes the decoder's positional table
    vocab_size=119547,                   # not read by the code visible in this notebook
    d_model=768,                         # decoder width; the encoder output is used as decoder memory, so the two must match
    learning_rate=2e-5,                  # Adam step size
    batch_size=16,                       # DataLoader batch size (train and validation)
    num_epochs=1000,                     # upper bound; early stopping normally ends training sooner
    dff=2048,                            # feed-forward width inside each decoder layer
    num_heads=12,                        # attention heads per decoder layer
    num_layers=6,                        # number of stacked decoder layers
    dropout=0.1,                         # dropout used by the decoder
    device='cuda' if torch.cuda.is_available() else 'cpu',
    early_stop_patience=3,               # epochs without improvement before stopping
    early_stop_min_delta=0.001,          # minimum drop in validation loss that counts as improvement
    max_translation_pairs=10,            # > 0 keeps only the first N rows; otherwise all rows are used
)

class TranslationDataset(Dataset):
    """Parallel English/Bengali sentence pairs, tokenized on access.

    Args:
        english_texts: indexable collection of source sentences.
        bengali_texts: indexable collection of target sentences, aligned by
            position with ``english_texts``.
        tokenizer: callable used for both languages; it is invoked with
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            keyword arguments and must return a mapping with ``input_ids``
            and ``attention_mask`` tensors.
        max_length: length every sequence is padded or truncated to.
    """

    def __init__(self, english_texts, bengali_texts, tokenizer, max_length):
        self.english_texts = english_texts
        self.bengali_texts = bengali_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # The dataset size follows the English side only.
        return len(self.english_texts)

    def _encode(self, text):
        """Tokenize one already-stringified sentence to a fixed-length encoding."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the flattened id and mask tensors for pair ``idx``."""
        # str() coerces non-string cells (numbers, missing values) to text.
        source_text = str(self.english_texts[idx])
        target_text = str(self.bengali_texts[idx])

        source = self._encode(source_text)
        target = self._encode(target_text)

        # flatten() removes the batch dimension the tokenizer adds for
        # return_tensors='pt', so the DataLoader can stack the items itself.
        item = {}
        item['en_input_ids'] = source['input_ids'].flatten()
        item['en_attention_mask'] = source['attention_mask'].flatten()
        item['bn_input_ids'] = target['input_ids'].flatten()
        item['bn_attention_mask'] = target['attention_mask'].flatten()
        return item

class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Call the instance once per epoch with that epoch's validation loss. It
    returns ``True`` once ``patience`` consecutive calls have failed to beat
    the best loss seen so far by more than ``min_delta``.

    Attributes (all public, read by callers):
        patience, min_delta: the thresholds passed to the constructor.
        counter: consecutive non-improving calls so far.
        best_loss: lowest loss accepted as an improvement.
        early_stop: set to True once the stop condition has been reached.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss``; return True if training should stop now."""
        improved = val_loss < (self.best_loss - self.min_delta)
        if improved:
            # A real improvement resets the streak.
            self.best_loss, self.counter, self.early_stop = val_loss, 0, False
            return False

        self.counter += 1
        print(f"EarlyStopping counter: {self.counter}/{self.patience}")

        if self.counter < self.patience:
            return False

        self.early_stop = True
        return True

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table that is added to a sequence-first tensor.

    Args:
        d_model: size of the feature dimension. Odd values are supported.
        max_len: number of positions in the precomputed table.

    The table is stored as the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd-indexed columns. Without the
        # slice, an odd d_model fails here with a shape mismatch; for an even
        # d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)

        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for positions ``0 .. x.size(0) - 1`` to ``x``.

        Dimension 0 of ``x`` is the sequence position; the table broadcasts
        over dimension 1. ``x.size(0)`` must not exceed ``max_len``.
        """
        return x + self.pe[:x.size(0), :]

class TransformerDecoder(nn.Module):
    """Token-level Transformer decoder that emits vocabulary logits.

    Pipeline: scaled token embedding, positional encoding, dropout, a stack
    of ``nn.TransformerDecoderLayer`` (batch-first), then a linear projection
    to ``vocab_size``.

    Args:
        vocab_size: number of token ids; also the width of the output logits.
        d_model: embedding and hidden width.
        num_heads: attention heads per layer.
        num_layers: number of decoder layers.
        dff: feed-forward width inside each layer.
        max_length: number of positions in the positional-encoding table.
        dropout: dropout probability for the layers and the embedding output.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, dff, max_length, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.vocab_size = vocab_size

        # Sub-modules are created in a fixed order so parameter registration
        # (and therefore seeded initialisation and state_dict layout) is stable.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_length)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model=d_model,
                nhead=num_heads,
                dim_feedforward=dff,
                dropout=dropout,
                batch_first=True
            ),
            num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """Decode ``tgt`` token ids against ``memory`` and return logits.

        Args:
            tgt: integer token ids, batch-first (dimension 1 is the sequence).
            memory: encoder output that the layers attend over.
            tgt_mask: optional self-attention mask; a causal mask is built
                when it is omitted.
            memory_mask: despite the name, this is forwarded as
                ``memory_key_padding_mask`` (True marks positions to ignore).

        Returns:
            Logits with ``vocab_size`` entries per target position.
        """
        # Out-of-range ids are clamped into the embedding table.
        token_ids = tgt.clamp(0, self.vocab_size - 1)
        steps = token_ids.size(1)

        hidden = self.embedding(token_ids) * np.sqrt(self.d_model)
        # The positional encoding indexes positions on dimension 0, so swap
        # to sequence-first for that call and back again afterwards.
        hidden = self.pos_encoding(hidden.transpose(0, 1)).transpose(0, 1)
        hidden = self.dropout(hidden)

        causal = tgt_mask
        if causal is None:
            causal = self.generate_square_subsequent_mask(steps).to(hidden.device)

        decoded = self.transformer_decoder(
            hidden,
            memory,
            tgt_mask=causal,
            memory_key_padding_mask=memory_mask
        )
        return self.fc_out(decoded)

    def generate_square_subsequent_mask(self, sz):
        """Return an ``sz`` x ``sz`` float mask: -inf above the diagonal, 0 elsewhere."""
        return torch.full((sz, sz), float('-inf')).triu(diagonal=1)

class EnglishToBengaliTranslator(nn.Module):
    """Frozen multilingual-BERT encoder feeding a trainable Transformer decoder.

    Args:
        config: mapping providing ``d_model``, ``num_heads``, ``num_layers``,
            ``dff``, ``max_sentence_length`` and ``dropout`` for the decoder.

    Only the decoder is trained. The encoder's parameters have
    ``requires_grad=False`` and the encoder is kept in eval mode at all
    times, so it behaves as a deterministic feature extractor.
    """

    def __init__(self, config):
        super(EnglishToBengaliTranslator, self).__init__()

        self.mbert_encoder = AutoModel.from_pretrained('bert-base-multilingual-cased')

        for param in self.mbert_encoder.parameters():
            param.requires_grad = False
        self.mbert_encoder.eval()

        # The decoder's embedding and output sizes follow the tokenizer vocabulary.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.vocab_size = tokenizer.vocab_size

        self.decoder = TransformerDecoder(
            vocab_size=self.vocab_size,
            d_model=config['d_model'],
            num_heads=config['num_heads'],
            num_layers=config['num_layers'],
            dff=config['dff'],
            max_length=config['max_sentence_length'],
            dropout=config['dropout']
        )

    def train(self, mode=True):
        """Switch the decoder's mode while pinning the frozen encoder to eval.

        ``nn.Module.train`` recurses into every sub-module. Without this
        override, ``model.train()`` would re-enable dropout inside the frozen
        encoder and add noise to the decoder's memory on every training step.
        """
        super().train(mode)
        self.mbert_encoder.eval()
        return self

    def forward(self, en_input_ids, en_attention_mask, bn_input_ids, bn_attention_mask):
        """Return next-token logits for the Bengali side (teacher forcing).

        The decoder is fed ``bn_input_ids`` without its last position, so the
        logits at position ``i`` are meant to be scored against
        ``bn_input_ids[:, i + 1]``. ``bn_attention_mask`` is accepted for
        interface symmetry but is not used.
        """
        # The encoder is frozen, so there is nothing to backpropagate into.
        with torch.no_grad():
            encoder_outputs = self.mbert_encoder(
                input_ids=en_input_ids,
                attention_mask=en_attention_mask
            )

        memory = encoder_outputs.last_hidden_state
        # Key-padding masks mark positions to ignore, the inverse of the
        # tokenizer's attention mask.
        memory_mask = ~en_attention_mask.bool()

        decoder_input = bn_input_ids[:, :-1]
        decoder_input = torch.clamp(decoder_input, 0, self.vocab_size - 1)

        decoder_output = self.decoder(
            tgt=decoder_input,
            memory=memory,
            memory_mask=memory_mask
        )

        return decoder_output

def create_sample_data(path='english_to_bangla.csv'):
    """Write a ten-pair English/Bengali sample corpus to ``path`` and return it.

    Args:
        path: destination CSV file. Defaults to the name used previously, so
            calls without arguments behave as before.

    Returns:
        DataFrame with columns ``en`` and ``bn``.
    """
    sample_data = {
        'en': [
            "Hello, how are you?",
            "I love you.",
            "What is your name?",
            "Good morning.",
            "Thank you very much.",
            "The weather is nice today.",
            "I am fine.",
            "Where are you from?",
            "How old are you?",
            "Nice to meet you."
        ],
        'bn': [
            "হ্যালো, আপনি কেমন আছেন?",
            "আমি তোমাকে ভালোবাসি।",
            "আপনার নাম কি?",
            "সুপ্রভাত।",
            "আপনাকে অনেক ধন্যবাদ।",
            "আজ আবহাওয়া সুন্দর।",
            "আমি ভালো আছি।",
            "আপনি কোথা থেকে এসেছেন?",
            "আপনার বয়স কত?",
            "আপনার সাথে দেখা করে ভালো লাগলো।"
        ]
    }

    df = pd.DataFrame(sample_data)
    df.to_csv(path, index=False)
    return df

def load_data(config):
    """Load the translation pairs named by ``config['data_path']``.

    If the file does not exist, the built-in sample corpus is written to that
    same path and used instead. When ``config['max_translation_pairs']`` is
    greater than zero, only that many leading rows are kept.

    Returns:
        DataFrame of translation pairs.
    """
    data_path = config['data_path']

    if not os.path.exists(data_path):
        print(f"Creating sample data at {data_path}")
        # Write to the configured location. Previously the sample file always
        # landed in 'english_to_bangla.csv', so a custom data_path was never
        # created and the sample data was regenerated on every call.
        df = create_sample_data(data_path)
    else:
        df = pd.read_csv(data_path)

    if config['max_translation_pairs'] > 0:
        df = df.head(config['max_translation_pairs'])
        print(f"Loaded {len(df)} translation pairs (limited by max_translation_pairs)")
    else:
        print(f"Loaded {len(df)} translation pairs")

    return df

def validate_model(model, val_loader, criterion, config):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is put in eval mode and gradients are disabled. A batch that
    raises is reported and skipped. If no batch succeeds, a warning is
    printed and ``float('inf')`` is returned.

    Args:
        model: callable taking (en ids, en mask, bn ids, bn mask) and
            returning logits for every target position except the first.
        val_loader: iterable of batches keyed like TranslationDataset items.
        criterion: loss taking (flattened logits, flattened targets).
        config: mapping providing ``device``.
    """
    batch_keys = ('en_input_ids', 'en_attention_mask', 'bn_input_ids', 'bn_attention_mask')

    model.eval()
    loss_sum = 0
    batches_ok = 0

    with torch.no_grad():
        for batch in val_loader:
            try:
                en_ids, en_mask, bn_ids, bn_mask = [
                    batch[key].to(config['device']) for key in batch_keys
                ]

                logits = model(en_ids, en_mask, bn_ids, bn_mask)
                vocab = logits.size(-1)

                # Targets are the inputs shifted left by one position; ids
                # outside the logit range are clamped into it.
                gold = torch.clamp(bn_ids[:, 1:].contiguous(), 0, vocab - 1)
                flat_logits = logits.contiguous().view(-1, vocab)

                batch_loss = criterion(flat_logits, gold.view(-1))
                loss_sum += batch_loss.item()
                batches_ok += 1
            except Exception as e:
                print(f"Error in validation batch: {e}")
                continue

    if batches_ok == 0:
        print("Warning: No valid validation batches processed")
        return float('inf')

    return loss_sum / batches_ok

def train_model(model, train_loader, val_loader, config):
    """Train ``model`` with Adam and early stopping; return it with its best weights.

    Each epoch runs one pass over ``train_loader`` (cross-entropy with token
    id 0 ignored, gradient norm clipped to 1.0) and then evaluates on
    ``val_loader``. The weights from the epoch with the lowest validation
    loss are restored before returning.

    Args:
        model: module taking (en ids, en mask, bn ids, bn mask) and returning
            logits for every target position except the first.
        train_loader, val_loader: iterables of batches keyed like
            TranslationDataset items.
        config: mapping providing ``learning_rate``, ``early_stop_patience``,
            ``early_stop_min_delta``, ``num_epochs`` and ``device``.

    Returns:
        The same ``model`` object.
    """
    import copy

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    early_stopping = EarlyStopping(
        patience=config['early_stop_patience'],
        min_delta=config['early_stop_min_delta']
    )

    best_model_state = None
    best_val_loss = float('inf')

    print(f"Starting training with early stopping (patience={config['early_stop_patience']}, min_delta={config['early_stop_min_delta']})")

    for epoch in range(config['num_epochs']):
        model.train()
        total_loss = 0
        valid_batches = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{config["num_epochs"]}')

        for batch in progress_bar:
            try:
                en_input_ids = batch['en_input_ids'].to(config['device'])
                en_attention_mask = batch['en_attention_mask'].to(config['device'])
                bn_input_ids = batch['bn_input_ids'].to(config['device'])
                bn_attention_mask = batch['bn_attention_mask'].to(config['device'])

                optimizer.zero_grad()

                outputs = model(en_input_ids, en_attention_mask, bn_input_ids, bn_attention_mask)

                # Targets are the inputs shifted left by one position.
                targets = bn_input_ids[:, 1:].contiguous()
                targets = torch.clamp(targets, 0, outputs.size(-1) - 1)
                outputs = outputs.contiguous()

                loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                total_loss += loss.item()
                valid_batches += 1
                progress_bar.set_postfix({'loss': loss.item()})
            except Exception as e:
                # Best effort: report the failing batch and keep going.
                print(f"Error in training batch: {e}")
                continue

        if valid_batches == 0:
            print("Warning: No valid training batches processed")
            continue

        avg_train_loss = total_loss / valid_batches
        avg_val_loss = validate_model(model, val_loader, criterion, config)

        print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Best Val Loss: {best_val_loss:.4f}')

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() returns tensors that share storage with the live
            # parameters, so a shallow .copy() would silently follow every
            # later optimizer step and restoring it would be a no-op. A deep
            # copy freezes the weights as they are now (on their current
            # device, at the cost of one extra copy of the model).
            best_model_state = copy.deepcopy(model.state_dict())
            print(f"New best model saved with validation loss: {best_val_loss:.4f}")

        if early_stopping(avg_val_loss):
            print(f'Early stopping triggered at epoch {epoch+1}')
            print(f'Training stopped early. Best validation loss: {early_stopping.best_loss:.4f}')
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f'Loaded best model with validation loss: {best_val_loss:.4f}')
    else:
        print("Warning: No best model state saved")

    return model

def translate_text(model, text, tokenizer, config, max_length=50):
    """Greedy-decode a translation of ``text``.

    Args:
        model: translator exposing ``mbert_encoder``, ``decoder`` and
            ``vocab_size``.
        text: source sentence.
        tokenizer: tokenizer providing ``cls_token_id``, ``sep_token_id`` and
            ``decode``; the CLS id starts the output and the SEP id ends it.
        config: mapping providing ``max_sentence_length`` and ``device``.
        max_length: maximum number of tokens to generate. It is capped at
            ``config['max_sentence_length']``, the number of positions the
            decoder is built with.

    Returns:
        The decoded string, or ``"Translation failed: <text>"`` if anything
        raises during encoding or decoding (the error is printed).
    """
    model.eval()

    with torch.no_grad():
        try:
            en_encoding = tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=config['max_sentence_length'],
                return_tensors='pt'
            )

            en_input_ids = en_encoding['input_ids'].to(config['device'])
            en_attention_mask = en_encoding['attention_mask'].to(config['device'])

            encoder_outputs = model.mbert_encoder(
                input_ids=en_input_ids,
                attention_mask=en_attention_mask
            )

            memory = encoder_outputs.last_hidden_state
            # True marks padded source positions the decoder must ignore.
            memory_mask = ~en_attention_mask.bool()

            decoder_input = torch.tensor([[tokenizer.cls_token_id]],
                                       device=config['device'])

            # Step k feeds k tokens to the decoder. Its positional table holds
            # max_sentence_length positions, so more steps than that would
            # raise and discard the whole translation.
            max_steps = min(max_length, config['max_sentence_length'])

            for _ in range(max_steps):
                decoder_input_clamped = torch.clamp(decoder_input, 0, model.vocab_size - 1)

                decoder_output = model.decoder(
                    tgt=decoder_input_clamped,
                    memory=memory,
                    memory_mask=memory_mask
                )

                # Greedy choice: the most likely token after the last position.
                next_token_logits = decoder_output[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1)

                decoder_input = torch.cat([decoder_input, next_token.unsqueeze(1)], dim=1)

                if next_token.item() == tokenizer.sep_token_id:
                    break

            # Index the single batch row instead of squeeze(), which collapses
            # a one-token sequence to a bare int.
            translated_tokens = decoder_input[0].tolist()
            translated_text = tokenizer.decode(translated_tokens, skip_special_tokens=True)

            return translated_text
        except Exception as e:
            print(f"Error translating text: {e}")
            return f"Translation failed: {text}"

In [4]:
# End-to-end run: load the pairs, build the datasets and loaders, train, save.
print("Loading data...")
df = load_data(config)

print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

print("Preparing datasets...")
# Fixed random_state keeps the 80/20 train/validation split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = TranslationDataset(
    train_df['en'].tolist(),
    train_df['bn'].tolist(),
    tokenizer,
    config['max_sentence_length']
)

val_dataset = TranslationDataset(
    val_df['en'].tolist(),
    val_df['bn'].tolist(),
    tokenizer,
    config['max_sentence_length']
)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

print("Initializing model...")
model = EnglishToBengaliTranslator(config).to(config['device'])

print("Training model...")
model = train_model(model, train_loader, val_loader, config)

print("Saving model...")
torch.save(model.state_dict(), 'en_bn_translator.pth')
# This line was indented under a top-level statement, which is an
# IndentationError and stopped the whole cell from running.
print("-" * 50)

Loading data...
Loaded 10 translation pairs (limited by max_translation_pairs)
Initializing tokenizer...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Preparing datasets...
Initializing model...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Training model...
Starting training with early stopping (patience=3, min_delta=0.001)


Epoch 1/1000: 100%|██████████| 1/1 [00:31<00:00, 31.86s/it, loss=11.9]


Epoch 1, Train Loss: 11.9056, Val Loss: 11.6073, Best Val Loss: inf
New best model saved with validation loss: 11.6073


Epoch 2/1000: 100%|██████████| 1/1 [00:34<00:00, 34.91s/it, loss=11.6]


Epoch 2, Train Loss: 11.5919, Val Loss: 11.4230, Best Val Loss: 11.6073
New best model saved with validation loss: 11.4230


Epoch 3/1000: 100%|██████████| 1/1 [00:25<00:00, 25.76s/it, loss=11.3]


Epoch 3, Train Loss: 11.3482, Val Loss: 11.2520, Best Val Loss: 11.4230
New best model saved with validation loss: 11.2520


Epoch 4/1000: 100%|██████████| 1/1 [00:29<00:00, 29.85s/it, loss=11.1]


Epoch 4, Train Loss: 11.1203, Val Loss: 11.0927, Best Val Loss: 11.2520
New best model saved with validation loss: 11.0927


Epoch 5/1000: 100%|██████████| 1/1 [00:25<00:00, 25.34s/it, loss=10.9]


Epoch 5, Train Loss: 10.9231, Val Loss: 10.9468, Best Val Loss: 11.0927
New best model saved with validation loss: 10.9468


Epoch 6/1000: 100%|██████████| 1/1 [00:28<00:00, 28.25s/it, loss=10.8]


Epoch 6, Train Loss: 10.7577, Val Loss: 10.8129, Best Val Loss: 10.9468
New best model saved with validation loss: 10.8129


Epoch 7/1000: 100%|██████████| 1/1 [00:25<00:00, 25.18s/it, loss=10.6]


Epoch 7, Train Loss: 10.5693, Val Loss: 10.6922, Best Val Loss: 10.8129
New best model saved with validation loss: 10.6922


Epoch 8/1000: 100%|██████████| 1/1 [00:28<00:00, 28.53s/it, loss=10.4]


Epoch 8, Train Loss: 10.3834, Val Loss: 10.5800, Best Val Loss: 10.6922
New best model saved with validation loss: 10.5800


Epoch 9/1000: 100%|██████████| 1/1 [00:24<00:00, 24.94s/it, loss=10.2]


Epoch 9, Train Loss: 10.2402, Val Loss: 10.4750, Best Val Loss: 10.5800
New best model saved with validation loss: 10.4750


Epoch 10/1000: 100%|██████████| 1/1 [00:25<00:00, 25.14s/it, loss=10.1]


Epoch 10, Train Loss: 10.0719, Val Loss: 10.3768, Best Val Loss: 10.4750
New best model saved with validation loss: 10.3768


Epoch 11/1000: 100%|██████████| 1/1 [00:24<00:00, 24.97s/it, loss=9.95]


Epoch 11, Train Loss: 9.9467, Val Loss: 10.2835, Best Val Loss: 10.3768
New best model saved with validation loss: 10.2835


Epoch 12/1000: 100%|██████████| 1/1 [00:25<00:00, 25.12s/it, loss=9.83]


Epoch 12, Train Loss: 9.8315, Val Loss: 10.1960, Best Val Loss: 10.2835
New best model saved with validation loss: 10.1960


Epoch 13/1000: 100%|██████████| 1/1 [00:25<00:00, 25.19s/it, loss=9.69]


Epoch 13, Train Loss: 9.6939, Val Loss: 10.1145, Best Val Loss: 10.1960
New best model saved with validation loss: 10.1145


Epoch 14/1000: 100%|██████████| 1/1 [00:26<00:00, 26.93s/it, loss=9.56]


Epoch 14, Train Loss: 9.5603, Val Loss: 10.0383, Best Val Loss: 10.1145
New best model saved with validation loss: 10.0383


Epoch 15/1000: 100%|██████████| 1/1 [00:24<00:00, 24.38s/it, loss=9.45]


Epoch 15, Train Loss: 9.4512, Val Loss: 9.9665, Best Val Loss: 10.0383
New best model saved with validation loss: 9.9665


Epoch 16/1000: 100%|██████████| 1/1 [00:24<00:00, 24.71s/it, loss=9.35]


Epoch 16, Train Loss: 9.3479, Val Loss: 9.8996, Best Val Loss: 9.9665
New best model saved with validation loss: 9.8996


Epoch 17/1000: 100%|██████████| 1/1 [00:24<00:00, 24.96s/it, loss=9.23]


Epoch 17, Train Loss: 9.2259, Val Loss: 9.8371, Best Val Loss: 9.8996
New best model saved with validation loss: 9.8371


Epoch 18/1000: 100%|██████████| 1/1 [00:25<00:00, 25.11s/it, loss=9.15]


Epoch 18, Train Loss: 9.1544, Val Loss: 9.7762, Best Val Loss: 9.8371
New best model saved with validation loss: 9.7762


Epoch 19/1000: 100%|██████████| 1/1 [00:24<00:00, 24.95s/it, loss=9.09]


Epoch 19, Train Loss: 9.0896, Val Loss: 9.7195, Best Val Loss: 9.7762
New best model saved with validation loss: 9.7195


Epoch 20/1000:   0%|          | 0/1 [00:25<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Qualitative check: greedy-decode a handful of English sentences with the
# trained model and print each source/translation pair.
print("\nTesting translations...")
# NOTE(review): several sentences appear twice in this list, so their
# translations are printed twice.
test_sentences = [
    "a child in a pink dress is climbing up a set of stairs in an entry way .",
    "a girl going into a wooden building .",
    "a dog is running in the snow",
    "a dog running",
    "Hello, how are you?",
    "a man in an orange hat starring at something .",
    "I love you.",
    "a little girl climbing into a wooden playhouse .",
    "What is your name?",
    "two dogs of different breeds looking at each other on the road .",
    "Good morning.",
    "Thank you very much.",
    "Hello, how are you?",
    "I love you.",
    "What is your name?",
    "Good morning.",
    "Thank you very much.",
    "The weather is nice today."
    ]

# translate_text already catches exceptions raised while encoding or decoding
# and returns a "Translation failed: ..." string, so this handler only sees
# errors raised outside that inner try block.
for sentence in test_sentences:
    try:
        translation = translate_text(model, sentence, tokenizer, config)
        print(f"English: {sentence}")
        print(f"Bengali: {translation}")
        print("-" * 50)
    except Exception as e:
        print(f"Error translating '{sentence}': {e}")



Testing translations...
English: a child in a pink dress is climbing up a set of stairs in an entry way .
Bengali: একটি
--------------------------------------------------
English: a girl going into a wooden building .
Bengali: একটি ক ক ক ক ক ক ক ক
--------------------------------------------------
English: a dog is running in the snow
Bengali: একটি ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক
--------------------------------------------------
English: a dog running
Bengali: একটি ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক ক
--------------------------------------------------
