In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split

# Load and preprocess the dataset
lines = pd.read_csv('/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv')
lines = lines[:30000]

# Lowercase all characters
lines['english'] = lines['english'].apply(lambda x: str(x).lower().strip())
lines['hindi'] = lines['hindi'].apply(lambda x: str(x).lower().strip())

# Remove quotes and special characters
lines['english'] = lines['english'].apply(lambda x: re.sub("'", '', x))
lines['hindi'] = lines['hindi'].apply(lambda x: re.sub("'", '', x))
exclude = set(string.punctuation)
lines['english'] = lines['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi'] = lines['hindi'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Remove digits
remove_digits = str.maketrans('', '', string.digits)
lines['english'] = lines['english'].apply(lambda x: x.translate(remove_digits))
lines['hindi'] = lines['hindi'].apply(lambda x: x.translate(remove_digits))

# Remove extra spaces and add start and end tokens to target sequences
lines['english'] = lines['english'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi'] = lines['hindi'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi'] = lines['hindi'].apply(lambda x: 'START_ ' + x + ' _END')

# Display a few examples to check preprocessing
print(lines.head())


# Split the data into train, validation, and test sets
X, y = lines['english'], lines['hindi']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Build vocabulary
all_eng_words = set(word for eng in X_train for word in eng.split())
all_hindi_words = set(word for hin in y_train for word in hin.split())

# Create word to index and index to word dictionaries for both languages
input_token_index = {word: i + 1 for i, word in enumerate(sorted(all_eng_words))}
target_token_index = {word: i + 1 for i, word in enumerate(sorted(all_hindi_words))}
reverse_input_token_index = {i: word for word, i in input_token_index.items()}
reverse_target_token_index = {i: word for word, i in target_token_index.items()}

# Set the maximum sequence lengths
max_length_src = max(len(eng.split()) for eng in X_train)
max_length_tar = max(len(hin.split()) for hin in y_train)

# Print stats
print(f"Max length of English Sentence: {max_length_src}")
print(f"Max length of Hindi Sentence: {max_length_tar}")
print(f"Total English Words: {len(input_token_index)}")
print(f"Total Hindi Words: {len(target_token_index)}")


import torch
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, X, y, input_token_index, target_token_index, max_length_src, max_length_tar):
        self.X = X
        self.y = y
        self.input_token_index = input_token_index
        self.target_token_index = target_token_index
        self.max_length_src = max_length_src
        self.max_length_tar = max_length_tar

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        input_seq = self.X.iloc[idx]
        target_seq = self.y.iloc[idx]

        # Tokenize and pad/truncate sequences
        encoder_input_data = torch.zeros(self.max_length_src, dtype=torch.long)
        decoder_input_data = torch.zeros(self.max_length_tar, dtype=torch.long)
        decoder_target_data = torch.zeros(self.max_length_tar, dtype=torch.long)

        # Populate encoder_input_data with the input sequence tokens
        for t, word in enumerate(input_seq.split()[:self.max_length_src]):  # Truncate if longer than max_length_src
            encoder_input_data[t] = self.input_token_index.get(word, 0)  # 0 for unknown words

        # Populate decoder_input_data and decoder_target_data with the target sequence tokens
        target_words = target_seq.split()[:self.max_length_tar]  # Truncate if longer than max_length_tar
        for t, word in enumerate(target_words):
            decoder_input_data[t] = self.target_token_index.get(word, 0)  # 0 for unknown words
            if t > 0:
                decoder_target_data[t - 1] = self.target_token_index.get(word, 0)

        return encoder_input_data, decoder_input_data, decoder_target_data

# Create DataLoader for train, validation, and test
train_dataset = TranslationDataset(X_train, y_train, input_token_index, target_token_index, max_length_src, max_length_tar)
val_dataset = TranslationDataset(X_val, y_val, input_token_index, target_token_index, max_length_src, max_length_tar)
test_dataset = TranslationDataset(X_test, y_test, input_token_index, target_token_index, max_length_src, max_length_tar)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

                                               hindi  \
0  START_ अपने अनुप्रयोग को पहुंचनीयता व्यायाम का...   
1        START_ एक्सेर्साइसर पहुंचनीयता अन्वेषक _END   
2   START_ निचले पटल के लिए डिफोल्ट प्लगइन खाका _END   
3    START_ ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका _END   
4  START_ उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप...   

                                          english  
0  give your application an accessibility workout  
1               accerciser accessibility explorer  
2  the default plugin layout for the bottom panel  
3     the default plugin layout for the top panel  
4  a list of plugins that are disabled by default  
Max length of English Sentence: 62
Max length of Hindi Sentence: 46
Total English Words: 2518
Total Hindi Words: 3208


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        
        return F.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(hidden_dim * 2 + emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim * 3 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        
        a = self.attention(hidden[-1], encoder_outputs)
        a = a.unsqueeze(1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        
        return prediction, hidden, cell


class Seq2SeqAttention(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        
        # Adjust hidden and cell for the decoder
        hidden = hidden[-self.decoder.lstm.num_layers:]
        cell = cell[-self.decoder.lstm.num_layers:]
        
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        
        return outputs

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import random
from tqdm import tqdm
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu

In [5]:
# After the loader section and the new model definition, update the training section:

# Hyperparameters
INPUT_DIM = len(input_token_index) + 1
OUTPUT_DIM = len(target_token_index) + 1
ENC_EMB_DIM = 512
DEC_EMB_DIM = 512
HIDDEN_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Model components
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
attn = Attention(HIDDEN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT, attn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2SeqAttention(enc, dec, device).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

# Initialize weights
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the padding index

# Modify the train_model function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, teacher_forcing_ratio=0.5, clip=1.0):
    global best_loss, best_model_state, no_improvement_count

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for encoder_input_data, decoder_input_data, decoder_target_data in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            encoder_input_data = encoder_input_data.transpose(0, 1).to(device)
            decoder_input_data = decoder_input_data.transpose(0, 1).to(device)
            decoder_target_data = decoder_target_data.transpose(0, 1).to(device)

            optimizer.zero_grad()

            # Forward pass
            output = model(encoder_input_data, decoder_input_data, teacher_forcing_ratio)

            # Compute the loss
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            decoder_target_data = decoder_target_data[1:].reshape(-1)
            loss = criterion(output, decoder_target_data)

            # Backward pass and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}')

        # Validation step
        model.eval()
        val_bleu = evaluate_bleu_score(model, val_loader, reverse_target_token_index)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Val BLEU Score: {val_bleu:.4f}')

        # Check if the loss has improved
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_model_state = model.state_dict()
            no_improvement_count = 0
            print("Loss improved, saving model...")
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print("No improvement for several epochs, reducing learning rate...")
                model.load_state_dict(best_model_state)
                lr = optimizer.param_groups[0]['lr'] * lr_factor
                optimizer.param_groups[0]['lr'] = lr
                print(f"Learning rate reduced to {lr:.6f}")
                no_improvement_count = 0

                if lr < 1e-6:
                    print("Learning rate too low, stopping training.")
                    break

    return val_bleu

# Modify the evaluate_bleu_score function
def evaluate_bleu_score(model, data_loader, reverse_target_token_index, pad_idx=0):
    model.eval()
    total_bleu_score = 0
    smoothing_function = SmoothingFunction().method4
    with torch.no_grad():
        for encoder_input_data, decoder_input_data, decoder_target_data in tqdm(data_loader, desc="Evaluating"):
            encoder_input_data = encoder_input_data.transpose(0, 1).to(device)
            decoder_input_data = decoder_input_data.transpose(0, 1).to(device)

            # Get model predictions
            outputs = model(encoder_input_data, decoder_input_data, teacher_forcing_ratio=0.0)

            # Convert outputs to words
            predictions = torch.argmax(outputs, dim=2)
            predicted_sentences = []
            actual_sentences = []

            for sent in predictions.transpose(0, 1):
                sentence = []
                for idx in sent:
                    if idx.item() == pad_idx:
                        continue
                    word = reverse_target_token_index.get(idx.item(), '<UNK>')
                    sentence.append(word)
                predicted_sentences.append(sentence)

            for sent in decoder_target_data:
                sentence = []
                for idx in sent:
                    if idx.item() == pad_idx:
                        continue
                    word = reverse_target_token_index.get(idx.item(), '<UNK>')
                    sentence.append(word)
                actual_sentences.append(sentence)

            # Calculate BLEU score with smoothing
            for pred_sentence, actual_sentence in zip(predicted_sentences, actual_sentences):
                total_bleu_score += sentence_bleu([actual_sentence], pred_sentence, smoothing_function=smoothing_function)

    avg_bleu_score = total_bleu_score / len(data_loader.dataset)
    return avg_bleu_score

# Train the model
best_loss = float('inf')
patience = 3
lr_factor = 0.5
num_epochs = 10

# Tune learning rate on validation data
learning_rates = [0.0001]
best_bleu = 0
best_lr = 0.001

for lr in learning_rates:
    optimizer = optim.Adam(model.parameters(), lr=lr)
    print(f'\nTraining with learning rate: {lr}')
    val_bleu = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5, teacher_forcing_ratio=0.5, clip=1.0)

    if val_bleu > best_bleu:
        best_bleu = val_bleu
        best_lr = lr

print(f'Best learning rate: {best_lr} with BLEU Score: {best_bleu:.4f}')

# Train with the best learning rate
optimizer = optim.Adam(model.parameters(), lr=best_lr)
val_bleu = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=num_epochs, teacher_forcing_ratio=0.5, clip=1.0)

# Evaluate the model on the test data
test_bleu_score = evaluate_bleu_score(model, test_loader, reverse_target_token_index)
print(f'BLEU Score on Test Data: {test_bleu_score:.4f}')

The model has 27,097,225 trainable parameters

Training with learning rate: 0.0001


Epoch 1/5: 100%|██████████| 329/329 [02:36<00:00,  2.10it/s]


Epoch [1/5], Loss: 5.1311


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.89it/s]


Epoch [1/5], Val BLEU Score: 0.1631
Loss improved, saving model...


Epoch 2/5: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [2/5], Loss: 4.4009


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.85it/s]


Epoch [2/5], Val BLEU Score: 0.1650
Loss improved, saving model...


Epoch 3/5: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [3/5], Loss: 4.1280


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.85it/s]


Epoch [3/5], Val BLEU Score: 0.1647
Loss improved, saving model...


Epoch 4/5: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [4/5], Loss: 3.9579


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.86it/s]


Epoch [4/5], Val BLEU Score: 0.1648
Loss improved, saving model...


Epoch 5/5: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [5/5], Loss: 3.8198


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.86it/s]


Epoch [5/5], Val BLEU Score: 0.1652
Loss improved, saving model...
Best learning rate: 0.0001 with BLEU Score: 0.1652


Epoch 1/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [1/10], Loss: 3.7537


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.86it/s]


Epoch [1/10], Val BLEU Score: 0.1647
Loss improved, saving model...


Epoch 2/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [2/10], Loss: 3.5755


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.87it/s]


Epoch [2/10], Val BLEU Score: 0.1630
Loss improved, saving model...


Epoch 3/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [3/10], Loss: 3.4365


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.87it/s]


Epoch [3/10], Val BLEU Score: 0.1628
Loss improved, saving model...


Epoch 4/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [4/10], Loss: 3.2993


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.87it/s]


Epoch [4/10], Val BLEU Score: 0.1612
Loss improved, saving model...


Epoch 5/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [5/10], Loss: 3.1732


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.86it/s]


Epoch [5/10], Val BLEU Score: 0.1610
Loss improved, saving model...


Epoch 6/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [6/10], Loss: 3.0354


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.84it/s]


Epoch [6/10], Val BLEU Score: 0.1590
Loss improved, saving model...


Epoch 7/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [7/10], Loss: 2.9118


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.86it/s]


Epoch [7/10], Val BLEU Score: 0.1591
Loss improved, saving model...


Epoch 8/10: 100%|██████████| 329/329 [02:41<00:00,  2.04it/s]


Epoch [8/10], Loss: 2.7992


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.85it/s]


Epoch [8/10], Val BLEU Score: 0.1572
Loss improved, saving model...


Epoch 9/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [9/10], Loss: 2.6960


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.84it/s]


Epoch [9/10], Val BLEU Score: 0.1548
Loss improved, saving model...


Epoch 10/10: 100%|██████████| 329/329 [02:40<00:00,  2.05it/s]


Epoch [10/10], Loss: 2.5930


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.85it/s]


Epoch [10/10], Val BLEU Score: 0.1556
Loss improved, saving model...


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.84it/s]

BLEU Score on Test Data: 0.1554





In [6]:
import torch
import random

# Function to save model parameters
def save_model(model, optimizer, epoch, loss, filename):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, filename)

# Save the best model
save_model(model, optimizer, num_epochs, best_loss, 'best_model.pth')
print("Model saved successfully.")

# Function to load model parameters
def load_model(model, optimizer, filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return model, optimizer, epoch, loss

# To load the model later, you can use:
loaded_model, loaded_optimizer, loaded_epoch, loaded_loss = load_model(model, optimizer, 'best_model.pth')
print(f"Loaded model from epoch {loaded_epoch} with loss {loaded_loss}")

 

Model saved successfully.
Loaded model from epoch 10 with loss 2.5930108085591743


  checkpoint = torch.load(filename)


In [6]:
# Function to translate a sentence
def translate_sentence(model, sentence, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device):
    model.eval()
    with torch.no_grad():
        # Tokenize and pad the input sentence
        encoder_input = torch.zeros(max_length_src, 1, dtype=torch.long).to(device)
        for t, word in enumerate(sentence.split()[:max_length_src]):
            encoder_input[t, 0] = input_token_index.get(word, 0)
        
        # Generate the translation
        decoder_input = torch.tensor([[target_token_index['START_']]]).to(device)
        decoded_sentence = []
        
        encoder_outputs, hidden, cell = model.encoder(encoder_input)
        
        # Adjust hidden and cell for the decoder by combining the bidirectional outputs
        hidden = hidden[-model.decoder.lstm.num_layers:] + hidden[:model.decoder.lstm.num_layers]
        cell = cell[-model.decoder.lstm.num_layers:] + cell[:model.decoder.lstm.num_layers]
        
        for _ in range(max_length_tar):
            output, hidden, cell = model.decoder(decoder_input.squeeze(0), hidden, cell, encoder_outputs)
            predicted_token = output.argmax(1).item()
            decoded_word = reverse_target_token_index.get(predicted_token, '<UNK>')
            decoded_sentence.append(decoded_word)
            
            if decoded_word == '_END':
                break
            
            decoder_input = torch.tensor([[predicted_token]], device=device)
    
    return ' '.join(decoded_sentence)

# Evaluate BLEU score on test set
test_bleu_score = evaluate_bleu_score(model, test_loader, reverse_target_token_index)
print(f'BLEU Score on Test Data: {test_bleu_score:.4f}')

# Show random translation examples
num_examples = 5
random_indices = random.sample(range(len(X_test)), num_examples)
for idx in random_indices:
    input_sentence = X_test.iloc[idx]
    target_sentence = y_test.iloc[idx]
    translated_sentence = translate_sentence(model, input_sentence, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device)
    
    print(f"\nExample {idx + 1}:")
    print(f"Input (English): {input_sentence}")
    print(f"Target (Hindi): {target_sentence}")
    print(f"Model Output: {translated_sentence}")


Evaluating: 100%|██████████| 71/71 [00:18<00:00,  3.80it/s]

BLEU Score on Test Data: 0.1180

Example 1643:
Input (English): move a onto the red joker
Target (Hindi): START_  a को लाल जोकर पर ले जाएँ _END
Model Output: को लाल पर पर ले जाएँ _END

Example 4076:
Input (English): file url to remove
Target (Hindi): START_ फ़ाइल url को हटाएँः _END
Model Output: url हटाएँः हटाएँः _END

Example 3956:
Input (English):  edit
Target (Hindi): START_ संपादन e _END
Model Output: करेंmenu _END

Example 3572:
Input (English):  properties
Target (Hindi): START_ गुण p _END
Model Output: गुण _END

Example 1288:
Input (English): target name
Target (Hindi): START_ परियोजना नामः _END
Model Output: नामः _END





In [7]:
# Show random translation examples with at least 10 tokens
num_examples = 5
filtered_indices = [idx for idx, sentence in enumerate(X_test) if len(sentence.split()) >= 10]
random_indices = random.sample(filtered_indices, num_examples)

for idx in random_indices:
    input_sentence = X_test.iloc[idx]
    target_sentence = y_test.iloc[idx]
    translated_sentence = translate_sentence(model, input_sentence, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device)
    
    print(f"\nExample {idx + 1}:")
    print(f"Input (English): {input_sentence}")
    print(f"Target (Hindi): {target_sentence}")
    print(f"Model Output: {translated_sentence}")



Example 207:
Input (English): place the jack of diamonds next to the ten of diamonds
Target (Hindi): START_ ईंट का गुलाम के बगल में ईंट का दहला को रखें _END
Model Output: का गुलाम के बगल में ईंट का दहला को रखें _END

Example 1410:
Input (English): project name it can contain spaces by example gnu autoconf
Target (Hindi): START_ परियोजना नाम द्वारा उदाहरण _END
Model Output: नाम नहीं नाम उदाहरण से _END

Example 1680:
Input (English): a list of uris for partitions to be excluded from scanning
Target (Hindi): START_ स्कैनिंग से अलग करने के लिए विभाजन के लिए uris की सूचि _END
Model Output: से अलग के लिए द्वारा द्वारा से के लिए की सूचि _END

Example 1441:
Input (English): place the queen of spades next to the jack of spades
Target (Hindi): START_ हुकुम की बेगम के बगल में हुकुम के गुलाम को रखें _END
Model Output: की बेगम के बगल में हुकुम के गुलाम को रखें _END

Example 1592:
Input (English): cannot compile s no compile rule defined for this file type
Target (Hindi): START_ से नहीं नियम के लिए

In [39]:
def beam_search(model, sentence, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device, beam_size=3):
    model.eval()
    with torch.no_grad():
        # Tokenize and pad the input sentence
        encoder_input = torch.zeros(max_length_src, 1, dtype=torch.long).to(device)
        for t, word in enumerate(sentence.split()[:max_length_src]):
            encoder_input[t, 0] = input_token_index.get(word, 0)
        
        # Encode the input sentence
        encoder_outputs, hidden, cell = model.encoder(encoder_input)
        
        # Adjust hidden and cell for the decoder
        hidden = hidden[-model.decoder.lstm.num_layers:] + hidden[:model.decoder.lstm.num_layers]
        cell = cell[-model.decoder.lstm.num_layers:] + cell[:model.decoder.lstm.num_layers]
        
        # Initialize the beam
        beam = [(torch.tensor([[target_token_index['START_']]], device=device), hidden, cell, [target_token_index['START_']], 0)]
        
        for _ in range(max_length_tar):
            candidates = []
            for decoder_input, hidden, cell, sequence, score in beam:
                if sequence[-1] == target_token_index['_END']:
                    candidates.append((decoder_input, hidden, cell, sequence, score))
                    continue
                
                output, new_hidden, new_cell = model.decoder(decoder_input.squeeze(0), hidden, cell, encoder_outputs)
                log_probs = F.log_softmax(output, dim=1)
                top_log_probs, top_indices = log_probs.topk(beam_size)
                
                for i in range(beam_size):
                    token = top_indices[0][i].item()
                    new_score = score + top_log_probs[0][i].item()
                    new_sequence = sequence + [token]
                    candidates.append((torch.tensor([[token]], device=device), new_hidden, new_cell, new_sequence, new_score))
            
            # Select the top beam_size candidates
            beam = sorted(candidates, key=lambda x: x[4], reverse=True)[:beam_size]
            
            # Check if all beams have ended
            if all(sequence[-1] == target_token_index['_END'] for _, _, _, sequence, _ in beam):
                break
    
    # Return the best sequence or an empty list if no valid translation
    if beam:
        best_sequence = beam[0][3]
        decoded_sentence = [reverse_target_token_index.get(token, '<UNK>') for token in best_sequence]
        return ' '.join(decoded_sentence[1:-1])  # Exclude the 'START_' and '_END' tokens
    else:
        return ''

In [40]:
def evaluate_bleu_score_beam_search(model, data_loader, input_token_index, target_token_index, reverse_target_token_index, reverse_input_token_index, max_length_src, max_length_tar, device):
    model.eval()
    references = []
    hypotheses = []
    smoothing_function = SmoothingFunction().method4

    with torch.no_grad():
        for encoder_input_data, decoder_input_data, decoder_target_data in data_loader:
            encoder_input_data = encoder_input_data.transpose(0, 1).to(device)
            decoder_target_data = decoder_target_data.transpose(0, 1).to(device)
            
            for i in range(encoder_input_data.size(1)):
                try:
                    input_sentence = ' '.join([reverse_input_token_index.get(idx.item(), '<UNK>') for idx in encoder_input_data[:, i] if idx.item() != 0])
                    target_sentence = ' '.join([reverse_target_token_index.get(idx.item(), '<UNK>') for idx in decoder_target_data[:, i] if idx.item() != 0])
                    
                    translated_sentence = beam_search(model, input_sentence, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device)
                    
                    if translated_sentence:
                        references.append([target_sentence.split()])
                        hypotheses.append(translated_sentence.split())
                except Exception as e:
                    print(f"Error processing sentence {i}: {str(e)}")
                    continue

    if references and hypotheses:
        bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothing_function)
    else:
        bleu_score = 0.0
    return bleu_score

In [41]:
# Evaluate BLEU score on test set using beam search
test_bleu_score_beam = evaluate_bleu_score_beam_search(
    model=model,
    data_loader=test_loader,
    input_token_index=input_token_index,
    target_token_index=target_token_index,
    reverse_target_token_index=reverse_target_token_index,
    reverse_input_token_index=reverse_input_token_index,
    max_length_src=max_length_src,
    max_length_tar=max_length_tar,
    device=device
)

print(f'BLEU Score on Test Data (Beam Search): {test_bleu_score_beam:.4f}')

BLEU Score on Test Data (Beam Search): 0.0766


In [42]:
import random

def translate_random_sentences(model, X_test, y_test, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device, num_examples=5):
    model.eval()
    random_indices = random.sample(range(len(X_test)), num_examples)
    
    print("\nRandom Translation Examples using Beam Search:")
    print("----------------------------------------------")
    
    for idx in random_indices:
        input_sentence = X_test.iloc[idx]
        target_sentence = y_test.iloc[idx]
        
        # Perform beam search translation
        translated_sentence = beam_search(model, input_sentence, input_token_index, target_token_index, reverse_target_token_index, max_length_src, max_length_tar, device)
        
        print(f"\nExample {idx + 1}:")
        print(f"Input (English): {input_sentence}")
        print(f"Target (Hindi): {target_sentence}")
        print(f"Model Output (Beam Search): {translated_sentence}")
        
        # Optionally, you can compute BLEU score for this specific translation
        reference = [target_sentence.split()]
        hypothesis = translated_sentence.split()
        bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=SmoothingFunction().method1)
        print(f"BLEU Score: {bleu_score:.4f}")

# Use the function
translate_random_sentences(
    model=model,
    X_test=X_test,
    y_test=y_test,
    input_token_index=input_token_index,
    target_token_index=target_token_index,
    reverse_target_token_index=reverse_target_token_index,
    max_length_src=max_length_src,
    max_length_tar=max_length_tar,
    device=device,
    num_examples=5  # You can change this number to display more or fewer examples
)


Random Translation Examples using Beam Search:
----------------------------------------------

Example 18:
Input (English): refresh node
Target (Hindi): START_ आसंधि नोड को ताजा करें n _END
Model Output (Beam Search): गए नहीं
BLEU Score: 0.0000

Example 94:
Input (English): not implemented
Target (Hindi): START_ क्रियान्वित नहीं हुआ है _END
Model Output (Beam Search): गए को को
BLEU Score: 0.0000

Example 88:
Input (English): a list of plugins that are disabled by default
Target (Hindi): START_ उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है _END
Model Output (Beam Search): प्लगइनों की सूची जिन्हें डिफोल्ट रूप निष्क्रिय निष्क्रिय किया गया
BLEU Score: 0.4411

Example 146:
Input (English): key
Target (Hindi): START_ कुंजी _END
Model Output (Beam Search): 
BLEU Score: 0.0000

Example 123:
Input (English): col lection
Target (Hindi): START_ संग्रह _END
Model Output (Beam Search): 
BLEU Score: 0.0000
