In [2]:
from datasets import load_dataset
from collections import Counter
import re


In [3]:
data = load_dataset("cfilt/iitb-english-hindi")

README.md: 0.00B [00:00, ?B/s]

dataset_infos.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [5]:
# Convert into lists
english_sentences = [x['en'] for x in data['train']['translation']]
hindi_sentences   = [x['hi'] for x in data['train']['translation']]
english_sentences_val = [x['en'] for x in data['validation']['translation']]
hindi_sentences_val   = [x['hi'] for x in data['validation']['translation']]

print("English:", english_sentences[:5])
print("Hindi:", hindi_sentences[:5])
print("Hindi:", len(hindi_sentences))

English: ['Give your application an accessibility workout', 'Accerciser Accessibility Explorer', 'The default plugin layout for the bottom panel', 'The default plugin layout for the top panel', 'A list of plugins that are disabled by default']
Hindi: ['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'एक्सेर्साइसर पहुंचनीयता अन्वेषक', 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका', 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका', 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है']
Hindi: 1659083


In [6]:
print("Hindi:", len(hindi_sentences_val))

Hindi: 520


In [7]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

def train_tokenizer(sentences, vocab_size=30000, lang="en"):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<PAD>", "<SOS>", "<EOS>", "<UNK>"]
    )
    tokenizer.train_from_iterator(sentences, trainer)
    tokenizer.save(f"{lang}_tokenizer.json")
    return tokenizer

# Collect sentences
english_sentences = [x['translation']['en'] for x in data['train']]
hindi_sentences   = [x['translation']['hi'] for x in data['train']]

src_tokenizer = train_tokenizer(english_sentences, lang="en")
trg_tokenizer = train_tokenizer(hindi_sentences, lang="hi")










In [8]:
src_tokenizer = Tokenizer.from_file("en_tokenizer.json")
trg_tokenizer = Tokenizer.from_file("hi_tokenizer.json")


In [9]:
import torch
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    def __init__(self, data, src_tokenizer, trg_tokenizer, max_len=50):
        self.data = data
        self.src_tok = src_tokenizer
        self.trg_tok = trg_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        idx = int(idx)
        src = self.data[idx]['translation']['en']
        trg = self.data[idx]['translation']['hi']

        # Encode
        src_ids = [self.src_tok.token_to_id("<SOS>")] + self.src_tok.encode(src).ids[:self.max_len-2] + [self.src_tok.token_to_id("<EOS>")]
        trg_ids = [self.trg_tok.token_to_id("<SOS>")] + self.trg_tok.encode(trg).ids[:self.max_len-2] + [self.trg_tok.token_to_id("<EOS>")]

        return torch.tensor(src_ids), torch.tensor(trg_ids)

train_data = list(data['train'])
test_data  = list(data['test'])

train_dataset = TranslationDataset(data['train'], src_tokenizer, trg_tokenizer)
test_dataset  = TranslationDataset(data['test'], src_tokenizer, trg_tokenizer)



In [10]:
val_dataset  = TranslationDataset(data['validation'], src_tokenizer, trg_tokenizer)

In [14]:
from torch.nn.utils.rnn import pad_sequence

pad_idx = src_tokenizer.token_to_id("<PAD>")

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=src_tokenizer.token_to_id("<PAD>"), batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=trg_tokenizer.token_to_id("<PAD>"), batch_first=True)
    return src_batch, trg_batch

In [15]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers=1, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))   # [batch, src_len, embed_dim]
        outputs, hidden = self.rnn(embedded)           # outputs: [batch, src_len, hidden_dim]
        return outputs, hidden


In [16]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [1, batch, hidden_dim]
        hidden = hidden[-1].unsqueeze(1)               # [batch, 1, hidden_dim]
        src_len = encoder_outputs.size(1)

        hidden = hidden.repeat(1, src_len, 1)          # [batch, src_len, hidden_dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)          # [batch, src_len]

        return torch.softmax(attention, dim=1)


In [17]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, attention, num_layers=1, dropout=0.3):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(hidden_dim + embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 + embed_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)                     # [batch, 1]
        embedded = self.dropout(self.embedding(input)) # [batch, 1, embed_dim]

        # Attention
        attn_weights = self.attention(hidden, encoder_outputs)   # [batch, src_len]
        attn_weights = attn_weights.unsqueeze(1)                 # [batch, 1, src_len]

        context = torch.bmm(attn_weights, encoder_outputs)       # [batch, 1, hidden_dim]

        rnn_input = torch.cat((embedded, context), dim=2)        # [batch, 1, embed+hidden]
        output, hidden = self.rnn(rnn_input, hidden)

        prediction = self.fc(torch.cat((output, context, embedded), dim=2).squeeze(1))
        return prediction, hidden


In [18]:
#encoder + attention + decoder
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)
        input = trg[:, 0]   # <SOS> token

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t, :] = output

            top1 = output.argmax(1)
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs


In [19]:
INPUT_DIM = src_tokenizer.get_vocab_size()    # English vocab size
OUTPUT_DIM = trg_tokenizer.get_vocab_size()    # Hindi vocab size
EMBED_DIM = 256
HIDDEN_DIM = 512

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attn = Attention(HIDDEN_DIM)
enc = Encoder(INPUT_DIM, EMBED_DIM, HIDDEN_DIM)
dec = Decoder(OUTPUT_DIM, EMBED_DIM, HIDDEN_DIM, attn)



model = Seq2Seq(enc, dec, device).to(device)
print(model)




Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30000, 256, padding_idx=0)
    (rnn): GRU(256, 512, batch_first=True, dropout=0.3)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1024, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(30000, 256, padding_idx=0)
    (rnn): GRU(768, 512, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1280, out_features=30000, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)


In [20]:
import torch
from torch.utils.data import DataLoader, Subset
import numpy as np
import os

In [21]:
def get_subset(dataset, n_samples,seed=42):
    np.random.seed(seed)
    subset_indices = np.random.choice(len(dataset), n_samples, replace=False)
    return Subset(dataset, subset_indices)

In [22]:
train_dataset_small = get_subset(train_dataset, n_samples=100000)  # 100k pairs
#val_dataset_small = get_subset(val_dataset, n_samples=5000)        # 5k pairs

train_loader = DataLoader(train_dataset_small, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False,collate_fn=collate_fn)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)
print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

Train batches: 1563, Val batches: 9


In [23]:
#pad_idx = hi_word2idx["<PAD>"]
pad_idx = src_tokenizer.token_to_id("<PAD>")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx,label_smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scaler = torch.cuda.amp.GradScaler()
accum_steps = 4  # simulate batch size 128
save_path = "checkpoints"
os.makedirs(save_path, exist_ok=True)

  scaler = torch.cuda.amp.GradScaler()


In [24]:
def train_model(num_epochs):
    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for i, (src, trg) in enumerate(train_loader):
            src, trg = src.to(device), trg.to(device)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                output = model(src, trg[:, :-1])  # shift target
                loss = criterion(output.reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))

            loss = loss / accum_steps
            scaler.scale(loss).backward()

            if (i + 1) % accum_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            total_loss += loss.item()

            # Print progress every 500 steps
            if (i + 1) % 500 == 0:
                print(f"Epoch {epoch+1}, Step {i+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        val_loss = evaluate(val_loader)
        print(f"Epoch {epoch+1} Complete | Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Save checkpoint if improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(save_path, f"best_model_epoch{epoch+1}.pt"))
            print("Saved new best model!")


In [25]:
def evaluate(loader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(device), trg.to(device)
            with torch.cuda.amp.autocast():
                output = model(src, trg[:, :-1])
                loss = criterion(output.reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

In [26]:
train_model(num_epochs=5) 


  with torch.cuda.amp.autocast():


Epoch 1, Step 500/1563, Loss: 1.9456
Epoch 1, Step 1000/1563, Loss: 1.8956
Epoch 1, Step 1500/1563, Loss: 1.8961


  with torch.cuda.amp.autocast():


Epoch 1 Complete | Train Loss: 1.9100, Val Loss: 7.7220
✅ Saved new best model!
Epoch 2, Step 500/1563, Loss: 1.8354
Epoch 2, Step 1000/1563, Loss: 1.8115
Epoch 2, Step 1500/1563, Loss: 1.7707
Epoch 2 Complete | Train Loss: 1.8018, Val Loss: 7.5941
✅ Saved new best model!
Epoch 3, Step 500/1563, Loss: 1.7408
Epoch 3, Step 1000/1563, Loss: 1.6914
Epoch 3, Step 1500/1563, Loss: 1.7792
Epoch 3 Complete | Train Loss: 1.7434, Val Loss: 7.4863
✅ Saved new best model!
Epoch 4, Step 500/1563, Loss: 1.6994
Epoch 4, Step 1000/1563, Loss: 1.6779
Epoch 4, Step 1500/1563, Loss: 1.7028
Epoch 4 Complete | Train Loss: 1.6968, Val Loss: 7.4365
✅ Saved new best model!
Epoch 5, Step 500/1563, Loss: 1.7292
Epoch 5, Step 1000/1563, Loss: 1.6581
Epoch 5, Step 1500/1563, Loss: 1.5655
Epoch 5 Complete | Train Loss: 1.6571, Val Loss: 7.4160
✅ Saved new best model!


In [30]:
from nltk.translate.bleu_score import sentence_bleu

def translate_sentence(sentence):
    model.eval()
    src_ids = [src_tokenizer.token_to_id("<SOS>")] + src_tokenizer.encode(sentence).ids + [src_tokenizer.token_to_id("<EOS>")]
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)

    encoder_outputs, hidden = model.encoder(src_tensor)
    input = torch.tensor([trg_tokenizer.token_to_id("<SOS>")]).to(device)

    result = []
    for _ in range(50):
        output, hidden = model.decoder(input, hidden, encoder_outputs)
        pred_token = output.argmax(1).item()
        if pred_token == trg_tokenizer.token_to_id("<EOS>"):
            break
        result.append(pred_token)
        input = torch.tensor([pred_token]).to(device)

    return trg_tokenizer.decode(result)



In [34]:
print(translate_sentence("how are you"))


तुम हैं तुम हैं
