In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [2]:
import pandas as pd

# Read the parallel corpus and discard any row that lacks either side of the pair.
df = pd.read_csv(
    "/kaggle/input/english-to-hindi-parallel-dataset/newdata.csv"  # Replace with actual path
).dropna(subset=['english_sentence', 'hindi_sentence'])

# Add string-typed working copies of both text columns.
df = df.assign(
    eng=df['english_sentence'].astype(str),
    hindi=df['hindi_sentence'].astype(str),
)

# Plain Python lists of sentences, index-aligned between the two languages.
eng_sentences = df['eng'].to_list()
hin_sentences = df['hindi'].to_list()

In [3]:
def tokenize(sentence):
    """Lower-case ``sentence``, trim it, and split it on runs of whitespace.

    Returns a list of token strings (empty for a blank sentence).
    """
    normalized = sentence.lower().strip()
    return normalized.split()
from collections import Counter

def build_vocab(sentences, min_freq=1):
    """Build a token -> integer-id mapping from an iterable of sentences.

    Args:
        sentences: iterable of strings; each one is split with ``tokenize``.
        min_freq: minimum number of occurrences a token needs to get its own id.

    Returns:
        dict mapping token to id. Ids 0-3 are reserved for ``<pad>``, ``<sos>``,
        ``<eos>`` and ``<unk>``; remaining tokens receive consecutive ids in
        first-seen order.
    """
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenize(sentence))

    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    for word, freq in counter.items():
        # Skip corpus tokens that are spelled like a reserved marker: assigning
        # to an existing key would move the marker to a new id without growing
        # the dict, so the next new word would be handed that same id.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab

# One vocabulary per language; min_freq=1 keeps every token seen in the corpus.
eng_vocab = build_vocab(eng_sentences, min_freq=1)
hin_vocab = build_vocab(hin_sentences, min_freq=1)


In [4]:
class TranslationData(Dataset):
    """Parallel-sentence dataset that yields (source_ids, target_ids) tensors.

    Sentences are stored as raw strings and converted to id sequences on
    access; every sequence is wrapped in ``<sos>`` ... ``<eos>``.
    """

    def __init__(self,src,tgt,src_vocab,tgt_vocab):
        # Only references are kept; encoding happens lazily in __getitem__.
        self.src, self.tgt = src, tgt
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab

    def __len__(self):
        """Number of sentence pairs, taken from the source side."""
        return len(self.src)

    def __getitem__(self,indx):
        """Return the ``indx``-th pair as a tuple of two 1-D id tensors."""
        src_ids = self.encode(self.src[indx], self.src_vocab)
        tgt_ids = self.encode(self.tgt[indx], self.tgt_vocab)
        return tuple(map(torch.tensor, (src_ids, tgt_ids)))

    def encode(self,sentence,vocab):
        """Map ``sentence`` to ids: ``<sos>``, one id per token, ``<eos>``.

        Tokens missing from ``vocab`` are replaced by the ``<unk>`` id.
        """
        ids = [vocab['<sos>']]
        ids.extend(vocab.get(tok, vocab['<unk>']) for tok in sentence.lower().split())
        ids.append(vocab['<eos>'])
        return ids


In [5]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate a list of (src_ids, tgt_ids) tensor pairs into padded batches.

    Returns ``(src_padded, tgt_padded, src_lengths, tgt_lengths)``. The padded
    tensors are batch-first and filled with the ``<pad>`` id of the matching
    module-level vocabulary; the length tensors hold each sequence's
    unpadded length.
    """
    sources, targets = zip(*batch)

    # Pad every sequence up to the longest one on its side of the batch.
    src_padded = pad_sequence(sources, batch_first=True, padding_value=eng_vocab['<pad>'])
    tgt_padded = pad_sequence(targets, batch_first=True, padding_value=hin_vocab['<pad>'])

    # Original (unpadded) lengths, one entry per example.
    src_lengths = torch.tensor(list(map(len, sources)))
    tgt_lengths = torch.tensor(list(map(len, targets)))

    return src_padded, tgt_padded, src_lengths, tgt_lengths

In [6]:
from torch.utils.data import DataLoader

# Pair up the English/Hindi sentences with their vocabularies.
train_dataset = TranslationData(
    src=eng_sentences,
    tgt=hin_sentences,
    src_vocab=eng_vocab,
    tgt_vocab=hin_vocab,
)

# Shuffled mini-batches of 8 pairs, padded per batch by collate_fn.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)

In [7]:
# Sanity check: pull a single batch and print the shapes produced by collate_fn.
# NOTE: despite their names, `src_mask` / `tgt_mask` receive the per-example
# length tensors that collate_fn returns (one entry per sequence), not masks.
for src, tgt, src_mask, tgt_mask in train_loader:
    print("Source shape:", src.shape)
    print("Target shape:", tgt.shape)
    print("Source mask shape:", src_mask.shape)
    print("Target mask shape:", tgt_mask.shape)
    break  # one batch is enough for a preview

Source shape: torch.Size([8, 39])
Target shape: torch.Size([8, 38])
Source mask shape: torch.Size([8])
Target mask shape: torch.Size([8])


In [8]:
# Preview one more batch, discarding the two length tensors via `*_`.
# The loop variables `src` and `tgt` stay bound at module level after the
# break, so later cells that read them see this batch.
for src, tgt, *_ in train_loader:
    print("Input batch shape:", src.shape)
    break

Input batch shape: torch.Size([8, 29])


In [13]:
class Encoder(nn.Module):
    """Embed source token ids and encode them with a projected multi-layer LSTM.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        hidden_dim: embedding width, also used as the LSTM ``proj_size``.
        num_layers: number of stacked LSTM layers.

    The LSTM keeps a 1024-wide internal cell state and projects its hidden
    state down to ``hidden_dim``.
    """
    def __init__(self,input_dim,hidden_dim,num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(input_dim,hidden_dim)
        self.lstm = nn.LSTM(hidden_dim,1024,num_layers,bias=True,proj_size=hidden_dim,batch_first=True)
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # Same attribute name as Decoder uses.
        self.num_layers = num_layers
        # Original (misspelled) name kept as an alias so existing readers still work.
        self.nums_layers = num_layers
    def forward(self,x):
        """Encode a batch-first tensor of token ids.

        Returns ``(output, hidden, cell)``: the per-step LSTM outputs plus the
        final hidden and cell states of every layer.
        """
        embedding = self.embedding(x)
        output, (hidden,cell) = self.lstm(embedding)
        return output,hidden,cell

In [10]:
# `src` / `tgt` are batch-first tensors from the preview loop, so dimension 1
# is the padded sequence length; the batch size would be shape[0].
print("src seq len:", src.shape[1])
print("tgt seq len:", tgt.shape[1])


src batch size: 29
tgt batch size: 32


In [14]:
class Decoder(nn.Module):
    """Projected multi-layer LSTM decoder with a linear vocabulary head.

    Args:
        output_dim: size of the target vocabulary.
        hidden_dim: embedding width and LSTM ``proj_size``.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,output_dim, hidden_dim, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Submodules are created in the order embedding -> lstm -> fc.
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=hidden_dim)
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=1024,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            proj_size=hidden_dim,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self,x,hidden,cell):
        """Run the decoder over ``x`` starting from the state ``(hidden, cell)``.

        Returns ``(predictions, hidden, cell)`` where ``predictions`` holds one
        score per target-vocabulary entry at every step.
        """
        lstm_out, state = self.lstm(self.embedding(x), (hidden, cell))
        hidden, cell = state
        return self.fc(lstm_out), hidden, cell

In [None]:
import torch.optim as optim
INPUT_DIM = len(eng_vocab)    # source (English) vocabulary size
OUTPUT_DIM = len(hin_vocab)   # target (Hindi) vocabulary size
HIDDEN_DIM = 512              # embedding width / LSTM projection size
EPOCHS = 5
# Fall back to the CPU when no GPU is visible so the notebook still runs end to end.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Upper-case alias so code that spells the name as a constant resolves to the same device.
DEVICE = device

class seq2seq(nn.Module):
    """Encoder-decoder wrapper: encode ``src``, then decode ``tgt`` from that state."""

    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self,src,tgt):
        """Return the decoder's predictions for ``tgt`` conditioned on ``src``.

        Only the encoder's final hidden/cell states are handed to the decoder;
        the per-step encoder outputs are not used.
        """
        _enc_out, enc_hidden, enc_cell = self.encoder(src)
        logits, _dec_hidden, _dec_cell = self.decoder(tgt, enc_hidden, enc_cell)
        return logits

# Build the model, loss and optimizer.
encoder = Encoder(INPUT_DIM, HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, HIDDEN_DIM)
model = seq2seq(encoder,decoder).to(device)
# Padded target positions must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=hin_vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=1e-4)
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0

    for src, tgt, _, _ in train_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        # Teacher forcing. Every target row already has the form
        # <sos> w1 ... wn <eos> (added by TranslationData.encode), so the
        # decoder input is the row without its last position and the expected
        # output is the row without its first. Prepending a second <sos> would
        # train the decoder to answer <sos> with <sos>.
        tgt_input = tgt[:, :-1]
        tgt_expected = tgt[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_input)  # output shape: [batch_size, seq_len - 1, output_dim]

        # Flatten output and target for loss; reshape (not view) because the
        # sliced target is no longer contiguous.
        output = output.reshape(-1, OUTPUT_DIM)
        tgt = tgt_expected.reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean per-batch loss so the number does not scale with the
    # number of batches in the loader.
    mean_loss = epoch_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {mean_loss:.4f}")

In [None]:

def evaluate(model, src, max_len=50, sos_idx=None):
    """Greedy-decode a batch of source sequences.

    Args:
        model: module exposing ``encoder(src)`` -> (outputs, hidden, cell) and
            ``decoder(tokens, hidden, cell)`` -> (scores, hidden, cell).
        src: batch-first tensor of source token ids.
        max_len: number of decoding steps; exactly this many tokens are
            produced per sequence (generation does not stop at ``<eos>``).
        sos_idx: id fed to the decoder at the first step. Defaults to
            ``hin_vocab['<sos>']``.

    Returns:
        Long tensor of shape ``[batch_size, max_len]`` with the predicted ids.

    Side effect: switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    with torch.no_grad():
        # Decode on whatever device the model lives on rather than relying on
        # a module-level device constant; fall back to the input's device for
        # a parameter-free model.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else src.device
        src = src.to(device)

        if sos_idx is None:
            sos_idx = hin_vocab['<sos>']

        # Encode input; only the final states seed the decoder.
        _enc_outputs, hidden, cell = model.encoder(src)

        batch_size = src.size(0)
        outputs = []

        # Initialize decoder input with <sos>
        input_tok = torch.full((batch_size, 1), sos_idx, dtype=torch.long, device=device)

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_tok, hidden, cell)
            pred = output[:, -1, :].argmax(-1, keepdim=True)  # get last predicted token
            outputs.append(pred)
            input_tok = pred  # use previous prediction as next input

        if not outputs:
            # max_len <= 0: torch.cat rejects an empty list, so return an
            # explicitly empty [batch_size, 0] result instead.
            return torch.empty((batch_size, 0), dtype=torch.long, device=device)

        # Concatenate predictions: shape [batch_size, seq_len]
        outputs = torch.cat(outputs, dim=1)
    return outputs

def decode_sequence(token_ids, vocab):
    """Turn a sequence of token ids back into a space-joined string.

    Args:
        token_ids: iterable of ids (0-d tensors or plain ints), e.g. one row
            of the tensor returned by ``evaluate``.
        vocab: token -> id mapping containing an ``'<eos>'`` entry.

    Returns:
        The tokens up to, but not including, the first ``<eos>``. Ids missing
        from ``vocab`` are rendered as ``'<unk>'``.
    """
    inv_vocab = {v: k for k, v in vocab.items()}
    eos_idx = vocab['<eos>']
    words = []
    for tok in token_ids:
        idx = int(tok)  # works for both 0-d tensors and Python ints
        if idx == eos_idx:
            # Anything generated after the end marker is not part of the
            # sentence, so stop here instead of merely dropping the marker.
            break
        words.append(inv_vocab.get(idx, '<unk>'))
    return ' '.join(words)
# Explicit iterator over the training loader so batches can be pulled one at a time with next().
batch_iterator = iter(train_loader)