In [8]:
!pip install torch



In [9]:
!pip install torchtext



In [10]:
# -------------------------------------------
# Hindi → English Translation using Seq2Seq
# With PyTorch + NLTK (No HuggingFace)
# -------------------------------------------

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
nltk.download("punkt")

# ----------------------------
# 1. Load Dataset
# ----------------------------
with open("hindi.txt", "r", encoding="utf-8") as f:
    hindi_sentences = f.read().splitlines()

with open("english.txt", "r", encoding="utf-8") as f:
    english_sentences = f.read().splitlines()

# ----------------------------
# 2. Preprocessing
# ----------------------------
from nltk.tokenize import word_tokenize

def tokenize(sentences, lang="english"):
    return [word_tokenize(s.lower()) for s in sentences]

hindi_tokens = tokenize(hindi_sentences, lang="hindi")
english_tokens = tokenize(english_sentences, lang="english")

def build_vocab(tokenized_sentences):
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    idx = 4
    for sent in tokenized_sentences:
        for word in sent:
            if word not in vocab:
                vocab[word] = idx
                idx += 1
    return vocab

src_vocab = build_vocab(hindi_tokens)
tgt_vocab = build_vocab(english_tokens)

inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}

def encode(sentences, vocab):
    return [[vocab.get(word, vocab["<unk>"]) for word in sent] + [vocab["<eos>"]] for sent in sentences]

src_data = encode(hindi_tokens, src_vocab)
tgt_data = encode(english_tokens, tgt_vocab)

# ----------------------------
# 3. DataLoader with Padding
# ----------------------------
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src, tgt in batch:
        src_batch.append(torch.tensor(src, dtype=torch.long))
        tgt_batch.append(torch.tensor(tgt, dtype=torch.long))
    src_batch = pad_sequence(src_batch, padding_value=src_vocab["<pad>"])
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_vocab["<pad>"])
    return src_batch, tgt_batch

pairs = list(zip(src_data, tgt_data))
train_loader = DataLoader(pairs, batch_size=32, shuffle=True, collate_fn=collate_fn)

# ----------------------------
# 4. Seq2Seq Model
# ----------------------------
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.fc = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden):
        if input.dim() == 0:
            input = input.unsqueeze(0)
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = tgt.shape[1]
        max_len = tgt.shape[0]
        tgt_vocab_size = len(tgt_vocab)

        outputs = torch.zeros(max_len, batch_size, tgt_vocab_size).to(self.device)
        hidden = self.encoder(src.transpose(0,1))

        input = torch.tensor([tgt_vocab["<sos>"]] * batch_size).to(self.device)

        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[t] if teacher_force else top1
        return outputs

# ----------------------------
# 5. Training
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(tgt_vocab)
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64
HID_DIM = 128

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<pad>"])

N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.4f}")

# ----------------------------
# 6. Simple Translation
# ----------------------------
def translate(sentence):
    model.eval()
    tokens = word_tokenize(sentence.lower())
    indices = [src_vocab.get(word, src_vocab["<unk>"]) for word in tokens] + [src_vocab["<eos>"]]
    src_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(1).to(device)

    hidden = model.encoder(src_tensor.transpose(0,1))
    input = torch.tensor([tgt_vocab["<sos>"]]).to(device)

    translated = []
    for _ in range(20):
        output, hidden = model.decoder(input, hidden)
        top1 = output.argmax(1).item()
        if top1 == tgt_vocab["<eos>"]:
            break
        translated.append(inv_tgt_vocab[top1])
        input = torch.tensor([top1]).to(device)
    return " ".join(translated)

print("\nSample Translation:")
print("Hindi: मैं घर जा रहा हूँ")
print("English:", translate("मैं घर जा रहा हूँ"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1 Loss: 2.9423
Epoch 2 Loss: 2.8638
Epoch 3 Loss: 2.8606
Epoch 4 Loss: 2.7130
Epoch 5 Loss: 2.6898

Sample Translation:
Hindi: मैं घर जा रहा हूँ
English: you rahul
