## English to Hinglish Neural Maching Translator



In [7]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# -------------------------------
# 1. Load and preprocess data
# -------------------------------
data = []
filename = "hinglish_upload_v1.json"

with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        data.append({
            "English": obj["translation"]["en"],
            "Hinglish": obj["translation"]["hi_ng"]
        })

df = pd.DataFrame(data)
df = df.sample(frac=1).reset_index(drop=True)

def preprocess_text(text):
    return str(text).lower().strip()

en_sentences = df["English"].apply(preprocess_text).tolist()
hing_sentences = df["Hinglish"].apply(preprocess_text).tolist()
print("step 1 done")

# -------------------------------
# 2. Tokenization
# -------------------------------
vocab_size = 4000
max_length = 50

from collections import Counter

def build_vocab(sentences, vocab_size):
    words = [w for sent in sentences for w in sent.split()]
    most_common = [w for w, _ in Counter(words).most_common(vocab_size-2)]
    word2idx = {w: i+2 for i, w in enumerate(most_common)}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    idx2word = {i:w for w,i in word2idx.items()}
    return word2idx, idx2word

# Build vocabularies
word2idx_en, idx2word_en = build_vocab(en_sentences, vocab_size)
word2idx_hing, idx2word_hing = build_vocab(['sos ' + s + ' eos' for s in hing_sentences], vocab_size)

def encode_sentence(sent, word2idx, max_length):
    tokens = sent.split()
    ids = [word2idx.get(w, 1) for w in tokens]
    ids = [word2idx_hing.get('sos', 2)] + ids if 'sos' in word2idx_hing else ids
    ids = ids[:max_length]
    ids += [0]*(max_length - len(ids))
    return ids

X_enc = np.array([encode_sentence(s, word2idx_en, max_length) for s in en_sentences])
X_dec = np.array([encode_sentence('sos '+s, word2idx_hing, max_length) for s in hing_sentences])
Y_seq = np.array([encode_sentence(s+' eos', word2idx_hing, max_length) for s in hing_sentences])

# Train / Validation split
X_train_enc = torch.tensor(X_enc[:150_000], dtype=torch.long)
X_valid_enc = torch.tensor(X_enc[150_000:], dtype=torch.long)
X_train_dec = torch.tensor(X_dec[:150_000], dtype=torch.long)
X_valid_dec = torch.tensor(X_dec[150_000:], dtype=torch.long)
Y_train = torch.tensor(Y_seq[:150_000], dtype=torch.long)
Y_valid = torch.tensor(Y_seq[150_000:], dtype=torch.long)

print("step 2 done")
# -------------------------------
# 3. Dataset & Dataloader
# -------------------------------
class TranslationDataset(Dataset):
    def __init__(self, enc, dec, target):
        self.enc = enc
        self.dec = dec
        self.target = target
    def __len__(self):
        return len(self.enc)
    def __getitem__(self, idx):
        return self.enc[idx], self.dec[idx], self.target[idx]

train_dataset = TranslationDataset(X_train_enc, X_train_dec, Y_train)
valid_dataset = TranslationDataset(X_valid_enc, X_valid_dec, Y_valid)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32)
print("step 3 done")
# -------------------------------
# 4. Model Definition
# -------------------------------
class Seq2SeqAttention(nn.Module):
    def __init__(self, vocab_size, embed_size=128, enc_hidden=256, dec_hidden=512, max_length=50):
        super().__init__()
        self.enc_hidden = enc_hidden
        self.dec_hidden = dec_hidden

        # Embeddings
        self.enc_embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.dec_embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.enc_dropout = nn.Dropout(0.1)
        self.dec_dropout = nn.Dropout(0.1)

        # Encoder
        self.encoder = nn.LSTM(embed_size, enc_hidden, batch_first=True, bidirectional=True, dropout=0.1)

        # Decoder
        self.decoder = nn.LSTM(embed_size, dec_hidden, batch_first=True, dropout=0.1)

        # Attention (Additive / Bahdanau)
        self.attn = nn.Linear(enc_hidden*2 + dec_hidden, dec_hidden)
        self.v = nn.Linear(dec_hidden, 1, bias=False)

        # Dense
        self.fc1 = nn.Linear(dec_hidden + enc_hidden*2, 256)
        self.fc2 = nn.Linear(256, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, enc_input, dec_input):
        # Encoder
        enc_emb = self.enc_dropout(self.enc_embed(enc_input))
        enc_outputs, (h, c) = self.encoder(enc_emb)
        h = torch.cat((h[0], h[1]), dim=1).unsqueeze(0)
        c = torch.cat((c[0], c[1]), dim=1).unsqueeze(0)

        # Decoder
        dec_emb = self.dec_dropout(self.dec_embed(dec_input))
        dec_outputs, _ = self.decoder(dec_emb, (h, c))

        # Attention
        B, T, H = dec_outputs.size()
        enc_len = enc_outputs.size(1)
        context_vectors = []
        for t in range(T):
            dec_step = dec_outputs[:, t, :].unsqueeze(1).repeat(1, enc_len, 1)
            energy = torch.tanh(self.attn(torch.cat((dec_step, enc_outputs), dim=2)))
            attn_weight = F.softmax(self.v(energy).squeeze(2), dim=1)
            context = torch.bmm(attn_weight.unsqueeze(1), enc_outputs)
            context_vectors.append(context)
        context_vectors = torch.cat(context_vectors, dim=1)

        combined = torch.cat((dec_outputs, context_vectors), dim=2)
        out = F.relu(self.fc1(combined))
        out = self.dropout(out)
        logits = self.fc2(out)
        return logits
print("step 4 done")
# -------------------------------
# 5. Training Setup
# -------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2SeqAttention(vocab_size).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print("step 5 done")
# -------------------------------
# 6. Training Loop
# -------------------------------
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    step = 1
    for enc_batch, dec_batch, target_batch in train_loader:
        enc_batch, dec_batch, target_batch = enc_batch.to(device), dec_batch.to(device), target_batch.to(device)
        optimizer.zero_grad()
        output = model(enc_batch, dec_batch)  # [B, T, vocab]
        loss = criterion(output.view(-1, vocab_size), target_batch.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        step+=1
        print(step,end="\r")
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_loader):.4f}")
    step = 0
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for enc_batch, dec_batch, target_batch in valid_loader:
            enc_batch, dec_batch, target_batch = enc_batch.to(device), dec_batch.to(device), target_batch.to(device)
            output = model(enc_batch, dec_batch)
            loss = criterion(output.view(-1, vocab_size), target_batch.view(-1))
            val_loss += loss.item()
    print(f"Validation Loss: {val_loss/len(valid_loader):.4f}")
print("step 6 done")

step 1 done
step 2 done
step 3 done
step 4 done
step 5 done




Epoch 1/10 - Loss: 1.8513
Validation Loss: 1.0729
Epoch 2/10 - Loss: 0.9762
Validation Loss: 0.9029
Epoch 3/10 - Loss: 0.8182
Validation Loss: 0.8526
Epoch 4/10 - Loss: 0.7329
Validation Loss: 0.8389
Epoch 5/10 - Loss: 0.6770
Validation Loss: 0.8317
Epoch 6/10 - Loss: 0.6368
Validation Loss: 0.8424
Epoch 7/10 - Loss: 0.6047
Validation Loss: 0.8480
Epoch 8/10 - Loss: 0.5803
Validation Loss: 0.8552
Epoch 9/10 - Loss: 0.5589
Validation Loss: 0.8706
Epoch 10/10 - Loss: 0.5443
Validation Loss: 0.8784
step 6 done


In [68]:
# -------------------------------
# 7. Model Testing and Inference
# -------------------------------

def translate_sentence(model, sentence, word2idx_en, word2idx_hing, idx2word_hing, max_length=50, device='cpu'):
    model.eval()

    # Preprocess input sentence
    sentence = preprocess_text(sentence)

    # Encode input sentence
    enc_input = encode_sentence(sentence, word2idx_en, max_length)
    enc_input = torch.tensor([enc_input], dtype=torch.long).to(device)

    # Initialize decoder input with SOS token
    sos_token = word2idx_hing.get('sos', 2)
    dec_input = [sos_token]

    # Generate translation token by token
    with torch.no_grad():
        for _ in range(max_length):
            # Pad decoder input to max_length
            dec_input_padded = dec_input + [0] * (max_length - len(dec_input))
            dec_input_tensor = torch.tensor([dec_input_padded], dtype=torch.long).to(device)

            output = model(enc_input, dec_input_tensor)
            next_token = output[0, len(dec_input)-1, :].argmax().item()

            if next_token == word2idx_hing.get('eos', 3) or next_token == 0:
                break

            dec_input.append(next_token)

    # Convert tokens back to words (skip SOS token and filter out unwanted tokens)
    translation = []
    for token_id in dec_input[1:]:
        word = idx2word_hing.get(token_id, '<UNK>')
        if word not in ['<PAD>', 'eos', 'sos']:  # Filter out special tokens
            translation.append(word)

    return ' '.join(translation)



def test_model(model, test_sentences, word2idx_en, word2idx_hing, idx2word_hing, device='cpu'):


    for i, sentence in enumerate(test_sentences, 1):
        print(f"\n{i}. English: {sentence}")

        # Greedy decoding
        translation_greedy = translate_sentence(
            model, sentence, word2idx_en, word2idx_hing, idx2word_hing, device=device
        )
        print(f"   Hinglish : {translation_greedy}")



# Test sentences
test_sentences = [
   "i can do anything",
   "i like to play cricket",
   "who are you ",
   "I like to walk"


]

# Run tests
print("Testing the trained model...")
test_model(model, test_sentences, word2idx_en, word2idx_hing, idx2word_hing, device=device)


Testing the trained model...

1. English: i can do anything
   Hinglish : mai kuch bhi kar sakta hu

2. English: i like to play cricket
   Hinglish : mujhe cricket khelne ke like pasand hai

3. English: who are you 
   Hinglish : aap kon ho?

4. English: I like to walk
   Hinglish : mujhe walk karna hai


In [71]:
def interactive_test():
    """
    Interactive testing where user can input sentences
    """
    print("\n" + "=" * 80)
    print("INTERACTIVE TESTING")
    print("=" * 80)
    print("Enter English sentences to translate (type 'quit' to exit):")

    while True:
        sentence = input("\nEnglish: ").strip()
        if sentence.lower() == 'quit':
            break

        if sentence:
            translation = translate_sentence(
                model, sentence, word2idx_en, word2idx_hing, idx2word_hing, device=device
            )
            print(f"Hinglish: {translation}")


interactive_test()



INTERACTIVE TESTING
Enter English sentences to translate (type 'quit' to exit):

English: who are you
Hinglish: aap kon ho?

English:  i want to watch this movie
Hinglish: muje is movie ko watch karna chahte hai

English: i can do anything
Hinglish: mai kuch bhi kar sakta hu

English: i like to walk
Hinglish: mujhe walk karna hai

English: i like to play cricket
Hinglish: mujhe cricket khelne ke like pasand hai

English:  this is magic
Hinglish: ye to <UNK> hai

English:  this is my friend
Hinglish: ye meri friend <UNK> hai

English: this is good 
Hinglish: ye achha hai

English: but not too much
Hinglish: lekin itna bhi nahi

English: but i like it
Hinglish: lekin muje pasand hein

English: not too bad
Hinglish: bura nahin

English: this is best action film
Hinglish: ye best action film hai

English: this is worse action movie
Hinglish: ye kya ye sab film hai

English: quit
