In [6]:
pip install nltk sentence-transformers scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import re
from collections import defaultdict, Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [8]:
# Load the two CSV splits of the synthetic dataset from the Kaggle input directory.
# NOTE: the frame named `test_df` is read from valid.csv, not from a separate test file.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/hinge-english-to-hinglish-machine-translation/synthetic-dataset/train.csv",
        "/kaggle/input/hinge-english-to-hinglish-machine-translation/synthetic-dataset/valid.csv",
    )
)

# Helper: Basic tokenizer
def tokenize(text):
    """Lowercase ``text``, delete non-alphanumeric characters, split on whitespace.

    Args:
        text: The sentence to tokenize. ``None`` and float NaN are treated as
            an empty sentence; any other non-string value is converted with
            ``str()`` first.

    Returns:
        list[str]: The lowercase tokens, in order. Empty for blank or missing
        input.
    """
    # A missing value would otherwise crash on `.lower()`. `text != text` is
    # True only for NaN, which is presumably what an empty CSV cell loads as
    # -- verify against the dataset.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    # Characters outside [a-zA-Z0-9\s] are deleted, not replaced by a space,
    # so hyphenated words are fused ("chhote-chhote" -> "chhotechhote").
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip().split()

# Generate n-grams from token list
def get_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens as a space-joined string.

    Args:
        tokens: Sequence of string tokens.
        n: Window size.

    Returns:
        list[str]: One string per window position, left to right. Empty when
        ``tokens`` holds fewer than ``n`` items.
    """
    last_start = len(tokens) - n  # highest index at which a full window still fits
    windows = []
    start = 0
    while start <= last_start:
        windows.append(" ".join(tokens[start:start + n]))
        start += 1
    return windows

def merge_repeated_ngrams(ngrams, n):
    """Stitch a sequence of n-gram strings back into one sentence.

    The words of the first n-gram are taken as-is. For each later n-gram, if
    its first ``n - 1`` words equal the last ``n - 1`` words of the n-gram
    immediately before it, only its final word is appended; otherwise all of
    its words are appended.

    Args:
        ngrams: List of space-separated n-gram strings.
        n: The n-gram order used to decide how many words must overlap.

    Returns:
        str: The merged words joined by single spaces, or ``""`` when
        ``ngrams`` is empty.
    """
    if not ngrams:
        return ""

    word_lists = [gram.split() for gram in ngrams]
    words = list(word_lists[0])

    # Walk over adjacent (previous, current) pairs of n-grams.
    for before, after in zip(word_lists, word_lists[1:]):
        # The comparison is against the previous n-gram, not the merged output.
        overlaps = len(after) >= n and before[-(n-1):] == after[:(n-1)]
        if overlaps:
            words.append(after[-1])  # only the new trailing word
        else:
            words += after

    return " ".join(words)

In [9]:
# Build n-gram alignment model
class NGramAligner:
    """Position-based n-gram lookup translator.

    Training pairs the i-th source n-gram of each sentence with the i-th
    target n-gram of its translation and counts how often each pairing
    occurs. Translation replaces every source n-gram with its most frequent
    counterpart and merges the results into one string.

    Attributes:
        n: The n-gram order.
        translation_table: ``source n-gram -> Counter(target n-gram)``,
            accumulated over every ``train`` call.
        translation_dict: ``source n-gram -> most frequent target n-gram``.
            Empty until ``train`` has been called.
    """

    def __init__(self, n):
        """Create an untrained aligner.

        Args:
            n: The n-gram order; must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        self.translation_table = defaultdict(Counter)
        # Created here (not only in train) so translate() on an untrained
        # aligner echoes its input instead of raising AttributeError.
        self.translation_dict = {}

    def train(self, source_sentences, target_sentences):
        """Count positional n-gram pairings and rebuild the lookup dictionary.

        Args:
            source_sentences: Iterable of source-language sentences.
            target_sentences: Iterable of the corresponding target-language
                sentences, in the same order.
        """
        for src_sent, tgt_sent in zip(source_sentences, target_sentences):
            src_ngrams = get_ngrams(tokenize(src_sent), self.n)
            tgt_ngrams = get_ngrams(tokenize(tgt_sent), self.n)

            # zip stops at the shorter list, so surplus n-grams on the longer
            # side are ignored.
            for src_ng, tgt_ng in zip(src_ngrams, tgt_ngrams):
                self.translation_table[src_ng][tgt_ng] += 1

        # Build final dictionary: for each source n-gram, keep the most common target n-gram
        self.translation_dict = {
            src_ng: tgt_counter.most_common(1)[0][0]
            for src_ng, tgt_counter in self.translation_table.items()
        }

    def translate(self, sentence):
        """Translate ``sentence`` n-gram by n-gram.

        Args:
            sentence: The source sentence.

        Returns:
            str: The merged translation. N-grams without a known translation
            are kept unchanged. A sentence with fewer than ``n`` tokens has
            no n-grams to look up and is returned as its space-joined tokens.
        """
        tokens = tokenize(sentence)
        ngrams = get_ngrams(tokens, self.n)

        if not ngrams:
            # Previously this case produced "", silently dropping short inputs.
            return " ".join(tokens)

        # Unknown n-grams fall back to the source text itself.
        translated = [self.translation_dict.get(ng, ng) for ng in ngrams]
        return merge_repeated_ngrams(translated, self.n)


In [10]:
# Train the model
aligner = NGramAligner(n=3)
aligner.train(train_df["English"], train_df["Hinglish"])

# Test it on some sentences.
# A fixed random_state keeps the same five examples on every Restart & Run All.
sample_sentences = test_df["English"].sample(5, random_state=42).tolist()

print("Translation Examples:")
for sent in sample_sentences:
    print(f"EN: {sent}")
    print(f"HI: {aligner.translate(sent)}\n")

# NOTE: the line printed under this label is the translation of the sentence,
# not the input sentence itself.
print("Input sentence:")
print(aligner.translate("He is a good person"))

Translation Examples:
EN: He raised the heavens and set up everything in balance,

HI: usne heavens ko everything aur jo kuchh everything kiya aur balance sthapit kiya everything in balance

EN: and encourage them to do the same.

HI: and encourage them to do jan jati hai do the same

EN: And you probably all have it, and if you haven't, you need to.

HI: aur shayad aap sabhi men bhi vah hota hai aur agar vah iske lie aapko

EN: In their case, it is included in the capitation fee upto a distance of 5 km. between the Clinic of IMP and IP 's residence.

HI: unke mamle men ise bima chikitsa vyavsayi ek bar sammilit vyavsayi ke clinic aur ip case ke residence ke bich 5 ki km ki distance tak prti case fee

EN: From here the trekkers enter the deep valley and have to cross many small rivulets and huge rocks.

HI: cross many small gahri enter the deep karte hain aur unhen marg men kee chhotechhote nale aur barebare

Input sentence:
he is a treatment ka good a good person
