## Setting up the imports for the project

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import pandas as pd
import re
import numpy as np
import unicodedata
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [36]:
# Reading the data, and printing the head
data = pd.read_csv("hindi_english_parallel.csv")
data.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1561841 entries, 0 to 1561840
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   hindi    1555784 non-null  object
 1   english  1560964 non-null  object
dtypes: object(2)
memory usage: 23.8+ MB


# List of preprocessing tasks
 - Lowercase the english letters
 - Check and remove null rows
 - CHeck if hindi is in english data
 - Use Regex to remove characters like @, #, $, %.
 - convert all the hindi numerals to integers
 - Unicode Normalization
 - 

In [38]:
# convert all the characters of english to lowercase
data["english"] = data["english"].apply(lambda x: str(x).lower())

In [39]:
# Check if any null values
data.isna().sum()   # no row is empty

hindi      6057
english       0
dtype: int64

In [40]:
# Dropping the NA values
data.dropna(inplace=True)
data.isna().sum()

hindi      0
english    0
dtype: int64

In [41]:
data["english"]   # this shows that there are hindi characters in english column, we need to drop those rows

0             give your application an accessibility workout
1                          accerciser accessibility explorer
2             the default plugin layout for the bottom panel
3                the default plugin layout for the top panel
4             a list of plugins that are disabled by default
                                 ...                        
1561835                  शपथ लेने/प्रतिज्ञान करने वाले सदस्य
1561836    स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...
1561837    है। i note that this is a landmark meeting – n...
1561838    है। in the presentations that they made before...
1561839    ्त है। issues such as food and water security;...
Name: english, Length: 1555784, dtype: object

In [42]:
# regex checking of characters [\u0900-\u097F]+ will tell us all the rows in english column that have hindi character
# return np.nan for rows that contain hindi, and drop them
# also .apply doesn't do permanent changes, good for looking

def preprocess_english(line):
     # if it contains letters of hindi
    if re.search("[\u0900-\u097F]+", line):  
        return np.nan
    else:
        line = re.sub("[^\w\s\.\?!\।]", "", line)   # remove unnecessary puncuations
        line = unicodedata.normalize("NFKC", line)   # normalize unicode
        line = re.sub(r"\s+", " ", line).strip()   # remove extra whitespaces
        line = f"{line} </s> <2en>"
        return line
        
data["english"] = data["english"].apply(preprocess_english)
data.dropna(inplace=True)

In [9]:
data   # as we can see the hindi in english column has been removed

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,give your application an accessibility workout...
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,accerciser accessibility explorer </s> <2en>
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,the default plugin layout for the bottom panel...
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,the default plugin layout for the top panel </...
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,a list of plugins that are disabled by default...
...,...,...
1561828,जोनलः रू. 0.50 लाख के प्रत्येक वार्षिक पुरस्का...,zonal eleven annual awards of 0.50 lakh each e...
1561829,इस्लाम से पहले करोड़ों लोगों के इस पारसी धर्म ...,zoroastrianism which was once the religion of ...
1561830,"(य) ""प्रतिभूतिकरण"" से किसी प्रतिभूतिकरण कंपनी ...",z securitisation means acquisition of financia...
1561831,राष्ट्रपति जी ने कहा कि जुबिन मेहता संगीत को श...,zubin mehta has engaged in untiring efforts to...


In [10]:
def preprocess_hindi(line):
    
    # Unicode Normalisation
    line = unicodedata.normalize("NFKC", line)

    # removing the unnecessary punctuations
    # This pattern says: 
    # "Find everything that is NOT a space, NOT a word character, 
    # AND NOT a character in the Devanagari Unicode range (U+0900 to U+097F)"
    line = re.sub(r'[^\s\w\u0900-\u097F\.\?!\।]', '', line)
    
    # add space before ending puncuations to make them seperate tokens
    # Space out ?, !, and Hindi Purna Viram
    line = re.sub(r"([\?!\।])", r" \1 ", line)
    # Space out dots only if they aren't decimals
    line = re.sub(r"(?<!\d)\.(?!\d)", r" . ", line)
    
    # remove extra whitespaces
    line = re.sub(r"\s+", " ", line).strip()
    
    # Start and end block for decoder with space (for seperate tokenisation) according to Indic bart
    line = f"<2hi> {line} </s>"

    return line
    
data["hindi"] = data["hindi"].apply(preprocess_hindi)
data["hindi"]

0          <2hi> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का ...
1                 <2hi> एक्सेर्साइसर पहुंचनीयता अन्वेषक </s>
2            <2hi> निचले पटल के लिए डिफोल्ट प्लगइन खाका </s>
3             <2hi> ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका </s>
4          <2hi> उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप ...
                                 ...                        
1561828    <2hi> जोनलः रू . 0.50 लाख के प्रत्येक वार्षिक ...
1561829    <2hi> इस्लाम से पहले करोड़ों लोगों के इस पारसी...
1561830    <2hi> य प्रतिभूतिकरण से किसी प्रतिभूतिकरण कंपन...
1561831    <2hi> राष्ट्रपति जी ने कहा कि जुबिन मेहता संगी...
1561832    <2hi> जुबिन मेहता के संगीत में सीमाओं के आरपार...
Name: hindi, Length: 1553212, dtype: object

In [30]:
data

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,accerciser accessibility explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,the default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,the default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,a list of plugins that are disabled by default
...,...,...
1561835,Members making oath/affirmation,शपथ लेने/प्रतिज्ञान करने वाले सदस्य
1561836,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...
1561837,मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...,है। i note that this is a landmark meeting – n...
1561838,उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...,है। in the presentations that they made before...


In [32]:
type(data.iloc[0].hindi)

str

## Checking the lengths of tokens

In [25]:
"""
data['en_token_len'] = data['english'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=True)))
en_lengths = data['en_token_len'].tolist()
max_len_95 = np.percentile(en_lengths, 95)
print(f"95% of sentences are shorter than: {max_len_95}")

Output: 95% of sentences are shorter than: 60.0
"""

"""
data['hi_token_len'] = data['hindi'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=True)))
hi_lengths = data['hi_token_len'].tolist()
max_len_95 = np.percentile(hi_lengths, 95)
print(f"95% of sentences are shorter than: {max_len_95}")

Output: 95% of sentences are shorter than: 84.0
"""

'\ndata[\'hi_token_len\'] = data[\'hindi\'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=True)))\nhi_lengths = data[\'hi_token_len\'].tolist()\nmax_len_95 = np.percentile(hi_lengths, 95)\nprint(f"95% of sentences are shorter than: {max_len_95}")\n\nOutput: 95% of sentences are shorter than: 84.0\n'

## Lets make our custom dataset

In [33]:
class custom_loader(Dataset):

    # constructor function
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    # get each item, it takes an index as per the length of the dataset
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        english_sentence, hindi_sentence = row.english, row.hindi
        
        source = self.tokenizer(english_sentence, add_special_tokens=False, return_tensors="pt", padding="max_length", max_length=self.max_len, truncation=True)
        target = self.tokenizer(hindi_sentence, add_special_tokens=False, return_tensors="pt", padding="max_length", max_length=self.max_len, truncation=True)
        
        return {
            "input_ids": source_ids.squeeze(),         # English IDs (Encoder)
            "attention_mask": source_mask.squeeze(),   # Tells model to ignore padding
            "labels": target_ids.squeeze(),            # Hindi IDs (The Ground Truth)
        }

    # gives the length of dataset, given by the number of rows in our dataframe
    def __len__(self):
        return len(df)
        

In [43]:
import torch

def test_translation(english_text, model, tokenizer, device):
    # 1. Format the input strictly for IndicBART
    input_text = f"{english_text} </s> <2en>"
    
    # 2. Tokenize and move to device
    inputs = tokenizer(input_text, add_special_tokens=False, return_tensors="pt").to(device)
    
    # 3. Get the ID for the Hindi tag to tell the decoder where to start
    # Using the method from the sample code
    hindi_tag_id = tokenizer._convert_token_to_id_with_added_voc("<2hi>")
    
    # 4. Generate!
    model.eval() # Important: Turn off dropout
    with torch.no_grad():
        output_ids = model.generate(
            inputs.input_ids,
            decoder_start_token_id=hindi_tag_id,
            max_length=50,
            num_beams=5,           # Search for the best 5 word combinations
            early_stopping=True
        )
    
    # 5. Decode back to Hindi text
    translation = tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    
    print(f"English: {english_text}")
    print(f"Hindi  : {translation}")

# --- RUN THE TEST ---
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBARTSS")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_sentence = "This is a machine learning project."
test_translation(test_sentence, model, tokenizer, device)

Loading weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 267/267 [00:00<00:00, 381.49it/s, Materializing param=model.shared.weight]


English: This is a machine learning project.
Hindi  : <2hi> this is a machine learning project.</s>
