### Encoder Decoder

Reference: https://www.kaggle.com/code/aiswaryaramachandran/english-to-hindi-neural-machine-translation/notebook

In [64]:
import numpy as np 
import pandas as pd
import re
import string
from string import digits, punctuation

In [65]:
data = pd.read_csv(r"Data/Hindi_English_Truncated_Corpus.csv")
print(data.shape)

(127607, 3)


In [66]:
data['source'].value_counts()

source
tides        50000
ted          39881
indic2012    37726
Name: count, dtype: int64

In [67]:
ted_data = data[data["source"] == "ted"]
print(ted_data.shape)

(39881, 3)


In [68]:
ted_data.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है
23,ted,This changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"And you can see, this LED is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,to turn on the lights or to bring him a glass ...,"लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,Can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [69]:
#Check for null values
ted_data.isnull().sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [70]:
#Check for duplicate values
ted_data.duplicated().sum()

1078

In [71]:
#drop duplicates
ted_data.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ted_data.drop_duplicates(inplace=True)


In [72]:
ted_sample = ted_data.sample(n=25000, random_state=42)

In [73]:
#Convert english and hindi sentences to lowercase
ted_sample.english_sentence = ted_sample.english_sentence.apply(lambda x: x.lower())

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: x.lower())

In [74]:
#Remove quotes and replace with space
ted_sample.english_sentence = ted_sample.english_sentence.apply(lambda x: re.sub("'","",x))

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: re.sub("'","",x))


In [75]:
#Remove special characters and punctuations
exclude_punc = set(punctuation)

ted_sample.english_sentence = ted_sample.english_sentence.apply(lambda x: ''.join(ch for ch in x if ch not in exclude_punc))

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: ''.join(ch for ch in x if ch not in exclude_punc))

In [76]:
#Remove numbers
remove_digits = str.maketrans('','',digits)

ted_sample.english_sentence = ted_sample.english_sentence.apply(lambda x: x.translate(remove_digits))

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: x.translate(remove_digits))


In [77]:
#Remove hindi digits
ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: re.sub("[२३०८१५७९४६]","",x))


In [78]:
#Removing extra spaces
ted_sample.english_sentence = ted_sample.english_sentence.apply(lambda x: x.strip())

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: x.strip())

ted_sample.english_sentence = ted_sample.english_sentence.apply(lambda x: re.sub(" +"," ",x))

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: re.sub(" +"," ",x))

In [79]:
#Add START and END tokens to the target sequence

ted_sample.hindi_sentence = ted_sample.hindi_sentence.apply(lambda x: "START_ "+x+" _END")

In [80]:
#Get english and hindi vocabulary

def get_all_words(data):
    all_words = set()
    for line in data:
        for word in line.split():
            if word not in all_words:
                all_words.add(word)
    
    return all_words

In [81]:
all_eng_words = get_all_words(ted_sample.english_sentence)
all_hin_words = get_all_words(ted_sample.hindi_sentence)

print(f"English words: {len(all_eng_words)}")
print(f"Hindi words: {len(all_hin_words)}")

English words: 14030
Hindi words: 17540


In [82]:
#Finding length of english and hindi sentences
ted_sample['len_english_sentence'] = ted_sample.english_sentence.apply(lambda x: len(x.split(" ")))

ted_sample['len_hindi_sentence'] = ted_sample.hindi_sentence.apply(lambda x: len(x.split(" ")))

In [84]:
print(f"Max length of input (english) sentence: {max(ted_sample['len_english_sentence'])}")

print(f"Max length of output (hindi) sentence: {max(ted_sample['len_hindi_sentence'])}")

Max length of input (english) sentence: 20
Max length of output (hindi) sentence: 30


In [85]:
#Keeping data with max 20 chars only
ted_sample=ted_sample[ted_sample['len_english_sentence']<=20]

ted_sample=ted_sample[ted_sample['len_hindi_sentence']<=20]

In [87]:
#Storing the vocab and its count for english and hindi sentences

input_words = sorted(list(all_eng_words))

target_words = sorted(list(all_hin_words))

num_encoder_tokens = len(all_eng_words)

num_decoder_tokens = len(all_hin_words)