<a href="https://colab.research.google.com/github/pradeep2c1/Machine-Translation-model/blob/main/NMT_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string
import numpy as np
import pandas as pd
import re

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Cleaning the data

In [None]:
# Load the Hindi-English parallel corpus. The path is relative to the
# working directory the notebook is run from.
CORPUS_PATH = "Hindi_English_Truncated_Corpus.csv"
lines = pd.read_csv(CORPUS_PATH, encoding='utf-8')

In [None]:
# Count how many rows carry each distinct value of the 'source' column.
lines['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [None]:
# Preview the first ten rows of the loaded corpus.
lines.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
5,tides,The then Governor of Kashmir resisted transfer...,कश्मीर के तत्कालीन गवर्नर ने इस हस्तांतरण का व...
6,indic2012,In this lies the circumstances of people befor...,इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
8,indic2012,“”Global Warming“” refer to warming caused in ...,ग्लोबल वॉर्मिंग से आशय हाल ही के दशकों में हुई...
9,tides,You may want your child to go to a school that...,हो सकता है कि आप चाहते हों कि आप का नऋर्नमेनटे...


In [None]:
# Keep only the rows whose 'source' is 'ted'. The explicit .copy() gives
# `lines` its own data, so later in-place edits (dropping columns, assigning
# cleaned text) do not operate on a slice of the full corpus, which pandas
# reports with SettingWithCopyWarning.
lines = lines.loc[lines['source'] == 'ted'].copy()
lines.shape

(39881, 3)

In [None]:
# df = pd.read_csv("data.csv", encoding='utf-8')
# df.tail()

In [None]:
# df.shape

In [None]:
# df.rename({'eng': 'english_sentence', 'hindi': 'hindi_sentence'}, axis = 1, inplace = True)
# lines = df

In [None]:
# 'source' was only needed for filtering. Reassign the result rather than
# using inplace=True, so the drop never mutates a frame that was produced by
# boolean indexing.
lines = lines.drop(columns = ['source'])
# lines = pd.concat([lines, df])
# lines.shape

In [None]:
# Renumber the rows 0..n-1 after filtering. drop=True discards the old index
# directly instead of materialising it as an 'index' column and then deleting
# that column again.
lines = lines.reset_index(drop = True)

In [None]:
# Count missing values in each column.
pd.isnull(lines).sum()

english_sentence    0
hindi_sentence      0
dtype: int64

In [None]:
# Remove pairs that are missing either side (the original check covered only
# the English column), then remove exact duplicate rows. Chaining returns a
# new frame, so nothing is mutated in place on a filtered view.
lines = lines.dropna(subset = ['english_sentence', 'hindi_sentence']).drop_duplicates()
print(lines.shape)
lines.head()

(38803, 2)


Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
3,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
4,So there is some sort of justice,तो वहाँ न्याय है


In [None]:
# Make all english letters lowercase
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.lower())

# Remove the quotes
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

# Remove special characters
sp_char = set(string.punctuation)
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in sp_char))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in sp_char))

lines = lines.sample(frac = 1).reset_index(drop = True)
lines.head()

Unnamed: 0,english_sentence,hindi_sentence
0,because of our tenuous position,हमारी नाज़ुक स्थिति के कारण
1,its limited by our cognitive biases,ये बंधा हुआ है हमारे दिमागी पक्षपात से
2,meaning share whats relevant to the audience,मतलब कि जो श्रोता के प्रांसगिक हैं वो बताईये
3,its that theyre helping us to be more human,बल्कि वे तो हमें और अधिक मानवीय बनने
4,that we need to do together,जो हमें मिल कर करनी चाहिए


# Tokenizing the sentences

In [None]:
def tokenize(sentences):
    """Fit a default-configured ``Tokenizer`` on ``sentences`` and encode them.

    Returns a ``(sequences, tokenizer)`` tuple: the result of
    ``texts_to_sequences`` on the same sentences, and the fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(sentences)
    encoded = fitted.texts_to_sequences(sentences)
    return encoded, fitted

# Data pre-processing

In [None]:
# Pull the cleaned sentence columns out as plain Python lists.
hindi_sentences = list(lines['hindi_sentence'])
english_sentences = list(lines['english_sentence'])

# Each language gets its own tokenizer, fitted on that language only.
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
hin_text_tokenized, hin_text_tokenizer = tokenize(hindi_sentences)

# Longest encoded sentence (in tokens) per language.
longest_hindi = max(map(len, hin_text_tokenized))
longest_english = max(map(len, eng_text_tokenized))
print(f'Maximum length hindi sentence: {longest_hindi}')
print(f'Maximum length english sentence: {longest_english}')

# Vocabulary sizes: one more than the number of entries in word_index, which
# leaves index 0 free for the padding value.
english_vocab = 1 + len(eng_text_tokenizer.word_index)
hindi_vocab = 1 + len(hin_text_tokenizer.word_index)
print(f"Hindi vocabulary is of {hindi_vocab} unique words")
print(f"English vocabulary is of {english_vocab} unique words")

Maximum length hindi sentence: 30
Maximum length english sentence: 21
Hindi vocabulary is of 22773 unique words
English vocabulary is of 17760 unique words


## Padding every sentence to the length of the longest sentence in its language

In [None]:
# The longest tokenized sentence in each language sets the padded width.
max_hindi_len = max(len(seq) for seq in hin_text_tokenized)
max_english_len = max(len(seq) for seq in eng_text_tokenized)

# Pad at the end ("post") with zeros up to the per-language maximum.
hin_pad_sentence = pad_sequences(hin_text_tokenized, maxlen = max_hindi_len, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen = max_english_len, padding = "post")

# Only the targets get a trailing axis of size 1 (one class id per timestep).
# The encoder input stays 2-D, (num_sentences, max_hindi_len), which is the
# rank declared by Input(shape = (max_hindi_len,)) in the model definition.
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

# Making the encoder-decoder architecture

In [None]:
# Layer sizes, named so they can be tuned in one place.
EMBEDDING_DIM = 128
LSTM_UNITS = 64

# Encoder: Hindi token ids -> embeddings -> one fixed-size vector.
# mask_zero=True makes the LSTM skip the zero padding appended by
# pad_sequences(padding="post"); without it the final encoder state is computed
# after every trailing padding step. Index 0 is reserved for padding because
# hindi_vocab is len(word_index) + 1.
input_sequence = Input(shape = (max_hindi_len,))
embedding = Embedding(input_dim = hindi_vocab, output_dim = EMBEDDING_DIM, mask_zero = True)(input_sequence)
encoder = LSTM(LSTM_UNITS, return_sequences = False)(embedding)

# Decoder: repeat the sentence vector once per output position, then emit one
# score per English vocabulary entry at every position.
r_vec = RepeatVector(max_english_len)(encoder)
decoder = LSTM(LSTM_UNITS, return_sequences = True)(r_vec)
logits = TimeDistributed(Dense(english_vocab))(decoder)

# Create the model

In [None]:
# Softmax over the last axis turns the per-position scores into probabilities.
probabilities = Activation('softmax')(logits)

enc_dec_model = Model(inputs = input_sequence, outputs = probabilities)
enc_dec_model.compile(
    optimizer = Adam(1e-3),
    loss = sparse_categorical_crossentropy,
    metrics = ['accuracy'],
)
enc_dec_model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 30)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 30, 128)           2914944   
                                                                 
 lstm_8 (LSTM)               (None, 64)                49408     
                                                                 
 repeat_vector_4 (RepeatVect  (None, 21, 64)           0         
 or)                                                             
                                                                 
 lstm_9 (LSTM)               (None, 21, 64)            33024     
                                                                 
 time_distributed_4 (TimeDis  (None, 21, 17760)        1154400   
 tributed)                                                 

In [None]:
# Train on the padded Hindi inputs against the padded English targets; the
# value returned by fit() is kept in model_results.
model_results = enc_dec_model.fit(
    x = hin_pad_sentence,
    y = eng_pad_sentence,
    batch_size = 64,
    epochs = 5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Check the translation of the model

In [None]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-position scores into a space-separated sentence.

    For each row of ``logits`` the highest-scoring column index is looked up
    in the inverse of ``tokenizer.word_index``; index 0 is rendered as
    ``'<empty>'``.
    """
    vocabulary = {}
    for word, idx in tokenizer.word_index.items():
        vocabulary[idx] = word
    # Assigned last so index 0 always maps to the placeholder.
    vocabulary[0] = '<empty>'

    words = []
    for token_id in np.argmax(logits, axis = 1):
        words.append(vocabulary[token_id])
    return ' '.join(words)

# Spot-check one example from the data the model was fitted on: print the
# reference pair, then the model's decoded output for the same row.
index = 1000
reference_english = english_sentences[index]
reference_hindi = hindi_sentences[index]
print(f"The english sentence is: {reference_english}")
print(f"The hindi sentence is: {reference_hindi}")
print('The predicted sentence is :')
# predict() takes a batch, so slice a batch of one and take its only result.
sample_scores = enc_dec_model.predict(hin_pad_sentence[index:index + 1])[0]
print(logits_to_sentence(sample_scores, eng_text_tokenizer))

The english sentence is: it is easier than you think it really is
The hindi sentence is: जितना आप सोचते हैं यह उससे आसान है वाकई आसान है
The predicted sentence is :
and the the to to to the <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty>


In [117]:
# Inspect the raw predicted token ids (argmax over axis 1) for row 1000.
index = 1000
k = enc_dec_model.predict(hin_pad_sentence[index:index + 1])[0]
x = k.argmax(axis = 1)
x



array([2, 1, 1, 3, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])