In [88]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
import re
import time
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv


In [89]:
data=pd.read_csv('/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv')

In [90]:
data.shape

(127607, 3)

In [91]:
data=data.drop('source',axis=1)
data

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...,...
127602,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
127603,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
127604,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
127605,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [92]:
unwanted_idx=[]
x=re.compile(r'\d+')
for idx,cols in data.iterrows():
    try:
        if x.match(cols['english_sentence']):
            unwanted_idx.append(idx)
    except:
        print(idx)

37554
59804


In [93]:
data=data.drop(unwanted_idx + [37554,59804],axis=0)

In [94]:
eng_d={}
for text in data.english_sentence:
    l=len(text.split(' '))
    eng_d.setdefault(l,0)
    eng_d[l]+=1

In [95]:
eng_dic={k:v for k,v in sorted(eng_d.items(),key=lambda x: x[1],reverse=True)}

In [96]:
eng_dic[10]

6044

In [97]:
hindi_d={}
for text in data.hindi_sentence:
    l=len(text.split(' '))
    hindi_d.setdefault(l,0)
    hindi_d[l]+=1

In [98]:
hindi_dic={k:v for k,v in sorted(hindi_d.items(),key=lambda x: x[1],reverse=True)}

In [99]:
hindi_dic[10]

6068

In [100]:
unwanted_len_id=[]
for idx,cols in data.iterrows():
    if len(cols.english_sentence.split(' ')) > 10 or len(cols.hindi_sentence.split(' ')) > 10:
        unwanted_len_id.append(idx)

In [101]:
data=data.drop(unwanted_len_id,axis=0)

In [102]:
data.shape

(45596, 2)

In [103]:
train=data.iloc[:25000]
val=data.iloc[25000:]

In [104]:
train.shape,val.shape

((25000, 2), (20596, 2))

In [105]:
train=train.to_numpy()
val=val.to_numpy()

In [106]:
english=train[:,0]
hindi=train[:,1]

In [107]:
def preprocess_and_tokenize(language):
    pattern=r'[!"#$%&()*,+-./:;\[\]<=>?@\\^_`{|}~\t\n\d“”]'
    language=tf.strings.lower(language)
    language=tf.strings.regex_replace(language,pattern,'')
    language=tf.strings.strip(language)
    
    lang=[]
    for text in language:
        lang.append('<sos> '+ text.numpy().decode('utf-8') + ' <eos>')
            
    lang=np.array(lang)
    tokenizer=keras.preprocessing.text.Tokenizer(filters='',split=' ')
    tokenizer.fit_on_texts(lang)
    tensor=tokenizer.texts_to_sequences(lang)
    
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post',value=0)

    return tensor,tokenizer

In [108]:
eng_tokenized,eng_tokenizer=preprocess_and_tokenize(english)

In [109]:
hindi_tokenized,hindi_tokenizer=preprocess_and_tokenize(hindi)

In [110]:
dataset=tf.data.Dataset.from_tensor_slices((tf.Variable(eng_tokenized),tf.Variable(hindi_tokenized))).shuffle(10000).batch(128).prefetch(1)

In [111]:
class Encoder(keras.Model):
    def __init__(self,vocab_size=10000,emb_dim=128,units=256,batch_size=64):
        super(Encoder,self).__init__()
        self.units = units
        self.batch = batch_size
        self.emb_layer = keras.layers.Embedding(vocab_size,emb_dim)
        self.lstm = keras.layers.LSTM(self.units,return_sequences=True,return_state=True)
        
    def call(self,x,states):
        emb=self.emb_layer(x)
        output,hidden,carry=self.lstm(emb,initial_state=states)
        return output,hidden,carry
    
    def init_hidden_state(self):
        return tf.zeros((self.batch,self.units)),tf.zeros((self.batch,self.units))

In [112]:
class Decoder(keras.Model):
    def __init__(self,vocab_size=10000,emb_dim=128,units=256,batch_size=64):
        super(Decoder,self).__init__()
        self.units = units
        self.batch = batch_size
        self.emb_layer = keras.layers.Embedding(vocab_size,emb_dim)
        self.lstm = keras.layers.LSTM(self.units,return_sequences=True,return_state=True)
        self.fc=keras.layers.Dense(vocab_size)
        
    def call(self,x,states):
        emb=self.emb_layer(x)
        output,hidden,carry=self.lstm(emb,initial_state=states)
        output=self.fc(output)
        return output,hidden,carry

In [113]:
optimizer=keras.optimizers.Adam()
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')
accuracy=keras.metrics.SparseCategoricalAccuracy()
def loss_fn(true,pred):
    mask = tf.math.logical_not(tf.math.equal(true, 0))
    loss_=loss(true,pred)
    mask=tf.cast(mask,dtype=loss_.dtype)
    loss_*=mask
    return tf.reduce_mean(loss_)

def update_accuracy(true,pred):
    accuracy.update_state(true,pred)
    
def get_accuracy():
    accuracy_=accuracy.result().numpy()
    return accuracy_

In [114]:
enc_vocab_size=len(eng_tokenizer.index_word) + 1
dec_vocab_size=len(hindi_tokenizer.index_word) + 1

In [115]:
encoder=Encoder(enc_vocab_size,256,512,128)
decoder=Decoder(dec_vocab_size,256,512,128)

In [116]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [129]:
eng_tokenizer.index_word[0]='<pad>'
eng_tokenizer.word_index['<pad>']=0
hindi_tokenizer.index_word[0]='<pad>'
hindi_tokenizer.word_index['<pad>']=0

In [130]:
@tf.function
def train(input,target,enc_hidden):
    loss__=0.0
    with tf.GradientTape() as tape:
        enc_output,enc_h,enc_c=encoder(input,enc_hidden)
        enc_states=[enc_h,enc_c]
        dec_input=tf.expand_dims(target[:,0],1)
        
        for t in range(1,target.shape[1]):
            dec_output,_,_=decoder(dec_input,enc_states)
            loss__+=loss_fn(target[:,t],dec_output)
            #update_accuracy(target[:,t],dec_output)
            dec_input = tf.expand_dims(target[:, t], 1)
        
    batch_loss=loss__/int(target.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients=tape.gradient(loss__,variables)
        
    optimizer.apply_gradients(zip(gradients,variables))

    return batch_loss

In [131]:
epochs=50
steps_per_epoch=25000//128 #batches
for epoch in range(epochs):
    start=time.time()
    enc_hidden=encoder.init_hidden_state()
    total_loss=0.0
    for (batch,(inp,tar)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss=train(inp,tar,enc_hidden)
        total_loss+=batch_loss
    #print('Epoch {}      Accuray {:.2f}   Avg. Loss {:.4f}'.format(epoch + 1,float(accuracy.result()),total_loss/steps_per_epoch),end='  ')
    print('Epoch {}      Avg. Loss {:.4f}'.format(epoch + 1,total_loss/steps_per_epoch),end='  ')
    print('Time Taken: {:.1f} sec'.format(time.time() - start))
    accuracy.reset_states()

Epoch 1      Avg. Loss 3.3055  Time Taken: 30.0 sec
Epoch 2      Avg. Loss 2.9125  Time Taken: 16.4 sec
Epoch 3      Avg. Loss 2.8070  Time Taken: 16.9 sec
Epoch 4      Avg. Loss 2.7082  Time Taken: 15.9 sec
Epoch 5      Avg. Loss 2.6234  Time Taken: 17.9 sec
Epoch 6      Avg. Loss 2.5406  Time Taken: 16.0 sec
Epoch 7      Avg. Loss 2.4434  Time Taken: 16.8 sec
Epoch 8      Avg. Loss 2.3498  Time Taken: 16.5 sec
Epoch 9      Avg. Loss 2.2626  Time Taken: 17.2 sec
Epoch 10      Avg. Loss 2.1860  Time Taken: 15.9 sec
Epoch 11      Avg. Loss 2.1133  Time Taken: 16.6 sec
Epoch 12      Avg. Loss 2.0366  Time Taken: 16.7 sec
Epoch 13      Avg. Loss 1.9648  Time Taken: 17.1 sec
Epoch 14      Avg. Loss 1.8932  Time Taken: 16.4 sec
Epoch 15      Avg. Loss 1.8248  Time Taken: 16.8 sec
Epoch 16      Avg. Loss 1.7535  Time Taken: 16.8 sec
Epoch 17      Avg. Loss 1.6852  Time Taken: 16.7 sec
Epoch 18      Avg. Loss 1.6219  Time Taken: 16.0 sec
Epoch 19      Avg. Loss 1.5596  Time Taken: 17.2 sec
Ep

In [188]:
def predict(input):
    hidden=[tf.zeros((1,512)),tf.zeros((1,512))]
    _,enc_h,enc_c=encoder(input,hidden)
    enc_states=[enc_h,enc_c]
    result=[]
    dec_input = tf.expand_dims(input[:,0], 0)
    for t in range(input.shape[1]):
        dec_output,_,_=decoder(dec_input,enc_states)
        output_id=tf.math.argmax(dec_output[0],-1)
        output_id=output_id[0].numpy()
        if output_id == hindi_tokenizer.word_index['<eos>']:
            return ' '.join(result)
        dec_input = tf.expand_dims([output_id], 0)
        result.append(hindi_tokenizer.index_word[output_id])

In [166]:
def preprocess_input(text):
    pattern=r'[!"#$%&()*,+-./:;\[\]<=>?@\\^_`{|}~\t\n\d“”]'
    text=tf.strings.lower(text)
    text=tf.strings.regex_replace(text,pattern,'')
    text=tf.strings.strip(text)
    return text.numpy().decode("utf-8") 

In [189]:
def get_translation(input_text):
    length=12
    eng_processed=['<sos>',]
    for i in preprocess_input(input_text).split(' '):
        if i != '':
            eng_processed.append(i)
    eng_processed.append('<eos>')
    eng_tokenized=[eng_tokenizer.word_index[i] for i in eng_processed]
    eng_tokenized=tf.keras.preprocessing.sequence.pad_sequences([eng_tokenized],maxlen=length,padding='post',value=0)
    hindi_pred=predict(eng_tokenized)
    print('Pred. Hindi: ',hindi_pred,end='\n\n')

In [192]:
for i in [1,3,10,15,18,19,21,23,24,26,28,33,34,36,38,45 ]:
    print('No. ',i)
    print('English:     ',english[i])
    print('Hindi:       ',hindi[i])
    get_translation(str(english[i]))

No.  1
English:      .The ending portion of these Vedas is called Upanishad.
Hindi:        इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
Pred. Hindi:  इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।

No.  3
English:      Category: Religious Text
Hindi:        श्रेणी:धर्मग्रन्थ
Pred. Hindi:  श्रेणीधर्मग्रन्थ

No.  10
English:      Maine
Hindi:        मेन
Pred. Hindi:  मेन

No.  15
English:      category:information technology
Hindi:        श्रेणी:सूचना प्रौद्योगिकी
Pred. Hindi:  श्रेणीसूचना प्रौद्योगिकी

No.  18
English:      Aryans did not make any statues or temples for deities.
Hindi:        आर्य देवताओं की कोई मूर्ति या मन्दिर नहीं बनाते थे।
Pred. Hindi:  आर्य देवताओं की कोई मूर्ति या मन्दिर नहीं बनाते थे।

No.  19
English:      .Sarojini Naidu with Mahatma Gandhi
Hindi:        महात्मा गांधी के साथ सरोजिनी नायडू
Pred. Hindi:  महात्मा गांधी के साथ सरोजिनी नायडू

No.  21
English:      External links
Hindi:        बाहरी कड़ियाँ
Pred. Hindi:  बाहरी कड़ियाँ

No.  23
English:      This change h

In [193]:
checkpoint.save(file_prefix = checkpoint_prefix)

'./training_checkpoints/ckpt-1'