In [75]:
import tensorflow as tf
import numpy as np
import pandas as pd


In [76]:
# Load the parallel English-Hindi corpus; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Dataset_English_Hindi.csv")

In [77]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [78]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

Unnamed: 0,English,Hindi
130471,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
130472,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
130473,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
130474,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .
130475,They've just won four government contracts to ...,हाल ही में उन्हें सरकारी ठेका मिला है करीब सौ ...


In [79]:
# Summary statistics for the columns of the raw frame.
df.describe()

Unnamed: 0,English,Hindi
count,130474,130164
unique,126959,100228
top,(Laughter),(हँसी)
freq,555,212


In [80]:
# Count the missing values in each column of the raw frame.
df.isna().sum()

Unnamed: 0,0
English,2
Hindi,312


In [81]:
# Discard every row that has a missing value in any column.
df_cleaned = df.dropna(axis="index", how="any")

In [82]:
# Count the missing values per column in the cleaned frame.
df_cleaned.isna().sum()

Unnamed: 0,0
English,0
Hindi,0


In [83]:
# Shuffle all rows and renumber them from 0. The shuffle is seeded so that the
# positional train/validation split made later in the notebook selects the
# same rows on every "Restart & Run All"; without a seed each run would train
# and validate on a different set of sentences.
df_final = df_cleaned.sample(frac=1, random_state=42).reset_index(drop=True)

In [84]:
# Source-language sentences: the "English" column as a Series.
en_sentence = df_final.loc[:, "English"]

In [85]:
# Display the English sentences.
en_sentence

Unnamed: 0,English
0,"Lavigne branched out from recording music, pur..."
1,became biriani.
2,This pair has two children:daughter shweta and...
3,Lumbini University
4,"which I'm going to define as 30 years, applied..."
...,...
130157,Approximate quantities of water required daily...
130158,In this way constitution is like a living body...
130159,So I would like to say that there are three si...
130160,the residence of aryans is known near northen ...


In [86]:
# Target-language sentences: the "Hindi" column as a Series.
hin_sentence = df_final.loc[:, "Hindi"]

In [87]:
# Display the Hindi sentences.
hin_sentence

Unnamed: 0,Hindi
0,यू ट्यूब विडियो के अवरिल लाविंगने (Avril Lavig...
1,बिरयानी बन गयी |
2,इस दंपती के दो बच्चों: बेटी श्वेता और पुत्र अभ...
3,लुम्वीनी विश्वविद्यालय
4,जिसे मैं उन लॊगॊं के संदर्भ में ३० साल की अवधी...
...,...
130157,चार सप्ताह तक की उम्र के 100 चूजों के लिए प्रत...
130158,इस तरह संविधान एक जीवित शरीर तो है परंतु पूर्ण...
130159,तो मैं कहना चाहूँगा कि पालन करने के लिए तीन सा...
130160,आर्यों का निवास स्थान कैस्पियन सागर के पूर्वी ...


In [88]:
# Print the first three sentence pairs as "English -> Hindi".
for en_text, hin_text in zip(en_sentence.iloc[:3], hin_sentence.iloc[:3]):
    print(en_text, "->", hin_text)

Lavigne branched out from recording music, pursuing careers in feature film acting and designing clothes and perfumes. She voiced a character in the animated film, Over the Hedge, in 2006. That same year, she made her on-screen feature film debut in Fast Food Nation. In 2008, Lavigne introduced her clothing line, Abbey Dawn; and in 2009, she released her first perfume, Black Star, which was followed by her second perfume, Forbidden Rose, in 2010.In July 2006, Lavigne married her boyfriend of two years, Deryck Whibley, lead singer and guitarist for Sum 41. -> यू ट्यूब विडियो के अवरिल लाविंगने (Avril Lavigne) गाना girlfriend (Girlfriend) भी अधिक दृश्यों के आरोप में ख़ुद रिफ्रेश मचनिस्म के लिंक है जो एक फंसिते (fansite) अवरिल बंद ऐड्स द्वारा अवरिल लाविगने को समर्पित है लिंक पर क्लिक करने पर हर १५ सेकंड पर गर्लफ्रेंड के विडियो ख़ुद रेलोअद हो जायेंगे अवरिल लाविगने के फेन इन्टरनेट के ब्रोव्स करने परीक्षा के लिए पढ़ाई या सोने के लिए भी के समय इस पेज को खोलने में उत्साहित हैं देखने की अधिक ताक

In [89]:
# Vocabulary cap used for each language, and the fixed number of token ids
# every vectorized sentence is padded or truncated to.
vocab_size = 2000
max_len = 50

# English tokenizer; its vocabulary is learned from the English sentences.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_len,
)
text_vec_layer_en.adapt(en_sentence)

# Hindi tokenizer; every sentence is wrapped in the start/end marker words
# before adapting so that both markers become part of the vocabulary.
text_vec_layer_hin = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_len,
)
text_vec_layer_hin.adapt([f"startofseq {s} endofseq" for s in hin_sentence])

In [90]:
# English vocabulary as plain Python strings; print the first ten entries.
vocab_en = list(map(str, text_vec_layer_en.get_vocabulary()))
print(vocab_en[:10])

['', '[UNK]', 'the', 'of', 'and', 'to', 'in', 'a', 'is', 'that']


In [91]:
# Hindi vocabulary as plain Python strings; print the first ten entries.
vocab_hin = list(map(str, text_vec_layer_hin.get_vocabulary()))
print(vocab_hin[:10])

['', '[UNK]', 'startofseq', 'endofseq', 'के', 'में', 'है', 'की', 'और', 'से']


In [92]:
# Positional split: the first 110,000 rows are used for training, the
# remaining rows for validation.
train_rows = slice(None, 110_000)
valid_rows = slice(110_000, None)

# Encoder inputs: the raw English strings.
X_train_enc = tf.constant(en_sentence[train_rows])
X_valid_enc = tf.constant(en_sentence[valid_rows])

# Decoder inputs: the raw Hindi strings prefixed with the start marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in hin_sentence[train_rows]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in hin_sentence[valid_rows]])

# Targets: the Hindi strings followed by the end marker, converted to token
# ids by the Hindi vectorizer. Relative to the decoder input, the target is
# the same sentence shifted one token to the left.
y_train = text_vec_layer_hin([f"{s} endofseq" for s in hin_sentence[train_rows]])
y_valid = text_vec_layer_hin([f"{s} endofseq" for s in hin_sentence[valid_rows]])

In [94]:
# Seed Python, NumPy and TensorFlow before any layer is created so that the
# weight initialisation is repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Both model inputs are batches of raw strings (one scalar string per example).
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128

# Tokenization happens inside the model: strings -> fixed-length token ids.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_hin(decoder_inputs)

# Separate embedding tables per language. mask_zero=True treats token id 0
# (the padding entry of the vocabulary) as masked for the downstream layers.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Encoder: with return_state=True the call returns the output followed by the
# LSTM state tensors; only the states are passed on to the decoder.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_states = encoder(encoder_embeddings)

# Decoder: starts from the encoder's final states and emits one output vector
# per time step (return_sequences=True).
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_states)

# Per-time-step probability distribution over the vocab_size token ids.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [95]:
# Functional model: two string inputs (encoder text, decoder text) mapped to
# the per-token probability output.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)

In [None]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for ten epochs, evaluating on the held-out pairs after each epoch.
history = model.fit(
    x=(X_train_enc, X_train_dec),
    y=y_train,
    validation_data=((X_valid_enc, X_valid_dec), y_valid),
    epochs=10,
)