## Encoder-Decoder Network for Neural Machine Translation

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd


In [2]:
# Load the English-Hindi sentence pairs from a CSV file addressed relative to
# the notebook's working directory.
df = pd.read_csv("Dataset_English_Hindi.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [None]:
# Preview the last rows of the raw frame.
df.tail()

Unnamed: 0,English,Hindi
130471,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
130472,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
130473,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
130474,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .
130475,They've just won four government contracts to ...,हाल ही में उन्हें सरकारी ठेका मिला है करीब सौ ...


In [None]:
# Summary statistics (count / unique / top / freq) of the text columns.
df.describe()

Unnamed: 0,English,Hindi
count,130474,130164
unique,126959,100228
top,(Laughter),(हँसी)
freq,555,212


In [None]:
# Count missing values per column before any cleaning.
df.isnull().sum()

Unnamed: 0,0
English,2
Hindi,312


In [3]:
# Drop every row that has a missing value in any column; `df` itself is left
# untouched because `dropna` returns a new frame.
df_cleaned = df.dropna(axis=0)

In [4]:
# Confirm that no missing values remain after the drop.
df_cleaned.isnull().sum()

Unnamed: 0,0
English,0
Hindi,0


In [5]:
# Shuffle all rows and renumber the index from 0.
# A fixed `random_state` makes the shuffle reproducible, so the positional
# train/validation split taken further down selects the same rows on every
# Restart & Run All instead of a different random subset each time.
df_final = df_cleaned.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
# English column of the shuffled corpus (source-language text).
en_sentence = df_final["English"]

In [7]:
# Display the English sentences.
en_sentence

Unnamed: 0,English
0,"In many places , the flocks are shorn twice a ..."
1,"Anyway, Johnny had just gotten through having ..."
2,But we had no money to support them till a cor...
3,Jawaharlal Neharu got his eduction from wolrd'...
4,and contribute to the evolutionary pressures
...,...
130157,I'm here today to talk
130158,The death of this devoted friend was a deeply ...
130159,So I decided I'm going to sell this new machine
130160,It was a stroke of bad luck that just before t...


In [8]:
# Hindi column of the shuffled corpus (target-language text).
hin_sentence = df_final["Hindi"]

In [9]:
# Display the Hindi sentences.
hin_sentence

Unnamed: 0,Hindi
0,अनेक स्थानों पर तो भेड़ों की ऊन वर्ष में दो बा...
1,"खैर, जॉनी अभी-अभी उसके साथ सहवास को अंजाम दे च..."
2,लेकिन जब तक एक कंपनी ने हमें इससे उबारा नहीं त...
3,जवाहरलाल नेहरू ने दुनिया के कुछ बेहतरीन स्कूलो...
4,और विकास के इस दवाब में योगदान कर सकें..
...,...
130157,आज मैं यहाँ कहना आया हूँ
130158,वे कवि के घनिष्ठ और अंतरंग मित्र थे.उनका निधन ...
130159,तो मैने अब तय किया है कि इस नए मशीन को
130160,यह दुर्भागऋ-ऊण्श्छ्ष्-य था कि कांग्रेस अधिवेशन...


In [None]:
# Print the first three sentence pairs side by side as a quick alignment check.
for row in (0, 1, 2):
    english_text = en_sentence[row]
    hindi_text = hin_sentence[row]
    print(english_text, "->", hindi_text)

Also it goes to the parts of New Mumbai and Thane. -> साथ ही नवी मुंबई एवं ठाणे के भी भाग तक जातीं हैं।
4. Hindutvas aim is above heaven and hell -> 4. हिन्दुत्व का लक्ष्य स्वर्ग-नरक से ऊपर
All such efforts , however , must be directed towards the realisation of the plan we have drawn up for a free India . -> ये सारी कोशिशें उस योजना को सफल बनाने के लिए की जानी चाहिए , जो हमने आजाद हिंदुस्तान के एइल तैयार की हैं .


In [10]:
# Tokenizer settings shared by both languages.
vocab_size = 2000  # maximum number of tokens kept per vocabulary
max_len = 50       # every sentence is padded / truncated to this many tokens

# One vectorizer per language, configured identically.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_len,
)
text_vec_layer_hin = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_len,
)

# Learn the English vocabulary from the raw sentences.
text_vec_layer_en.adapt(en_sentence)

# The Hindi vocabulary is learned with the start/end markers wrapped around
# every sentence so that both markers become regular vocabulary entries.
text_vec_layer_hin.adapt(
    [f"startofseq {s} endofseq" for s in hin_sentence]
)

In [11]:
# Vocabulary learned by the English vectorizer, coerced to plain Python strings.
vocab_en = list(map(str, text_vec_layer_en.get_vocabulary()))
print(vocab_en[0:10])

['', '[UNK]', 'the', 'of', 'and', 'to', 'in', 'a', 'is', 'that']


In [12]:
# Vocabulary learned by the Hindi vectorizer, coerced to plain Python strings.
vocab_hin = list(map(str, text_vec_layer_hin.get_vocabulary()))
print(vocab_hin[0:10])

['', '[UNK]', 'startofseq', 'endofseq', 'के', 'में', 'है', 'की', 'और', 'से']


In [13]:
# The first `n_train` shuffled pairs are used for training, the rest for validation.
n_train = 110_000
hin_train = hin_sentence[:n_train]
hin_valid = hin_sentence[n_train:]

# Encoder inputs: the raw English strings.
X_train_enc = tf.constant(en_sentence[:n_train])
X_valid_enc = tf.constant(en_sentence[n_train:])

# Decoder inputs: each Hindi sentence prefixed with the start marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in hin_train])
X_valid_dec = tf.constant([f"startofseq {s}" for s in hin_valid])

# Targets: the same Hindi sentences with the end marker appended and converted
# to token ids, i.e. the decoder input shifted by one position.
y_train = text_vec_layer_hin([f"{s} endofseq" for s in hin_train])
y_valid = text_vec_layer_hin([f"{s} endofseq" for s in hin_valid])

In [14]:
# Both model inputs are batches of raw scalar strings; tokenization happens
# inside the graph through the adapted TextVectorization layers.
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

embed_size = 128

# Text -> integer token ids.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_hin(decoder_inputs)

# A separate embedding table per language; mask_zero=True is set on both.
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    mask_zero=True,
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    mask_zero=True,
)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Encoder: only its final states are used; they seed the decoder below.
encoder = tf.keras.layers.LSTM(units=512, return_state=True, use_cudnn=False)
encoder_outputs, *encoder_states = encoder(encoder_embeddings)

# Decoder: emits one output vector per time step, starting from the encoder states.
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True, use_cudnn=False)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_states)

# Per-step probability distribution over the Hindi vocabulary.
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [None]:
# Wrap the graph into a trainable model: two string inputs (English sentence,
# Hindi decoder prefix) -> per-step probabilities over the Hindi vocabulary.
model = tf.keras.Model(inputs=[encoder_inputs,decoder_inputs],outputs=[Y_proba])

In [None]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the (encoder text, decoder text) pairs and track the held-out pairs.
history = model.fit(
    x=(X_train_enc, X_train_dec),
    y=y_train,
    validation_data=((X_valid_enc, X_valid_dec), y_valid),
    epochs=10,
)

Epoch 1/10
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m719s[0m 208ms/step - accuracy: 0.0987 - loss: 4.3939 - val_accuracy: 0.1130 - val_loss: 3.6935
Epoch 2/10
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 214ms/step - accuracy: 0.1164 - loss: 3.5431 - val_accuracy: 0.1211 - val_loss: 3.3221
Epoch 3/10
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m723s[0m 208ms/step - accuracy: 0.1259 - loss: 3.1626 - val_accuracy: 0.1265 - val_loss: 3.1368
Epoch 4/10
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m759s[0m 213ms/step - accuracy: 0.1338 - loss: 2.9173 - val_accuracy: 0.1299 - val_loss: 3.0353
Epoch 5/10
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m739s[0m 215ms/step - accuracy: 0.1411 - loss: 2.7222 - val_accuracy: 0.1326 - val_loss: 2.9733
Epoch 6/10
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 215ms/step - accuracy: 0.1483 - loss: 2.5669 - val_accuracy: 0.1343 - val_loss:

In [None]:
def translate(sentence_en):
    """Greedily translate one English sentence into Hindi with the global ``model``.

    The model is re-run once per generated word: the words produced so far,
    prefixed with ``startofseq``, are fed back as the decoder input and the most
    probable token at the next position is appended. Decoding stops when
    ``endofseq`` or the padding token is predicted, or after ``max_len`` words.

    Args:
        sentence_en: The English source sentence as a plain string.

    Returns:
        The generated Hindi words joined by single spaces. The result may
        contain ``[UNK]`` and is the empty string if nothing was generated.
    """
    # The id -> word table does not change while decoding, so build it once
    # instead of on every step.
    vocab = text_vec_layer_hin.get_vocabulary()

    # Encode the input sentence as tf.string tensor
    X_enc = tf.constant([sentence_en])

    words = []
    for _ in range(max_len):
        X_dec = tf.constant(["startofseq " + " ".join(words)])

        y_pred = model.predict((X_enc, X_dec), verbose=0)  # shape: (1, max_len, vocab_size)
        # The next word to emit sits at the position equal to the number of
        # words generated so far.
        predicted_id = np.argmax(y_pred[0, len(words), :])
        predicted_word = vocab[predicted_id]

        # The empty string is the padding entry of the vocabulary. Appending it
        # would not advance the position, so the same step would be predicted
        # again and again until the loop runs out; stop right away instead.
        if predicted_word == "endofseq" or predicted_word == "":
            break

        words.append(predicted_word)

    return " ".join(words)


In [None]:
# Smoke-test the greedy decoder on a short sentence.
translate("I am happy")

'मुझे [UNK] है'

In [None]:
# Check whether the Hindi word "खुश" is one of the tokens known to the Hindi vectorizer.
"खुश" in text_vec_layer_hin.get_vocabulary()


True

## Bidirectional RNNs

In [26]:
class ConcatStatesLayer(tf.keras.layers.Layer):
    """Merge an alternating list of state tensors into two tensors.

    Entries at even positions are concatenated with each other along the last
    axis, and so are the entries at odd positions.
    """

    def call(self, states):
        """Return ``[concat(even entries), concat(odd entries)]`` of ``states``."""
        even_entries = states[0::2]
        odd_entries = states[1::2]
        merged_even = tf.concat(even_entries, axis=-1)
        merged_odd = tf.concat(odd_entries, axis=-1)
        return [merged_even, merged_odd]

# Bidirectional encoder: one 256-unit LSTM per reading direction. This rebinds
# `encoder`, `encoder_outputs` and `encoder_states` from the earlier model.
encoder = tf.keras.layers.Bidirectional(
    layer=tf.keras.layers.LSTM(units=256, return_state=True)
)
encoder_outputs, *encoder_states = encoder(encoder_embeddings)

# Collapse the per-direction states into a single pair for the decoder.
encoder_state = ConcatStatesLayer()(encoder_states)


In [27]:
# The encoder above wraps a 256-unit LSTM in `Bidirectional`, and
# `ConcatStatesLayer` joins the forward and backward states, so each state
# handed to the decoder is 2 * 256 = 512 wide. The decoder's unit count must
# equal that width, otherwise `initial_state` is shape-incompatible.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Per-step probability distribution over the Hindi vocabulary.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

# Rebinds `model` to the bidirectional variant.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])

In [None]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the (encoder text, decoder text) pairs and track the held-out pairs.
history = model.fit(
    x=(X_train_enc, X_train_dec),
    y=y_train,
    validation_data=((X_valid_enc, X_valid_dec), y_valid),
    epochs=10,
)