# Encoder-Decoder Network for Neural Machine Translation

In [2]:
import tensorflow as tf
import json
import numpy as np
import pandas as pd

In [7]:
file_path = "/kaggle/input/eng-hing/hinglish_upload_v1.json"

# Each line of the file is parsed as its own JSON object; keep only the
# English ("en") and Hinglish ("hi_ng") strings of its "translation" entry.
with open(file_path, "r", encoding="utf-8") as f:
    translations = (json.loads(line)["translation"] for line in f)
    data = [
        {"English": pair["en"], "Hinglish": pair["hi_ng"]}
        for pair in translations
    ]


In [8]:
# Turn the list of {"English", "Hinglish"} records into a DataFrame and
# preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,English,Hinglish
0,What's the name of the movie,film ka kya naam hai
1,"Hi, the rotten tomatoes score is great but the...","namaste, sada hua tomatoes score mahaan hai, l..."
2,Do you think you will like the movie,kya aapako lagata hai ki aapako film pasand aa...
3,What kind of movie is it,yah kis tarah kee philm hai
4,when was the movie made?,film kab banee thee?


In [9]:
# Summary statistics (count / unique / top / freq) for both text columns.
df.describe()

Unnamed: 0,English,Hinglish
count,189102,189102
unique,165608,155544
top,cancel all alarms,alarm ko stop kare
freq,64,88


In [10]:
# Count missing values per column.
df.isnull().sum()

English     0
Hinglish    0
dtype: int64

In [11]:
# Shuffle the rows before the positional train/validation split further down.
# A fixed random_state keeps the shuffle (and therefore the split) identical
# across "Restart Kernel -> Run All" executions.
df_final = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [12]:
# Preview the shuffled frame.
df_final.head()

Unnamed: 0,English,Hinglish
0,what is the humidity like today,aaj humidity kaisi hai
1,What is the foggiest city,foggiest city konsa hai
2,How much time do I have until the next reminde...,Mere paas agle reminder tak kitna time hai ?
3,What is the weather like for Tuesday,Tuesday ke liye mausam kaisa hoga
4,Remind me to pay the internet bill,mujhe internet bill pay karne ke liye yaad dilaye


In [13]:
# Whitespace-delimited word count of every English sentence, and the longest one.
en_sentence = df_final["English"]
en_length = en_sentence.astype(str).str.split().str.len()
max_en_length = en_length.max()
max_en_length
     

247

In [14]:
# Display the English sentence column.
en_sentence

0                           what is the humidity like today
1                                 What is the foggiest city
2         How much time do I have until the next reminde...
3                      What is the weather like for Tuesday
4                        Remind me to pay the internet bill
                                ...                        
189097    remind me monday to make a podiatrist appointment
189098                                       Friday at 8 pm
189099    A venture capital fund set up as a company sha...
189100                         Crying family thread reply .
189101    Olaf could be a bit much sometimes, but had so...
Name: English, Length: 189102, dtype: object

In [16]:
# Whitespace-delimited word count of every Hinglish sentence, and the longest one.
hing_sentence = df_final["Hinglish"]
hing_length = hing_sentence.astype(str).str.split().str.len()
max_hing_length = hing_length.max()
max_hing_length

273

In [17]:
# Display the Hinglish sentence column.
hing_sentence

0                                    aaj humidity kaisi hai
1                                   foggiest city konsa hai
2              Mere paas agle reminder tak kitna time hai ?
3                         Tuesday ke liye mausam kaisa hoga
4         mujhe internet bill pay karne ke liye yaad dilaye
                                ...                        
189097    mujhe monday ko podiatrist appointment lene ke...
189098                                       Friday ko 8 pm
189099    A venture capital fund set up as a company ko ...
189100                   Crying family thread ka reply kare
189101    Olaf thoda aur hona chahiye,par thoda funny pa...
Name: Hinglish, Length: 189102, dtype: object

In [18]:
vocab_size = 2000
max_length = 100

# One vectorizer per language: each keeps at most `vocab_size` tokens and
# pads/truncates every sentence to exactly `max_length` token ids.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_hing = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

text_vec_layer_en.adapt(en_sentence)

# The target-side vocabulary is learned with the start/end markers attached
# so that both markers receive their own token ids.
hing_with_markers = [f"startofseq {s} endofseq" for s in hing_sentence]
text_vec_layer_hing.adapt(hing_with_markers)

I0000 00:00:1756192846.629072      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [19]:
# English vocabulary as plain Python strings; show the first ten entries.
vocab_en = list(map(str, text_vec_layer_en.get_vocabulary()))
print(vocab_en[:10])
     

['', '[UNK]', 'the', 'to', 'for', 'is', 'i', 'my', 'in', 'me']


In [20]:

# Hinglish vocabulary as plain Python strings; show the first ten entries.
vocab_hing = list(map(str, text_vec_layer_hing.get_vocabulary()))
print(vocab_hing[:10])
     

['', '[UNK]', 'startofseq', 'endofseq', 'ke', 'liye', 'hai', 'ko', 'kya', 'me']


In [21]:
# Encoder input: token ids of the English sentences.
X_seq_enc = text_vec_layer_en(en_sentence)

# Decoder input starts with the start marker; the target ends with the end
# marker, so the target is the decoder input shifted one token to the left.
decoder_input_texts = [f"startofseq {s}" for s in hing_sentence]
target_texts = [f"{s} endofseq" for s in hing_sentence]

X_seq_dec = text_vec_layer_hing(decoder_input_texts)

Y_seq = text_vec_layer_hing(target_texts)

In [23]:
# Positional split: the first `train_size` rows train, the remainder validate.
train_size = 150_000

X_train_enc, X_valid_enc = X_seq_enc[:train_size], X_seq_enc[train_size:]
X_train_dec, X_valid_dec = X_seq_dec[:train_size], X_seq_dec[train_size:]
Y_train, Y_valid = Y_seq[:train_size], Y_seq[train_size:]

In [25]:
# Sanity check: run a small batch of encoder token ids through an embedding
# layer and inspect the resulting tensor.
embed_size = 128
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)

# `a` was not defined by any cell in the notebook, so this cell failed on a
# fresh kernel. NOTE(review): the first 10 training rows reproduce the
# (10, 100, 128) output shape shown for `b` -- confirm this matches the
# batch originally intended.
a = X_train_enc[:10]
b = encoder_embedding_layer(a)

In [26]:
# Display the embedded sample batch.
b

<tf.Tensor: shape=(10, 100, 128), dtype=float32, numpy=
array([[[ 0.02086392, -0.01167547, -0.03631514, ...,  0.00073953,
          0.02860595, -0.01199303],
        [-0.02630136,  0.03214389, -0.04359876, ...,  0.02073013,
         -0.03197219,  0.01151714],
        [ 0.01756834,  0.04606367,  0.03038806, ..., -0.04580277,
         -0.01318791,  0.02762279],
        ...,
        [ 0.0046431 , -0.04426136,  0.04893133, ..., -0.02799054,
          0.04442294, -0.01258005],
        [ 0.0046431 , -0.04426136,  0.04893133, ..., -0.02799054,
          0.04442294, -0.01258005],
        [ 0.0046431 , -0.04426136,  0.04893133, ..., -0.02799054,
          0.04442294, -0.01258005]],

       [[-0.02329358, -0.02757799,  0.03076025, ..., -0.02242847,
          0.00144757, -0.01622494],
        [-0.02293028, -0.00953459,  0.04455011, ..., -0.04350164,
          0.00664977, -0.04159091],
        [ 0.01179552,  0.03692505,  0.02243737, ...,  0.03797451,
         -0.04480624, -0.04431727],
        ...

In [27]:
# Shape of the training targets before the extra label axis is added.
Y_train.shape

TensorShape([150000, 100])

In [28]:
# Give the integer targets a trailing singleton axis: (rows, steps) ->
# (rows, steps, 1). The ndim guard makes the cell idempotent; without it,
# every re-run of this cell would append yet another axis.
Y_train = np.asarray(Y_train)
if Y_train.ndim == 2:
    Y_train = Y_train[..., np.newaxis]

Y_valid = np.asarray(Y_valid)
if Y_valid.ndim == 2:
    Y_valid = Y_valid[..., np.newaxis]

In [34]:
# Confirm the target shapes after the label axis was added.
for labels in (Y_train, Y_valid):
    print(labels.shape)

(150000, 100, 1)
(39102, 100, 1)


In [41]:
layers = tf.keras.layers

# Both inputs are fixed-length sequences of integer token ids.
encoder_inputs = layers.Input(shape=(max_length,), dtype=tf.int32)
decoder_inputs = layers.Input(shape=(max_length,), dtype=tf.int32)

# Separate embedding tables per language; mask_zero=True marks id 0 as
# padding for the downstream layers.
embed_size = 128
encoder_embedding_layer = layers.Embedding(
    vocab_size, embed_size, mask_zero=True
)
decoder_embedding_layer = layers.Embedding(
    vocab_size, embed_size, mask_zero=True
)
encoder_embeddings = encoder_embedding_layer(encoder_inputs)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Encoder: with return_state=True the LSTM returns its output followed by
# its final states; only the states are handed to the decoder.
encoder = layers.LSTM(512, return_state=True, use_cudnn=False)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder's final states and emits one output
# vector per time step.
decoder = layers.LSTM(512, return_sequences=True, use_cudnn=False)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_states)

# Softmax over the vocabulary at every decoder time step.
output_layer = layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [42]:
# Assemble the two-input model, then train it with integer (sparse) targets.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

train_inputs = (X_train_enc, X_train_dec)
valid_inputs = (X_valid_enc, X_valid_dec)
history = model.fit(
    train_inputs,
    Y_train,
    validation_data=(valid_inputs, Y_valid),
    batch_size=16,
    epochs=10,
)

Epoch 1/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.0449 - loss: 2.9836

W0000 00:00:1756193642.524065     102 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1756193669.838516     102 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 56ms/step - accuracy: 0.0449 - loss: 2.9835 - val_accuracy: 0.0670 - val_loss: 1.5061
Epoch 2/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 56ms/step - accuracy: 0.0696 - loss: 1.3254 - val_accuracy: 0.0727 - val_loss: 1.1736
Epoch 3/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 55ms/step - accuracy: 0.0757 - loss: 0.9925 - val_accuracy: 0.0747 - val_loss: 1.0627
Epoch 4/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m514s[0m 55ms/step - accuracy: 0.0791 - loss: 0.8134 - val_accuracy: 0.0757 - val_loss: 1.0195
Epoch 5/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 55ms/step - accuracy: 0.0816 - loss: 0.7039 - val_accuracy: 0.0762 - val_loss: 1.0144
Epoch 6/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m512s[0m 55ms/step - accuracy: 0.0835 - loss: 0.6173 - val_accuracy: 0.0764 - val_loss: 1.0170
Epoch 7/1

In [46]:
def translate(sentence):
    """Greedily translate one English sentence into Hinglish.

    The decoder is fed "startofseq" followed by the words produced so far,
    and the most probable next word is appended, until "endofseq" is
    predicted or `max_length` words have been generated.

    Args:
        sentence: The English sentence to translate, as a single string.

    Returns:
        The predicted Hinglish words joined by single spaces (without the
        start/end markers).
    """
    # The source sentence and the target vocabulary never change during
    # decoding, so compute them once instead of on every step.
    X_enc = text_vec_layer_en(np.array([sentence]))
    hing_vocab = text_vec_layer_hing.get_vocabulary()

    translated_words = []
    for step in range(max_length):
        # Decoder input: start marker plus everything generated so far.
        X_dec = np.array(["startofseq " + " ".join(translated_words)])
        X_dec = text_vec_layer_hing(X_dec)

        # Predict next-token probabilities: (1, seq_len, vocab_size).
        y_proba = model.predict((X_enc, X_dec), verbose=0)

        # At this step the decoder input holds `step + 1` real tokens, so the
        # last real token sits at index `step` and its output is the
        # prediction for the next word. X_dec is always padded to the
        # vectorizer's fixed output length, so indexing with
        # X_dec.shape[1] - 1 would read a padding position instead.
        next_token_id = np.argmax(y_proba[0, step, :])
        next_word = hing_vocab[next_token_id]

        # Stop on the end marker. Also stop on the padding token (""): it
        # adds no token to the decoder input, which would break the
        # step/position alignment relied on above.
        if next_word == "endofseq" or next_word == "":
            break

        translated_words.append(next_word)

    return " ".join(translated_words)
