In [1]:
import pandas as pd

# Location of the parallel translation corpus, relative to this notebook.
dataset_path = "../Data/translator_dataset.csv"

# Load every translation pair into a single DataFrame.
df = pd.read_csv(dataset_path)

# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,source_lang,target_lang,source_text,target_text
0,en,hi,We eat tea in the school.,हम स्कूल में चाय पीते हैं.
1,hi,en,हम स्कूल में चाय पीते हैं.,We eat tea in the school.
2,en,pa,We eat tea in the school.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।
3,pa,en,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,We eat tea in the school.
4,hi,pa,हम स्कूल में चाय पीते हैं.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।


In [2]:
def add_tokens(row):
    """Build the tagged encoder and decoder strings for one translation pair.

    Parameters
    ----------
    row : mapping
        Must provide the keys "source_lang", "target_lang", "source_text"
        and "target_text" (a DataFrame row works).

    Returns
    -------
    pd.Series
        Two strings: the source text prefixed with "<src> <to_tgt>" language
        tags, and the target text wrapped in "<start>" / "<end>" markers.
    """
    # The language tags tell the model what it is reading and what to produce.
    tagged_source = "<{}> <to_{}> {}".format(
        row["source_lang"], row["target_lang"], row["source_text"]
    )
    # <start>/<end> delimit the sequence the decoder has to generate.
    wrapped_target = "<start> {} <end>".format(row["target_text"])

    return pd.Series([tagged_source, wrapped_target])

# add_tokens returns two values per row; store them as two new columns.
token_columns = ["encoder_input", "decoder_output"]
df[token_columns] = df.apply(add_tokens, axis=1)

# Preview the frame with the newly added columns.
df.head()

Unnamed: 0,source_lang,target_lang,source_text,target_text,encoder_input,decoder_output
0,en,hi,We eat tea in the school.,हम स्कूल में चाय पीते हैं.,<en> <to_hi> We eat tea in the school.,<start> हम स्कूल में चाय पीते हैं. <end>
1,hi,en,हम स्कूल में चाय पीते हैं.,We eat tea in the school.,<hi> <to_en> हम स्कूल में चाय पीते हैं.,<start> We eat tea in the school. <end>
2,en,pa,We eat tea in the school.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,<en> <to_pa> We eat tea in the school.,<start> ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ। <end>
3,pa,en,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,We eat tea in the school.,<pa> <to_en> ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,<start> We eat tea in the school. <end>
4,hi,pa,हम स्कूल में चाय पीते हैं.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,<hi> <to_pa> हम स्कूल में चाय पीते हैं.,<start> ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ। <end>


In [3]:
# Fixed lengths (in tokens) that every encoder / decoder sequence is padded or
# truncated to by pad_sequences. The decoder sequence is later split into a
# shifted input/target pair, so the decoder Input layer sees max_decoder_len - 1.
max_encoder_len = 20
max_decoder_len = 20

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One shared vocabulary is fitted over both the tagged source strings and the
# <start>/<end>-wrapped target strings.
all_text = df["encoder_input"].tolist() + df["decoder_output"].tolist()

# filters='' disables the default punctuation stripping, so the angle-bracket
# control tokens (and sentence punctuation) stay part of the tokens.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(all_text)

# word_index ids start at 1 (0 is never assigned to a word), hence the extra slot.
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary size:", vocab_size)


Vocabulary size: 414


In [5]:
# Convert every tagged string into its list of integer token ids.
encoder_sequences = tokenizer.texts_to_sequences(df["encoder_input"])
decoder_sequences = tokenizer.texts_to_sequences(df["decoder_output"])

# Pad short sequences with trailing zeros up to the fixed lengths.
# pad_sequences truncates from the FRONT by default (truncating='pre'), which
# for an over-long sentence would cut off the leading "<lang> <to_lang>" tags
# of the encoder input and the "<start>" token of the decoder sequence.
# Truncating at the end keeps those leading control tokens intact.
encoder_input_data = pad_sequences(
    encoder_sequences,
    maxlen=max_encoder_len,
    padding='post',
    truncating='post',
)
decoder_input_data = pad_sequences(
    decoder_sequences,
    maxlen=max_decoder_len,
    padding='post',
    truncating='post',
)

# Teacher forcing: the target is the decoder sequence shifted left by one
# step, so at position t the decoder is fed token t and must predict token t+1.
# Both arrays end up with max_decoder_len - 1 columns.
decoder_output_data = decoder_input_data[:, 1:]
decoder_input_data = decoder_input_data[:, :-1]

In [6]:
# Size of the vector each token id is mapped to by the Embedding layers.
embedding_dim = 256
# Number of LSTM units; also the size of the h / c state vectors that the
# encoder hands over to the decoder.
latent_dim = 512

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Encoder: embeds the padded source ids and keeps only the final LSTM states.
encoder_inputs = Input(shape=(max_encoder_len,))
# mask_zero=True marks the padding id 0 as "no token". Sequences are padded at
# the end, so without the mask the LSTM would keep updating its state over the
# trailing pad steps and the returned states would not be those of the last
# real token. vocab_size already includes the reserved index 0.
enc_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# The per-step output is not needed; the (h, c) pair summarises the source.
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: teacher-forced on the target sequence shifted by one position,
# hence max_decoder_len - 1 time steps.
decoder_inputs = Input(shape=(max_decoder_len - 1,))
# The layer objects are kept in variables so the inference decoder can reuse
# the trained weights. Masking here propagates through the LSTM and Dense
# layers, so padded target positions no longer count toward loss or accuracy.
dec_emb_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# The decoder starts from the encoder's final states and emits one output per step.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Project every decoder step onto a probability distribution over the vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (source ids, shifted target ids) -> next-token probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)




In [8]:
# Targets are integer token ids, so the sparse variant of categorical
# cross-entropy is used against the softmax output.
compile_options = {
    "optimizer": 'adam',
    "loss": 'sparse_categorical_crossentropy',
    "metrics": ['accuracy'],
}
model.compile(**compile_options)

# Print the layer-by-layer overview of the compiled model.
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 20)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 19)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 20, 256)              105984    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 19, 256)              105984    ['input_2[0][0]']             
                                                                                             

In [9]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has failed to improve for 5 epochs in a row, and roll the
# model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training settings: 10% of the samples are held out for validation.
fit_options = dict(
    batch_size=64,
    epochs=40,
    validation_split=0.1,
    callbacks=[early_stop],
)

# Inputs are (source ids, shifted target ids); targets are the next-token ids.
history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    **fit_options,
)

Epoch 1/40


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40


In [11]:
from tensorflow.keras.models import Model

# Inference-time encoder: maps a padded source sequence to the trained
# encoder LSTM's final [h, c] states, reusing the training graph's tensors.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

In [12]:
from tensorflow.keras.layers import Input

# State inputs: at inference time the caller supplies the decoder's previous
# (h, c) states explicitly (initially the encoder's output states).
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Token input for inference. The training Input is fixed at
# max_decoder_len - 1 time steps, which would force every decoding step to be
# fed a full-length sequence. A variable-length input (None) lets the decoder
# be stepped one token at a time, and still accepts full-length inputs.
decoder_inference_inputs = Input(shape=(None,))

# Reuse the trained embedding layer so inference shares the training weights.
dec_emb2 = dec_emb_layer(decoder_inference_inputs)

# Run the trained LSTM from the caller-provided states and expose the updated
# states so they can be fed back in on the next step.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2,
    initial_state=decoder_states_inputs
)

decoder_states2 = [state_h2, state_c2]

# Same trained projection to vocabulary probabilities as in training.
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inference decoder: (token ids, h, c) -> (token probabilities, new h, new c).
decoder_model = Model(
    [decoder_inference_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

In [13]:
# Persist the full training model (encoder plus teacher-forced decoder) to full_model.h5.
model.save("full_model.h5")

In [14]:
# Persist the two inference sub-models, each to its own file.
for inference_model, h5_name in (
    (encoder_model, "encoder_model_v2.h5"),
    (decoder_model, "decoder_model_v2.h5"),
):
    inference_model.save(h5_name)

import pickle

# The fitted tokenizer holds the word <-> id mapping the saved models were
# trained with, so it is stored alongside them.
with open("tokenizer_v2.pkl", mode="wb") as f:
    pickle.dump(tokenizer, f)

