In [1]:
import pandas as pd

df = pd.read_csv("../Data/translator_dataset.csv")

print("Total rows:", len(df))
df.head()

Total rows: 17964


Unnamed: 0,source_lang,target_lang,source_text,target_text
0,en,hi,We eat tea in the school.,हम स्कूल में चाय पीते हैं.
1,hi,en,हम स्कूल में चाय पीते हैं.,We eat tea in the school.
2,en,pa,We eat tea in the school.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।
3,pa,en,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,We eat tea in the school.
4,hi,pa,हम स्कूल में चाय पीते हैं.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।


In [2]:
def add_tokens(row):
    source = f"<{row['source_lang']}> {row['source_text']}"
    target = f"<start> {row['target_text']} <end>"
    return pd.Series([source, target])

df[["encoder_input", "decoder_target"]] = df.apply(add_tokens, axis=1)

df.head()

Unnamed: 0,source_lang,target_lang,source_text,target_text,encoder_input,decoder_target
0,en,hi,We eat tea in the school.,हम स्कूल में चाय पीते हैं.,<en> We eat tea in the school.,<start> हम स्कूल में चाय पीते हैं. <end>
1,hi,en,हम स्कूल में चाय पीते हैं.,We eat tea in the school.,<hi> हम स्कूल में चाय पीते हैं.,<start> We eat tea in the school. <end>
2,en,pa,We eat tea in the school.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,<en> We eat tea in the school.,<start> ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ। <end>
3,pa,en,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,We eat tea in the school.,<pa> ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,<start> We eat tea in the school. <end>
4,hi,pa,हम स्कूल में चाय पीते हैं.,ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।,<hi> हम स्कूल में चाय पीते हैं.,<start> ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ। <end>


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

all_text = list(df["encoder_input"]) + list(df["decoder_target"])

tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(all_text)

vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)


Vocabulary size: 411


In [4]:
encoder_sequences = tokenizer.texts_to_sequences(df["encoder_input"])
decoder_sequences = tokenizer.texts_to_sequences(df["decoder_target"])

In [5]:
max_encoder_len = max(len(seq) for seq in encoder_sequences)
max_decoder_len = max(len(seq) for seq in decoder_sequences)

encoder_padded = pad_sequences(encoder_sequences, maxlen=max_encoder_len, padding='post')
decoder_padded = pad_sequences(decoder_sequences, maxlen=max_decoder_len, padding='post')

print("Encoder shape:", encoder_padded.shape)
print("Decoder shape:", decoder_padded.shape)

Encoder shape: (17964, 10)
Decoder shape: (17964, 11)


In [6]:
import numpy as np

decoder_input = decoder_padded[:, :-1]
decoder_output = decoder_padded[:, 1:]

print("Decoder input shape:", decoder_input.shape)
print("Decoder output shape:", decoder_output.shape)

Decoder input shape: (17964, 10)
Decoder output shape: (17964, 10)


In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

embedding_dim = 128
latent_dim = 256

# ----- ENCODER -----
encoder_inputs = Input(shape=(max_encoder_len,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)

encoder_states = [state_h, state_c]

# ----- DECODER -----
decoder_inputs = Input(shape=(max_decoder_len - 1,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# ----- FULL MODEL -----
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 10)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 10)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 10, 128)              52608     ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 10, 128)              52608     ['input_2[0][0]']             
                                                                                             

In [8]:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)




In [9]:
decoder_output = np.expand_dims(decoder_output, -1)

In [10]:
history = model.fit(
    [encoder_padded, decoder_input],
    decoder_output,
    batch_size=64,
    epochs=20,
    validation_split=0.1
)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

In [12]:
# Decoder setup
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding2 = decoder_embedding
decoder_lstm_outputs, state_h2, state_c2 = decoder_lstm(
    decoder_embedding2,
    initial_state=decoder_states_inputs
)

decoder_states = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_lstm_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states
)

In [13]:
reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}

In [15]:
def translate_sentence(source_lang, sentence):
    
    # Format input exactly like training
    input_text = f"<{source_lang}> {sentence}"
    
    # Convert to sequence
    sequence = tokenizer.texts_to_sequences([input_text])
    sequence = pad_sequences(sequence, maxlen=max_encoder_len, padding='post')

    # Get encoder states
    states_value = encoder_model.predict(sequence, verbose=0)

    # Start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index["<start>"]

    decoded_sentence = ""

    for _ in range(max_decoder_len):

        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value,
            verbose=0
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, "")

        if sampled_word == "<end>" or sampled_word == "":
            break

        decoded_sentence += " " + sampled_word

        # Update target sequence
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.strip()

In [28]:
print(translate_sentence("en", "We eat tea in the school."))
print(translate_sentence("hi", "हम स्कूल में चाय पीते हैं."))
print(translate_sentence("pa", "ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।"))

ਅਸੀਂ ਸਕੂਲ ਵਿੱਚ ਚਾਹ ਖਾਂਦੇ ਹਾਂ।
we eat tea in the school.
हम स्कूल में चाय पीते हैं.


In [17]:
model.save("translator_training_model.h5")

  saving_api.save_model(


In [18]:
encoder_model.save("encoder_model.h5")



In [19]:
decoder_model.save("decoder_model.h5")



In [20]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [25]:
pip install fastapi uvicorn jinja2 python-multipart

Collecting fastapi
  Downloading fastapi-0.133.1-py3-none-any.whl (109 kB)
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/109.0 kB ? eta -:--:--
     --- ------------------------------------ 10.2/109.0 kB ? eta -:--:--
     ---------- -------------------------- 30.7/109.0 kB 435.7 kB/s eta 0:00:01
     ------------- ----------------------- 41.0/109.0 kB 393.8 kB/s eta 0:00:01
     ------------------------------- ----- 92.2/109.0 kB 585.1 kB/s eta 0:00:01
     ------------------------------- ----- 92.2/109.0 kB 585.1 kB/s eta 0:00:01
     ------------------------------------ 109.0/109.0 kB 486.6 kB/s eta 0:00:00
Collecting uvicorn
  Downloading uvicorn-0.41.0-py3-none-any.whl (68 kB)
     ---------------------------------------- 0.0/68.8 kB ? eta -:--:--
     ---------------------- --------------- 41.0/68.8 kB 991.0 kB/s eta 0:00:01
     --------------------------------- ---- 61.4/68.8 kB 812.7 kB/s eta 0:00:01



[notice] A new release of pip is available: 23.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
