<a href="https://colab.research.google.com/github/Rajfekar/PythonML/blob/main/Encoder_Decoder_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow




In [14]:
import pandas as pd

# Read the parallel corpus, keep only the two text columns, drop rows that are
# missing either side, and force both columns to str (guards against mixed types).
df = (
    pd.read_csv("/content/Dataset_English_Hindi.csv")
    .loc[:, ['English', 'Hindi']]
    .dropna()
    .astype({'English': str, 'Hindi': str})
)

df


Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।
...,...,...
130471,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
130472,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
130473,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
130474,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
import json


# Wrap every Hindi target sentence in the literal "start" / "end" marker words.
hindi_sentences_tagged = [
    " ".join(("start", sent, "end")) for sent in df["Hindi"].tolist()
]

# Source-side (English) vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>', lower=True, filters='')
tokenizer.fit_on_texts(df["English"].tolist())

# Target-side (Hindi) vocabulary, fitted on the tagged sentences.
tokenizer_hindi = Tokenizer(oov_token='<OOV>', lower=True, filters='')
tokenizer_hindi.fit_on_texts(hindi_sentences_tagged)

# id -> word lookup used when decoding predictions; ids are stored as strings.
reverse_tokenizer = dict(
    (str(index), word) for word, index in tokenizer_hindi.word_index.items()
)


In [None]:
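# Disabled (kept for reference): the training cell below writes the same three
# files (tokenizer.pkl, tokenizer_hindi.pkl, rev_token_hindi.json).
# # Save tokenizers to disk
# with open("tokenizer.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)

# with open("tokenizer_hindi.pkl", "wb") as f:
#     pickle.dump(tokenizer_hindi, f)

# # Save reverse word index to JSON
# with open("rev_token_hindi.json", "w") as f:
#     json.dump(reverse_tokenizer, f)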


In [None]:
import pandas as pd
import numpy as np
import json
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# 1. Load & clean data
# Drop rows missing either side, then force both text columns to str: the
# "start ... end" concatenation below raises TypeError on a non-string cell.
df = pd.read_csv("/content/Dataset_English_Hindi.csv")
df = df.dropna(subset=["English", "Hindi"]).astype({"English": str, "Hindi": str})

# 2. Prepare tokenizers
english_sentences = df["English"].tolist()
# Every target sentence is wrapped in the literal "start"/"end" marker words
# that the decoder is primed with / stops on at inference time.
hindi_sentences_tagged = ["start " + sent + " end" for sent in df["Hindi"].tolist()]

tokenizer = Tokenizer(filters='', lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(english_sentences)

tokenizer_hindi = Tokenizer(filters='', lower=True, oov_token='<OOV>')
tokenizer_hindi.fit_on_texts(hindi_sentences_tagged)

# Keys are stringified because JSON object keys are always strings.
reverse_tokenizer = {str(index): word for word, index in tokenizer_hindi.word_index.items()}

# 3. Save tokenizers
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

with open("tokenizer_hindi.pkl", "wb") as f:
    pickle.dump(tokenizer_hindi, f)

with open("rev_token_hindi.json", "w") as json_file:
    json.dump(reverse_tokenizer, json_file)

# 4. Preprocess input/output sequences
max_len = 260

encoder_input = tokenizer.texts_to_sequences(english_sentences)
encoder_input = pad_sequences(encoder_input, maxlen=max_len, padding='post')

decoder_input = tokenizer_hindi.texts_to_sequences(hindi_sentences_tagged)
# truncating='post': pad_sequences truncates from the front by default, which
# would cut the leading "start" token off any target longer than max_len.
decoder_input = pad_sequences(
    decoder_input, maxlen=max_len, padding='post', truncating='post'
)

# Teacher forcing: the target is the decoder input shifted one step to the left.
# zeros_like already leaves the final column as padding (0).
decoder_target = np.zeros_like(decoder_input)
decoder_target[:, :-1] = decoder_input[:, 1:]

# 5. Build the encoder-decoder model
num_encoder_tokens = len(tokenizer.word_index) + 1  # +1 for the padding id 0
num_decoder_tokens = len(tokenizer_hindi.word_index) + 1
embedding_dim = 256
lstm_units = 256

# Encoder
# mask_zero=True makes the LSTM skip the post-padding, so the returned state is
# the one after the last real token rather than after up to max_len pad steps.
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
# The mask propagates to the output, so padded positions are excluded from the
# loss and from the reported accuracy.
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(num_decoder_tokens, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# NOTE(review): the softmax output is (batch, time, num_decoder_tokens); with a
# large Hindi vocabulary this is likely the dominant memory/time cost of
# training. Consider a smaller max_len or a capped vocabulary if it stalls.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 6. Train
model.fit(
    [encoder_input, decoder_input],
    np.expand_dims(decoder_target, -1),
    batch_size=64,
    epochs=2,
    validation_split=0.2
)

# 7. Save models
model.save("base_encoder_decoder_model_new.h5")
model.save("base_encoder_decoder_model_new.keras")


Epoch 1/2


In [11]:
def predict_translation(input_text, max_len=260):
    """Greedily decode a Hindi translation for one English sentence.

    Relies on notebook globals: ``tokenizer`` and ``tokenizer_hindi`` (fitted
    tokenizers), ``model`` (the trained encoder-decoder), ``reverse_tokenizer``
    (token id -> word) and ``pad_sequences``.

    Args:
        input_text: English sentence to translate.
        max_len: Length the encoder input is padded to, and the upper bound on
            decoded positions. Defaults to 260, the value used for training.

    Returns:
        The decoded words joined by single spaces. If anything raises, the
        string ``"Error: <message>"`` is returned instead of propagating.
    """
    try:
        encoder_input = tokenizer.texts_to_sequences([input_text])
        encoder_input = pad_sequences(encoder_input, maxlen=max_len, padding='post')

        sos_token = tokenizer_hindi.word_index.get('start', 1)
        eos_token = tokenizer_hindi.word_index.get('end', 2)

        # Integer buffer of generated ids; position 0 holds the start marker.
        decoder_input = np.zeros((1, max_len), dtype='int32')
        decoder_input[0, 0] = sos_token

        output_sentence = []

        for i in range(1, max_len):
            # Feed only the prefix generated so far. The decoder LSTM is
            # unidirectional, so the output at position i - 1 depends only on
            # tokens 0..i-1; running all max_len positions every step is
            # wasted work.
            predictions = model.predict(
                [encoder_input, decoder_input[:, :i]], verbose=0
            )
            predicted_token_id = int(np.argmax(predictions[0, i - 1, :]))

            # Stop if we hit end or pad
            if predicted_token_id == eos_token or predicted_token_id == 0:
                break

            # reverse_tokenizer may be keyed by int, or by str when it was built
            # for / loaded from JSON; try both before falling back to <unk>.
            predicted_word = reverse_tokenizer.get(predicted_token_id)
            if predicted_word is None:
                predicted_word = reverse_tokenizer.get(str(predicted_token_id), '<unk>')
            output_sentence.append(predicted_word)

            decoder_input[0, i] = predicted_token_id

        return ' '.join(output_sentence)

    except Exception as e:
        # Deliberate best-effort: report the failure as text to the caller.
        return f"Error: {str(e)}"


In [12]:
# Example
# Rebuild the id -> word lookup with integer keys: predict_translation looks
# words up by the integer id that np.argmax returns.
reverse_tokenizer = dict(
    zip(tokenizer_hindi.word_index.values(), tokenizer_hindi.word_index.keys())
)

pred = predict_translation("hello!")
pred

KeyboardInterrupt: 