In [11]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy English -> Hindi parallel corpus. The raw sentence pairs are listed
# first; every Hindi target is then wrapped in the '<start>' / '<end>'
# marker tokens that the decoder uses to begin and stop generation.
_sentence_pairs = [
    ("Hello", "नमस्ते"),
    ("How are you?", "आप कैसे हैं?"),
    ("I am fine, thank you.", "मैं ठीक हूँ, धन्यवाद।"),
    ("What is your name?", "आपका नाम क्या है?"),
    ("My name is John.", "मेरा नाम जॉन है।"),
    ("Nice to meet you.", "आप से मिलकर अच्छा लगा।"),
    ("Good morning", "शुभ प्रभात"),
    ("Good night", "शुभ रात्रि"),
    ("Thank you", "धन्यवाद"),
    ("Yes", "हाँ"),
    ("No", "नहीं"),
]
corpus = [(english, f"<start> {hindi} <end>") for english, hindi in _sentence_pairs]




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Split the parallel corpus into its English and Hindi sides.
eng_texts = [english for english, _hindi in corpus]
hin_texts = [hindi for _english, hindi in corpus]

# One word-level tokenizer per language. filters='' turns off the tokenizer's
# character filtering so that the '<start>' / '<end>' markers (and any
# punctuation attached to words) are kept as part of the tokens.
eng_tokenizer = Tokenizer(filters='')
hin_tokenizer = Tokenizer(filters='')
eng_tokenizer.fit_on_texts(eng_texts)
hin_tokenizer.fit_on_texts(hin_texts)

# Sentences as lists of integer word ids.
eng_sequences = eng_tokenizer.texts_to_sequences(eng_texts)
hin_sequences = hin_tokenizer.texts_to_sequences(hin_texts)

# Vocabulary sizes; the extra slot leaves room for id 0, which is used as the
# padding value and masked by the Embedding layers (mask_zero=True).
num_encoder_tokens = 1 + len(eng_tokenizer.word_index)
num_decoder_tokens = 1 + len(hin_tokenizer.word_index)

# Longest sentence (in tokens) on each side; used as the padded length.
max_encoder_seq_length = max(map(len, eng_sequences))
max_decoder_seq_length = max(map(len, hin_sequences))

# Model hyper-parameters.
latent_dim = 512     # Size of the LSTM hidden / cell state
embedding_dim = 200  # Dimension of the embedding space


In [None]:

# Training graph: an LSTM encoder whose final hidden/cell states initialise an
# LSTM decoder that predicts a distribution over Hindi tokens at every step.

# --- Encoder: token ids -> embeddings -> final LSTM states ---
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(
    input_dim=num_encoder_tokens, output_dim=embedding_dim, mask_zero=True)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# Only the states are passed on to the decoder.
encoder_states = [state_h, state_c]

# --- Decoder: token ids -> embeddings -> LSTM (seeded by encoder) -> softmax ---
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(
    input_dim=num_decoder_tokens, output_dim=embedding_dim, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
# The decoder's own states are not needed while training.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# --- Full training model ---
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')




In [None]:
# Model inputs: integer id matrices, right-padded with 0 up to the longest
# sentence on each side.
encoder_input_data = pad_sequences(
    eng_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(
    hin_sequences, maxlen=max_decoder_seq_length, padding='post')

# Model targets: one-hot vectors of the decoder input shifted one step to the
# left, i.e. the target at step t-1 is the token that appears at step t.
# Positions past the end of a sentence stay all-zero.
target_shape = (len(corpus), max_decoder_seq_length, num_decoder_tokens)
decoder_target_data = np.zeros(target_shape, dtype='float32')
for row, token_ids in enumerate(hin_sequences):
    # Skip the first token ('<start>'): it is never something to predict.
    for step, token_id in enumerate(token_ids[1:]):
        decoder_target_data[row, step, token_id] = 1.0



In [None]:
# Fit the training model. The decoder receives the reference Hindi sequence as
# its input and is trained to predict the same sequence shifted by one step.
fit_options = dict(
    batch_size=32,
    epochs=100,
    validation_split=0.2,  # fraction of the samples held out for validation
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    **fit_options
)

# Inference encoder: maps an input sequence to the [h, c] states that seed
# the decoder.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Placeholders through which the previous decoder states (h first, then c)
# are fed back in at each inference step.
decoder_states_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs


In [None]:

# Inference decoder: a single-step model that takes the previous token plus
# the previous LSTM states and returns the next-token probabilities together
# with the updated states.
#
# The token fed to the decoder must be embedded with the SAME weights that
# were trained alongside `decoder_lstm` and `decoder_dense`. Creating a fresh
# Embedding layer here would yield randomly initialised, never-trained weights
# and the decoder would see meaningless inputs. Reusing the training decoder
# input and its embedded tensor routes inference through the trained embedding.
decoder_inputs_inf = decoder_inputs
decoder_embedding_inf = decoder_embedding

# Same trained LSTM, but seeded from the fed-back states rather than directly
# from the encoder output tensors.
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_embedding_inf, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# Inputs:  [token ids, previous h, previous c]
# Outputs: [token probabilities, new h, new c]
decoder_model = Model(
    [decoder_inputs_inf] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)

def preprocess_input_sentence(sentence):
    """Convert one English sentence into a padded batch of token ids.

    Args:
        sentence: The raw English sentence (a single string).

    Returns:
        The result of ``pad_sequences`` for a batch holding just this
        sentence, right-padded to ``max_encoder_seq_length``.
    """
    # The tokenizer works on batches, so wrap the sentence in a list.
    token_ids = eng_tokenizer.texts_to_sequences([sentence])
    return pad_sequences(token_ids, maxlen=max_encoder_seq_length, padding='post')

def decode_sequence(input_seq):
    """Greedily decode a Hindi sentence from an encoded English input.

    The encoder states seed the decoder, which is then run one token at a
    time starting from '<start>'. At each step the most probable token is
    taken and fed back in as the next input.

    Args:
        input_seq: Padded batch of English token ids holding one sentence,
            as produced by ``preprocess_input_sentence``.

    Returns:
        The decoded words joined by single spaces, without the '<start>' and
        '<end>' markers. Decoding stops at '<end>' or after
        ``max_decoder_seq_length`` generated tokens, whichever comes first.
    """
    # list(...) so the states can be concatenated below whatever container
    # type predict() returns for a multi-output model.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Decoder input for the first step: a batch of one '<start>' token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index['<start>']

    decoded_words = []
    # Bound the loop by the number of generated TOKENS. (Comparing the
    # character length of the output string with the token limit would cut
    # off any translation longer than a few characters.)
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = hin_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '<end>':
            break
        # Ids without a word (e.g. the padding id 0) map to '' and are not
        # emitted, so they cannot introduce stray spaces.
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

def translate_sentence(sentence):
    """Translate one English sentence to Hindi.

    Args:
        sentence: The raw English sentence (a single string).

    Returns:
        Whatever ``decode_sequence`` returns for the preprocessed sentence.
    """
    # Tokenise and pad first, then hand the batch to the decoder loop.
    return decode_sequence(preprocess_input_sentence(sentence))


In [12]:
# Demo: translate a single sentence and show it next to its source.
input_sentence = "Hello"
translated_sentence = translate_sentence(input_sentence)
print(
    f'Input: {input_sentence}',
    f'Translated: {translated_sentence}',
    sep='\n',
)


Input: Hello
Translated: नमस्ते
