In [1]:


import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# ------------------------------------------------
# 1️⃣ Data – just 3 simple sentence pairs
# ------------------------------------------------
english_sentences = [
    "i am going home",
    "how are you",
    "my name is john"
]

tamil_sentences = [
    "<start> நான் வீட்டுக்கு செல்கிறேன் <end>",
    "<start> நீ எப்படி இருக்கிறாய் <end>",
    "<start> எனது பெயர் ஜான் <end>"
]


In [3]:
# ------------------------------------------------
# 2️⃣ Tokenization
# ------------------------------------------------
eng_tokenizer = Tokenizer(lower=True, filters='')
tam_tokenizer = Tokenizer(lower=True, filters='')

eng_tokenizer.fit_on_texts(english_sentences)
tam_tokenizer.fit_on_texts(tamil_sentences)

input_texts = eng_tokenizer.texts_to_sequences(english_sentences)
target_texts = tam_tokenizer.texts_to_sequences(tamil_sentences)

max_encoder_seq_length = max(len(t) for t in input_texts)
max_decoder_seq_length = max(len(t) for t in target_texts)

num_encoder_tokens = len(eng_tokenizer.word_index) + 1
num_decoder_tokens = len(tam_tokenizer.word_index) + 1

encoder_input_data = pad_sequences(input_texts, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_texts, maxlen=max_decoder_seq_length, padding='post')

decoder_target_data = np.zeros(
    (len(english_sentences), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)
for i, seq in enumerate(target_texts):
    for t, word_id in enumerate(seq[1:]):  # decoder_target is shifted by one
        decoder_target_data[i, t, word_id] = 1.0

print("✅ Vocab sizes — EN:", num_encoder_tokens, "TA:", num_decoder_tokens)
print("✅ Sequence lengths — Encoder:", max_encoder_seq_length, "Decoder:", max_decoder_seq_length)


✅ Vocab sizes — EN: 12 TA: 12
✅ Sequence lengths — Encoder: 4 Decoder: 5


In [4]:
# ------------------------------------------------
# 3️⃣ Define Encoder–Decoder Model (NO ATTENTION)
# ------------------------------------------------
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [5]:
# ------------------------------------------------
# 4️⃣ Train (this will overfit quickly – that’s what we want)
# ------------------------------------------------
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=1,
    epochs=300,
    verbose=0
)

print("\n✅ Training complete! Model should now memorize all 3 translations.\n")



✅ Training complete! Model should now memorize all 3 translations.



In [6]:
# ------------------------------------------------
# 5️⃣ Inference setup (encoder + decoder models)
# ------------------------------------------------
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

reverse_tam_index = {i: word for word, i in tam_tokenizer.word_index.items()}


In [7]:

# ------------------------------------------------
# 6️⃣ Translation function
# ------------------------------------------------
def translate_sentence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tam_tokenizer.word_index['<start>']

    decoded_sentence = ''
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_tam_index.get(sampled_token_index, '')

        if sampled_word == '<end>' or sampled_word == '':
            break
        decoded_sentence += ' ' + sampled_word

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    return decoded_sentence.strip()


In [8]:
# ------------------------------------------------
# 7️⃣ Test translations
# ------------------------------------------------
test_sentences = ["i am going home", "how are you", "my name is john"]

for s in test_sentences:
    seq = eng_tokenizer.texts_to_sequences([s])
    seq = pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')
    print(f"\nEN: {s}")
    print(f"TA: {translate_sentence(seq)}")


EN: i am going home
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
TA: நான் வீட்டுக்கு செல்கிறேன்

EN: how are you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
TA: நீ எப்படி இருக்கிறாய்

EN: my name is john
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m 