<a href="https://colab.research.google.com/github/QaziSaim/Fine-Tune-Projects/blob/main/English_to_French_Encoder_Decoder_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

In [2]:
eng_texts, fra_texts = [], []

with open("/content/fra.txt", "r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

for line in lines:
    parts = line.split("\t")
    if len(parts) >= 2:   # ensure at least English + French
        eng = parts[0].strip()
        fra = parts[1].strip()
        eng_texts.append(eng)
        fra_texts.append(fra)


In [3]:
import re, string

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    return text.strip()

eng_texts = [clean_text(t) for t in eng_texts]
fra_texts = ["<start> " + clean_text(t) + " <end>" for t in fra_texts]


In [4]:
eng_tokenizer  = Tokenizer()
eng_tokenizer.fit_on_texts(eng_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(eng_texts)


In [7]:
fra_tokenizer = Tokenizer()
fra_tokenizer.fit_on_texts(fra_texts)
fra_sequences = fra_tokenizer.texts_to_sequences(fra_texts)

In [8]:
# Vocabulary sizes
eng_vocab_size = len(eng_tokenizer.word_index) + 1
fra_vocab_size = len(fra_tokenizer.word_index) + 1


In [9]:
# Sequence lengths
max_eng_len = max(len(seq) for seq in eng_sequences)
max_fra_len = max(len(seq) for seq in fra_sequences)


In [10]:
# Pad sequences
encoder_input_data = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
decoder_input_data = pad_sequences(fra_sequences, maxlen=max_fra_len, padding='post')



In [12]:
decoder_target_data = np.zeros((len(fra_sequences), max_fra_len, fra_vocab_size), dtype="float32")

In [13]:
for i, seq in enumerate(fra_sequences):
    for t, word_id in enumerate(seq[1:]):  # skip <start>
        decoder_target_data[i, t, word_id] = 1.0

# Train-Test split
enc_train, enc_val, dec_in_train, dec_in_val, dec_tar_train, dec_tar_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1
)


In [16]:
dec_in_val[10]

array([  1, 152,  56, 921,   2,   0,   0,   0,   0,   0,   0,   0],
      dtype=int32)

In [19]:
# -------------------
# 4. Encoder-Decoder Model
# -------------------
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(eng_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_fra_len,))
dec_emb_layer = Embedding(fra_vocab_size, latent_dim, mask_zero=True)   # <-- define layer only once
dec_emb = dec_emb_layer(decoder_inputs)                                # <-- use same layer
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(fra_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full training model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [20]:
# -------------------
# 5. Training
# -------------------
model.fit(
    [enc_train, dec_in_train], dec_tar_train,
    batch_size=64,
    epochs=100,
    validation_data=([enc_val, dec_in_val], dec_tar_val)
)


Epoch 1/100
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 42ms/step - accuracy: 0.0858 - loss: 5.2104 - val_accuracy: 0.1072 - val_loss: 3.8604
Epoch 2/100
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.1102 - loss: 3.6582 - val_accuracy: 0.1232 - val_loss: 3.5331
Epoch 3/100
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.1287 - loss: 3.2542 - val_accuracy: 0.1419 - val_loss: 3.2704
Epoch 4/100
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.1469 - loss: 2.9107 - val_accuracy: 0.1520 - val_loss: 3.0798
Epoch 5/100
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.1584 - loss: 2.6307 - val_accuracy: 0.1636 - val_loss: 2.9361
Epoch 6/100
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.1692 - loss: 2.4119 - val_accuracy: 0.1708 - val_loss: 2.8354
Epoch 7/100
[1

<keras.src.callbacks.history.History at 0x79df40967260>

In [21]:
model.summary()

In [23]:
model.save('encoder_translation.keras')

In [26]:
import pickle
with open('eng_tokenizer.pkl','wb') as handler:
  pickle.dump(eng_tokenizer,handler)

with open('fra_tokenizer.pkl','wb') as handler:
  pickle.dump(fra_tokenizer,handler)


In [28]:
# Encoder inference model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# dec_emb2 = dec_emb(decoder_inputs)  # reuse embedding layer
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb, initial_state=decoder_states_inputs
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


In [29]:
reverse_eng_index = {i: word for word, i in eng_tokenizer.word_index.items()}
reverse_fra_index = {i: word for word, i in fra_tokenizer.word_index.items()}
fra_word_index = fra_tokenizer.word_index


In [35]:
start_token = fra_tokenizer.word_index.get("<start>")
end_token = fra_tokenizer.word_index.get("<end>")

def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    decoded_sentence = []
    stop_condition = False

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_fra_index.get(sampled_token_index, '')

        if sampled_token_index == end_token or len(decoded_sentence) > max_fra_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return " ".join(decoded_sentence)


In [37]:
def predict_translation(sentence):
    # Clean and tokenize
    seq = eng_tokenizer.texts_to_sequences([sentence.lower()])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')

    # Decode
    return decode_sequence(seq)

# Example test
print("English: Go.")
print("Predicted French:", predict_translation("I am Angry."))

print("English: I am hungry.")
print("Predicted French:", predict_translation("I am hungry."))


English: Go.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 