In [5]:
!unzip -q /content/archive.zip -d /content/dataset
!ls /content/dataset

Dataset


In [16]:
import os
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Configuration ---
BATCH_SIZE = 64
EPOCHS = 20  # Start with a small number like 20-50
LATENT_DIM = 256  # Dimensionality of the LSTM state
NUM_SAMPLES = 20000  # Number of samples to train on (for speed)

def create_dummy_files():
    """
    Creates dummy corpus files for demonstration.
    You can comment out or delete this function when using your real files.
    """
    print("Creating dummy files (english-corpus.txt, urdu-corpus.txt)...")

    english_content = [
        "Hello world.", "How are you?", "This is a parallel corpus.",
        "Python is a great language.", "I love machine learning.", "Run.",
        "Where is the book?", "This is my house.", "What is your name?",
        "See you later."
    ]

    urdu_content = [
        "ہیلو دنیا۔", "آپ کیسے ہیں؟", "یہ ایک متوازی کارپس ہے۔",
        "پائتھون ایک بہترین زبان ہے۔", "مجھے مشین لرننگ سے محبت ہے۔", "بھاگو۔",
        "کتاب کہاں ہے؟", "یہ میرا گھر ہے۔", "آپ کا نام کیا ہے؟",
        "پھر ملیں گے."
    ]

    try:
        with open("english-corpus.txt", "w", encoding="utf-8") as f:
            for line in english_content: f.write(line + "\n")
        with open("urdu-corpus.txt", "w", encoding="utf-8") as f:
            for line in urdu_content: f.write(line + "\n")
        print("Dummy files created successfully.\n")
    except IOError as e:
        print(f"Error creating dummy files: {e}")

def preprocess_sentence(sentence):
    """
    Cleans and standardizes a single sentence.
    """
    sentence = sentence.lower().strip()
    # Remove punctuation
    sentence = re.sub(f"[{re.escape(string.punctuation)}]", "", sentence)
    # Add start and end tokens
    sentence = f"[START] {sentence} [END]"
    return sentence

def load_data(english_filepath, urdu_filepath, num_samples):
    """
    Loads and preprocesses the data from text files.
    """
    input_texts = []
    target_texts = []

    try:
        with open(english_filepath, "r", encoding="utf-8") as f_eng, \
             open(urdu_filepath, "r", encoding="utf-8") as f_urd:

            for eng_line, urd_line in zip(f_eng, f_urd):
                if len(input_texts) >= num_samples:
                    break

                eng_line = eng_line.strip()
                urd_line = urd_line.strip()

                if eng_line and urd_line:
                    input_texts.append(preprocess_sentence(eng_line))
                    target_texts.append(preprocess_sentence(urd_line))

    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please make sure both files exist.")
        return None, None

    print(f"Loaded {len(input_texts)} sentence pairs.")
    return input_texts, target_texts

def build_tokenizers(input_texts, target_texts):
    """
    Creates and fits Keras Tokenizers for both languages.
    """
    # Tokenizer for English (input)
    input_tokenizer = Tokenizer(filters='')
    input_tokenizer.fit_on_texts(input_texts)

    # Tokenizer for Urdu (target)
    target_tokenizer = Tokenizer(filters='')
    target_tokenizer.fit_on_texts(target_texts)

    return input_tokenizer, target_tokenizer

def prepare_sequences(input_texts, target_texts, input_tokenizer, target_tokenizer):
    """
    Converts text sentences to padded integer sequences.
    """
    # Convert text to integer sequences
    input_sequences = input_tokenizer.texts_to_sequences(input_texts)
    target_sequences = target_tokenizer.texts_to_sequences(target_texts)

    # Get max sequence lengths
    max_encoder_seq_length = max(len(s) for s in input_sequences)
    max_decoder_seq_length = max(len(s) for s in target_sequences)

    print(f"Max encoder sequence length: {max_encoder_seq_length}")
    print(f"Max decoder sequence length: {max_decoder_seq_length}")

    # Pad sequences
    encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
    decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

    return encoder_input_data, decoder_input_data, max_encoder_seq_length, max_decoder_seq_length

def build_model(input_vocab_size, target_vocab_size, max_encoder_seq_length, max_decoder_seq_length, latent_dim):
    """
    Builds the Seq2Seq Encoder-Decoder model.
    """
    # --- ENCODER ---
    encoder_inputs = Input(shape=(max_encoder_seq_length,), name="encoder_input")
    # Embedding layer
    enc_emb = Embedding(input_vocab_size, latent_dim, name="encoder_embedding")(encoder_inputs)
    # LSTM layer
    # We discard encoder outputs and only keep the final states (h and c).
    encoder_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
    _, state_h, state_c = encoder_lstm(enc_emb)
    # These states will be used as the initial state for the decoder.
    encoder_states = [state_h, state_c]

    # --- DECODER ---
    decoder_inputs = Input(shape=(max_decoder_seq_length,), name="decoder_input")
    # Embedding layer
    # Note: The decoder embedding size must match the encoder's latent_dim if you share them,
    # but here we use the same latent_dim for simplicity.
    dec_emb_layer = Embedding(target_vocab_size, latent_dim, name="decoder_embedding")
    dec_emb = dec_emb_layer(decoder_inputs)

    # Decoder LSTM
    # We set this LSTM to return the full sequence of outputs.
    # It uses the encoder's states as its initial state.
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

    # Output layer
    # A Dense layer with a softmax activation to get a probability distribution
    # over the target vocabulary for each time step.
    decoder_dense = Dense(target_vocab_size, activation="softmax", name="decoder_output")
    decoder_outputs = decoder_dense(decoder_outputs)

    # --- MODEL ---
    # The model that turns `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # Compile the model
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model

# --- Main execution ---
if __name__ == "__main__":

    # 1. Create dummy files (comment out if using your real files)
    create_dummy_files()

    # 2. Define filepaths
    english_file = "/content/dataset/Dataset/english-corpus.txt"
    urdu_file = "/content/dataset/Dataset/urdu-corpus.txt"

    # 3. Load and preprocess data
    input_texts, target_texts = load_data(english_file, urdu_file, NUM_SAMPLES)

    if input_texts:
        # 4. Build tokenizers
        input_tokenizer, target_tokenizer = build_tokenizers(input_texts, target_texts)

        # 5. Prepare sequences
        encoder_input_data, decoder_input_data, max_enc_len, max_dec_len = prepare_sequences(
            input_texts, target_texts, input_tokenizer, target_tokenizer
        )

        # 6. Prepare decoder target data
        # This is the "teacher forcing" part. The decoder_target_data is one
        # time-step ahead of the decoder_input_data.
        # e.g., input = "[START] how are you"
        #       target = "how are you [END]"
        decoder_target_data = np.zeros_like(decoder_input_data)
        for i, seq in enumerate(decoder_input_data):
            decoder_target_data[i, :-1] = seq[1:]
        # Add a third dimension for the loss function
        decoder_target_data = np.expand_dims(decoder_target_data, -1)

        # Get vocabulary sizes (add 1 for the 0-padding)
        input_vocab_size = len(input_tokenizer.word_index) + 1
        target_vocab_size = len(target_tokenizer.word_index) + 1

        print(f"Input vocabulary size: {input_vocab_size}")
        print(f"Target vocabulary size: {target_vocab_size}")

        # 7. Build the model
        model = build_model(
            input_vocab_size, target_vocab_size, max_enc_len, max_dec_len, LATENT_DIM
        )

        model.summary()

        # 8. Train the model
        print("\n--- Starting Model Training ---")
        model.fit(
            [encoder_input_data, decoder_input_data],
            decoder_target_data,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_split=0.2,
        )
        print("--- Model Training Complete ---")

        # Note: Saving the model and creating a separate "inference" model
        # (to translate new sentences) is the next step.
        # This script only covers the training.



Creating dummy files (english-corpus.txt, urdu-corpus.txt)...
Dummy files created successfully.

Loaded 20000 sentence pairs.
Max encoder sequence length: 16
Max decoder sequence length: 21
Input vocabulary size: 5146
Target vocabulary size: 5398



--- Starting Model Training ---
Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 1s/step - accuracy: 0.7160 - loss: 2.6904 - val_accuracy: 0.7776 - val_loss: 1.3849
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m314s[0m 1s/step - accuracy: 0.7798 - loss: 1.3301 - val_accuracy: 0.7883 - val_loss: 1.2985
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 1s/step - accuracy: 0.7933 - loss: 1.2401 - val_accuracy: 0.7996 - val_loss: 1.2190
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 1s/step - accuracy: 0.8077 - loss: 1.1407 - val_accuracy: 0.8125 - val_loss: 1.1375
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 1s/step - accuracy: 0.8218 - loss: 1.0455 - val_accuracy: 0.8242 - val_loss: 1.0587
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 1s/step - accuracy: 0.8345 - loss: 0.9480 - val_accuracy: 0.8314 - val_lo

In [20]:
# =================================================================
#
#                         INFERENCE SETUP
#             (Run this in a new cell after training)
#
# =================================================================

# --- 1. Re-build the Encoder Model ---
# This model takes the English sentence and outputs the LSTM states (the "thought vector")
encoder_inputs = model.input[0]  # This is the encoder_inputs from training
encoder_outputs, state_h_enc, state_c_enc = model.get_layer('encoder_lstm').output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# --- 2. Re-build the Decoder Model ---
# This model takes the [START] token (and subsequent tokens) + the encoder's states
# and outputs the next predicted word + its own new states.

# Define new Input layers for the decoder's states
decoder_state_input_h = Input(shape=(LATENT_DIM,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(LATENT_DIM,), name='decoder_state_input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Input for a single token (at each step of the loop)
decoder_input_single = Input(shape=(1,), name='decoder_input_single')

# Get the trained layers
dec_emb_layer = model.get_layer('decoder_embedding')
decoder_lstm = model.get_layer('decoder_lstm')
decoder_dense = model.get_layer('decoder_output')

# Wire the layers for inference
dec_emb_single = dec_emb_layer(decoder_input_single)
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    dec_emb_single, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_outputs = decoder_dense(decoder_outputs)

# The final decoder model
decoder_model = Model(
    [decoder_input_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

print("Inference models (encoder/decoder) built successfully.")
# decoder_model.summary() # You can uncomment this if you want to see the summary again

# --- 3. Create Reverse Token Lookups ---
# We need to map token indices (like '5') back to words (like 'hello')
reverse_input_word_index = {i: word for word, i in input_tokenizer.word_index.items()}
reverse_target_word_index = {i: word for word, i in target_tokenizer.word_index.items()}

# Get the special [START] and [END] token indices for the target language (Urdu)
start_token_index = target_tokenizer.word_index['[start]']
end_token_index = target_tokenizer.word_index['[end]']


# --- 4. Define the Translation Function ---

def translate_sentence(input_sentence):
    """
    Translates a single English sentence to Urdu using the inference models.
    """
    # 1. Preprocess the input sentence
    # Note: The preprocess_sentence function must be available from the training cell
    clean_sentence = preprocess_sentence(input_sentence)

    # 2. Convert to sequence and pad
    input_seq = input_tokenizer.texts_to_sequences([clean_sentence])
    input_seq_padded = pad_sequences(input_seq, maxlen=max_enc_len, padding='post')

    # 3. Get the "thought vector" (initial states) from the encoder
    states_value = encoder_model.predict(input_seq_padded, verbose=0)

    # 4. Start the decoding loop
    # Initialize the loop with the [START] token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token_index

    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        # 5. Predict the next word
        # Pass the current token and the current states to the decoder
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # 6. Get the most likely word (token ID)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # 7. Convert the ID to a word
        # Use .get() for safety in case the index is 0 (padding)
        sampled_word = reverse_target_word_index.get(sampled_token_index, '[UNK]')

        # 8. Check for stop condition
        if (sampled_word == '[end]' or
            len(decoded_sentence) > max_dec_len):
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        # 9. Update for the next loop
        # The next input token is the one we just predicted
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        # The next states are the ones we just got from the decoder
        states_value = [h, c]

    # 10. Return the final sentence
    return " ".join(decoded_sentence)

# --- 5. Test the Translator ---

print("\n--- Testing Translations ---")

test_sentences = [
    # --- Sentences from your training data ---
    "How are you?",
    "Run.",
    "This is my house.",
    "I love machine learning.",
    "Where is the book?",
    "Hello world.",
    "What is your name?",
    "Python is a great language.",
    "See you later.",
    "This is a parallel corpus.",

    # --- New simple sentences (high chance of working) ---
    "This is a book.",
    "You are great.",
    "What is this?",

    # --- Sentences with new words (will test the [UNK] token) ---
    "I am learning Python.",
    "She is a good person.",
    "Where is the car?",
    "He likes to read."
]

# --- 5. Test the Translator ---
print("\n--- Testing Translations ---")

for sentence in test_sentences:
    translation = translate_sentence(sentence)
    print(f"Input:       {sentence}")
    print(f"Translation: {translation}\n")

Inference models (encoder/decoder) built successfully.

--- Testing Translations ---

--- Testing Translations ---
Input:       How are you?
Translation: آپ کیسا ہیں

Input:       Run.
Translation: زان تخیل کی

Input:       This is my house.
Translation: یہ میرا گھر ہے

Input:       I love machine learning.
Translation: میں کتوں سے ڈرتا ہوں

Input:       Where is the book?
Translation: کتاب کہاں ہے

Input:       Hello world.
Translation: خدا نے ناشتہ بنایا

Input:       What is your name?
Translation: آپ کا نام کیا ہے

Input:       Python is a great language.
Translation: یہ بہت اچھا ہے

Input:       See you later.
Translation: بعد میں آپ کو فون کریں

Input:       This is a parallel corpus.
Translation: یہ ایک کار ہے

Input:       This is a book.
Translation: یہ ایک کتاب ہے

Input:       You are great.
Translation: آپ بہت اچھا ہیں

Input:       What is this?
Translation: یہ کیا ہے؟

Input:       I am learning Python.
Translation: میں کوریائی ہوں

Input:       She is a good person.
Tran