<a href="https://colab.research.google.com/github/Rahulrama6705/perplexity/blob/main/3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# --- 1. Data Preparation ---
# Toy English-French parallel corpus, stored as (source, translation) pairs so
# each English sentence sits next to its French counterpart.
_parallel_corpus = [
    ("I am a student.", "Je suis un étudiant."),
    ("How are you?", "Comment allez-vous ?"),
    ("This is a book.", "C'est un livre."),
    ("She likes to read.", "Elle aime lire."),
    ("We are learning machine translation.", "Nous apprenons la traduction automatique."),
]

english_sentences = [english for english, _ in _parallel_corpus]
french_sentences = [french for _, french in _parallel_corpus]

# The decoder input is prefixed with <start> and the decoder target is suffixed
# with <end>, so the target is the input shifted left by one token.
french_sentences_input = [f'<start> {french}' for french in french_sentences]
french_sentences_target = [f'{french} <end>' for french in french_sentences]

# Tokenization for English (Encoder Input)
# The backslash in the filter is written as `\\` because `\]` is not a valid
# string escape (newer Python versions emit a warning for it); the resulting
# filter characters are unchanged.
eng_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
eng_tokenizer.fit_on_texts(english_sentences)
eng_input_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
eng_vocab_size = len(eng_tokenizer.word_index) + 1  # +1: index 0 is reserved for padding
eng_max_seq_len = max(len(seq) for seq in eng_input_sequences)

# Tokenization for French (Decoder Input and Target Output)
# Remove '<' and '>' from filters to preserve <start> and <end> tokens
fra_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
# Fit on both lists so <start> and <end> both end up in the vocabulary.
fra_tokenizer.fit_on_texts(french_sentences_input + french_sentences_target)
# Keep the sentences as ordered lists: row i of the decoder input/target must
# correspond to English sentence i. Wrapping them in a set would scramble the
# order (and do so differently for input and target), breaking that alignment.
fra_input_sequences = fra_tokenizer.texts_to_sequences(french_sentences_input)
fra_target_sequences = fra_tokenizer.texts_to_sequences(french_sentences_target)
fra_vocab_size = len(fra_tokenizer.word_index) + 1  # +1: index 0 is reserved for padding
fra_max_seq_len = max(len(seq) for seq in fra_input_sequences + fra_target_sequences)

# Right-pad every sequence with zeros up to the longest length of its language.
encoder_input_data = pad_sequences(
    eng_input_sequences, maxlen=eng_max_seq_len, padding='post'
)
decoder_input_data = pad_sequences(
    fra_input_sequences, maxlen=fra_max_seq_len, padding='post'
)

# Decoder targets as one-hot vectors: (sentence, timestep, vocabulary index).
# Timesteps beyond a sentence's length stay all-zero.
target_shape = (len(french_sentences), fra_max_seq_len, fra_vocab_size)
decoder_target_data = np.zeros(target_shape, dtype='float32')

for sentence_idx, token_ids in enumerate(fra_target_sequences):
    for timestep, token_id in enumerate(token_ids):
        if token_id <= 0:
            # Index 0 is the padding slot; never mark it as a target.
            continue
        decoder_target_data[sentence_idx, timestep, token_id] = 1.

print(f"Encoder Input Shape: {encoder_input_data.shape}")
print(f"Decoder Input Shape: {decoder_input_data.shape}")
print(f"Decoder Target Shape: {decoder_target_data.shape}")

# --- 2. Model Architecture ---
# Size shared by the embeddings and both LSTMs.
latent_dim = 256

# Encoder: embed the source tokens and keep only the LSTM's final states.
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(eng_vocab_size, latent_dim)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm_layer = LSTM(latent_dim, return_state=True)
encoder_lstm, state_h, state_c = encoder_lstm_layer(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits one softmax distribution
# over the French vocabulary per timestep. The layer objects (decoder_lstm,
# decoder_dense) are kept in variables so the inference model can reuse them.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(fra_vocab_size, latent_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_dense = Dense(fra_vocab_size, activation='softmax')
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding,
    initial_state=encoder_states,
)
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (source tokens, shifted target tokens) -> next-token distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# --- 3. Training ---
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print("\nTraining the model (this will be quick due to small dataset)...\n")
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=100,
    # The corpus is tiny, so every epoch is a single full-batch step.
    batch_size=len(english_sentences),
    validation_split=0.0,
)

# --- 4. Inference (Prediction) Model ---
# Encoder model: source tokens -> [hidden state, cell state].
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Decoder model: the LSTM states become explicit inputs so they can be fed
# back in on every decoding step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder layers, now seeded with the supplied states.
decoder_lstm_sequence, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_sequence)

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

# Index -> word tables, the inverse of each tokenizer's word_index mapping.
reverse_eng_word_index = {
    index: word for word, index in eng_tokenizer.word_index.items()
}
reverse_fra_word_index = {
    index: word for word, index in fra_tokenizer.word_index.items()
}

def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence into target-language text.

    Args:
        input_seq: Encoder input handed straight to ``encoder_model.predict``.
            The decoder is fed a ``(1, 1)`` token array, so this is expected
            to hold a single sentence (the caller in this file passes one
            padded row).

    Returns:
        The predicted words joined by single spaces, without the ``<end>``
        marker. Decoding stops at ``<end>`` or after ``fra_max_seq_len + 1``
        steps, whichever comes first.
    """
    # Encode the input sequence to get the internal state vectors.
    # list() keeps the concatenation below valid even if predict() returns a tuple.
    states_value = list(encoder_model.predict(input_seq))

    # Length-1 target sequence holding the token fed to the decoder next.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fra_tokenizer.word_index['<start>']

    decoded_words = []
    # Bound the loop by decoding steps rather than by emitted words: an index
    # with no vocabulary entry (e.g. the padding index 0) maps to '' and would
    # never advance a word count, which could otherwise loop forever.
    for _ in range(fra_max_seq_len + 1):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_fra_word_index.get(sampled_token_index, '')

        if sampled_word == '<end>':
            break
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the sampled token and the updated states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

# --- 5. Test the model ---
print("\n--- Testing the model ---")
seq_index = 0
while seq_index < len(encoder_input_data):
    # Slice (rather than index) so the row keeps its leading batch dimension.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    source_text = english_sentences[seq_index]
    reference_text = french_sentences[seq_index]
    print(f"Input: {source_text}")
    print(f"Actual: {reference_text}")
    print(f"Predicted: {decoded_sentence}")
    print('--')
    seq_index += 1




Encoder Input Shape: (5, 5)
Decoder Input Shape: (5, 6)
Decoder Target Shape: (5, 6, 20)

Training the model (this will be quick due to small dataset)...

Epoch 1/100


  eng_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n')
  fra_tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;=?@[\]^_`{|}~\t\n')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 2.2942
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - loss: 2.2754
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 2.2591
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 2.2426
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 2.2245
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 2.2034
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 2.1778
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 2.1453
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 2.1034
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - loss: 2.0515
Epoch 11/100
[1m1/1[0

'\n### Flow of the Seq2Seq Encoder-Decoder Model for English-to-French Translation:\n\n1.  **Data Preparation:**\n    *   **Corpus:** We start with parallel sentences in English and French. For the French sentences, special `<start>` and `<end>` tokens are added to mark the beginning and end of a sequence, which is crucial for the decoder during training and inference.\n    *   **Tokenization:** `Tokenizer` from Keras is used to convert words into numerical indices. It builds a vocabulary from the provided text.\n        *   `eng_tokenizer` for English sentences.\n        *   `fra_tokenizer` for French sentences (fitted on both input and target French sentences to ensure all potential words are in the vocabulary).\n    *   **Sequence Conversion:** Text sentences are converted into sequences of integer indices.\n    *   **Padding:** `pad_sequences` is applied to make all sequences the same length by adding zeros. This is necessary for batch processing in neural networks.\n    *   **One-