<a href="https://colab.research.google.com/github/SaiChandraDevulapally/2303a51l14_27/blob/main/Assignment-7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense

# Example small dataset: English to French sentence pairs
data = [
    ("hello", "bonjour"),
    ("how are you", "comment ça va"),
    ("I am fine", "je vais bien"),
    ("what is your name", "comment tu t'appelles"),
    ("my name is", "je m'appelle"),
    ("thank you", "merci"),
    ("goodbye", "au revoir")
]

# (a) Data Preprocessing
english_sentences = [pair[0] for pair in data]
french_sentences = ['<start> ' + pair[1] + ' <end>' for pair in data]  # Adding start/end tokens for decoder

# Tokenize English and French sentences
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_vocab_size = len(english_tokenizer.word_index) + 1  # plus 1 for padding

french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_sentences)
french_tokenizer.word_index['<start>'] = len(french_tokenizer.word_index) + 1
french_tokenizer.word_index['<end>'] = len(french_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1  # plus 1 for padding

# Convert sentences to sequences
encoder_input_sequences = english_tokenizer.texts_to_sequences(english_sentences)
decoder_input_sequences = french_tokenizer.texts_to_sequences(french_sentences)

# Padding sequences
max_encoder_seq_length = max(len(seq) for seq in encoder_input_sequences)
max_decoder_seq_length = max(len(seq) for seq in decoder_input_sequences)

encoder_input_sequences = pad_sequences(encoder_input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_sequences = pad_sequences(decoder_input_sequences, maxlen=max_decoder_seq_length, padding='post')

# Prepare decoder output sequences (shifted by one position)
decoder_output_sequences = np.zeros_like(decoder_input_sequences)
decoder_output_sequences[:, :-1] = decoder_input_sequences[:, 1:]

# (b) Build Seq2Seq Model
encoder_inputs = Input(shape=(max_encoder_seq_length,))
encoder_embedding = Embedding(english_vocab_size, 64)(encoder_inputs)
encoder_gru = GRU(128, return_state=True)
encoder_outputs, encoder_state = encoder_gru(encoder_embedding)

decoder_inputs = Input(shape=(max_decoder_seq_length,))
decoder_embedding = Embedding(french_vocab_size, 64)(decoder_inputs)
decoder_gru = GRU(128, return_sequences=True)(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(french_vocab_size, activation='softmax')

decoder_outputs = decoder_dense(decoder_gru)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# (c) Train the Model
model.fit([encoder_input_sequences, decoder_input_sequences],
          np.expand_dims(decoder_output_sequences, -1),
          batch_size=2, validation_split=0.2)

# (d) Inference Setup for Translation
# Encoder model for extracting states
encoder_model = Model(encoder_inputs, encoder_state)

# Decoder model for generating output
decoder_state_input = Input(shape=(128,))
decoder_gru_inf = GRU(128, return_sequences=True, return_state=True)
decoder_outputs, decoder_state = decoder_gru_inf(decoder_embedding, initial_state=decoder_state_input)
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs, decoder_state_input], [decoder_outputs, decoder_state])

# (e) Translate New Sentences
def translate(input_sentence):
    # Encode the input sentence
    input_seq = english_tokenizer.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_encoder_seq_length, padding='post')

    # Get the initial state of the encoder
    states_value = encoder_model.predict(input_seq)

    # Start with the "start token" for the French sentence
    target_seq = np.array([[french_tokenizer.word_index['<start>']]])

    # Generate the output sequence one token at a time
    translated_sentence = ''
    for _ in range(max_decoder_seq_length):
        output_tokens, states_value = decoder_model.predict([target_seq, states_value])
        # Get the token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Handle out-of-vocabulary words
        if sampled_token_index in french_tokenizer.index_word:
            sampled_word = french_tokenizer.index_word[sampled_token_index]
        else:
            sampled_word = "<unk>" # or any placeholder for unknown words

        # Stop if we reach the "end token"
        if sampled_word == '<end>':
            break

        translated_sentence += ' ' + sampled_word
        target_seq = np.array([[sampled_token_index]])

    return translated_sentence.strip()

# (f) Example: Translate New Sentences
input_sentence = "hello"
print("Input:", input_sentence)
print("Translated:", translate(input_sentence))

# Example: Translate another sentence
input_sentence = "thank you"
print("Input:", input_sentence)
print("Translated:", translate(input_sentence))

# (g) Experimenting and Improving the Model
# - Experiment with a larger dataset like the Tatoeba dataset for better generalization.
# - Try hyperparameter tuning: e.g., batch size, learning rate, number of GRU units, etc.
# - Consider adding an attention mechanism or using Transformer models for better performance.

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 375ms/step - accuracy: 0.1750 - loss: 2.8807 - val_accuracy: 0.5000 - val_loss: 2.8402
Input: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Translated: comment je je je je
Input: thank you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/