<a href="https://colab.research.google.com/github/UppuSushma/NLP-Lab/blob/main/NLP_Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use a simple dataset for English-to-French translation. You can either use a small dataset like this or download a more extensive dataset such as the Tab-delimited Bilingual Sentence Pairs dataset from Tatoeba or Parallel Corpus from the European Parliament.
# Example data (small English to French pairs)

data = [ ("hello", "bonjour"), ("how are you", "comment ça va"), ("I am fine", "je vais bien"), ("what is your name", "comment tu t'appelles"), ("my name is", "je m'appelle"), ("thank you", "merci"), ("goodbye", "au revoir") ]

**1 Data Preprocessing**

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data
data = [("hello", "bonjour"), ("how are you", "comment ça va"), ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"), ("my name is", "je m'appelle"),
        ("thank you", "merci"), ("goodbye", "au revoir")]

# Split English and French sentences
english_texts, french_texts = zip(*data)

# Tokenize English text
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(english_texts)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_max_length = max(len(seq) for seq in eng_sequences)

# Tokenize French text
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_texts)
fr_sequences = fr_tokenizer.texts_to_sequences(french_texts)
fr_vocab_size = len(fr_tokenizer.word_index) + 1
fr_max_length = max(len(seq) for seq in fr_sequences)

# Pad sequences
eng_sequences = pad_sequences(eng_sequences, maxlen=eng_max_length, padding='post')
fr_sequences = pad_sequences(fr_sequences, maxlen=fr_max_length, padding='post')


**2 Build Seq2Seq Model**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder
encoder_inputs = Input(shape=(eng_max_length,))
encoder_embedding = Embedding(eng_vocab_size, 256)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(fr_max_length,))
decoder_embedding = Embedding(fr_vocab_size, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


**3 Preparing the Data for Training**

In [None]:
# Shift decoder input sequences by one timestep to get target sequences
fr_target_sequences = np.expand_dims(fr_sequences, -1)

# Prepare training input and output
X_train = [eng_sequences, fr_sequences]
Y_train = fr_target_sequences


**4 Train the model on the dataset**

In [None]:
# Train the model
model.fit(X_train, Y_train, batch_size=2, epochs=100, validation_split=0.2)


Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 289ms/step - accuracy: 0.1083 - loss: 2.6362 - val_accuracy: 0.5000 - val_loss: 2.5846
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.6250 - loss: 2.5668 - val_accuracy: 0.5000 - val_loss: 2.5251
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.6250 - loss: 2.4958 - val_accuracy: 0.5000 - val_loss: 2.4498
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.7083 - loss: 2.3836 - val_accuracy: 0.5000 - val_loss: 2.3067
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.5250 - loss: 2.1910 - val_accuracy: 0.5000 - val_loss: 2.1050
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.3875 - loss: 2.1277 - val_accuracy: 0.5000 - val_loss: 1.9929
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7b8ea59b2950>

**5 Inference Setup for Translation**

In [None]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


**6 Translate New Sentences**

In [None]:
def translate_sentence(input_text):
    # Tokenize and pad input text
    input_seq = eng_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=eng_max_length, padding='post')

    # Encode input
    states_value = encoder_model.predict(input_seq)

    # Initialize target sequence with start token
    target_seq = np.zeros((1, 1))
    translation = ''

    for _ in range(fr_max_length):
        # Predict the next word
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample token with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = fr_tokenizer.index_word.get(sampled_token_index, None)

        # If end of sentence, break
        if sampled_word == 'end' or sampled_word is None:
            break
        translation += sampled_word + ' '

        # Update target sequence and states
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return translation.strip()


**7 Experimenting and Improving the Model by large dataset and hyper tune parameter.**

**(a)  Load a Larger Dataset**

In [None]:
import pandas as pd

# Assume the dataset is saved in a file named "train.csv" with English and French columns
# Replace with the actual path and column names based on the dataset format.
dataset_path = '/content/train.csv'
# Specify the encoding as 'latin-1' to handle the problematic characters
data = pd.read_csv(dataset_path, delimiter='\t', names=['english', 'french'], encoding='latin-1')

# Take a sample of the dataset (e.g., 10,000 pairs for faster tuning)
data = data.sample(10000, random_state=1).reset_index(drop=True)

english_texts = data['english'].values
french_texts = data['french'].values

# Tokenize, pad, and prepare sequences as done with the smaller dataset

**(b)Define a HyperModel with Keras Tuner**

In [None]:
!pip install keras-tuner -q
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    # Hyperparameters to tune
    embedding_dim = hp.Int('embedding_dim', min_value=128, max_value=512, step=64)
    lstm_units = hp.Int('lstm_units', min_value=128, max_value=512, step=64)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    # Encoder
    encoder_inputs = Input(shape=(eng_max_length,))
    encoder_embedding = Embedding(eng_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(fr_max_length,))
    decoder_embedding = Embedding(fr_vocab_size, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(fr_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model compilation
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

**(c) Initialize and Run the Hyperparameter Tuning**

In [None]:
# Initialize the tuner
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of different hyperparameter combinations to try
    executions_per_trial=1,  # Number of times to train the model with each configuration
    directory='tuner_results',
    project_name='english_to_french_translation'
)

# Prepare the data for the tuner
fr_target_sequences = np.expand_dims(fr_sequences, -1)
X_train = [eng_sequences, fr_sequences]
Y_train = fr_target_sequences

# Run the hyperparameter search
tuner.search(X_train, Y_train, epochs=50, batch_size=32, validation_split=0.2)


Trial 10 Complete [00h 00m 13s]
val_accuracy: 0.5

Best val_accuracy So Far: 0.6666666865348816
Total elapsed time: 00h 02m 17s


**(d) Get the Best Model and Evaluate**

In [None]:
# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate or save the best model
best_model.evaluate(X_train, Y_train)
best_model.save('best_translation_model.h5')


  saveable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 837ms/step - accuracy: 0.4286 - loss: 2.6316




**(e) Additional Improvements: Fine-Tuning and Regularization**


In [21]:
from tensorflow.keras.layers import Dropout

def build_model_with_dropout(hp):
    embedding_dim = hp.Int('embedding_dim', min_value=128, max_value=512, step=64)
    lstm_units = hp.Int('lstm_units', min_value=128, max_value=512, step=64)
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)

    encoder_inputs = Input(shape=(eng_max_length,))
    encoder_embedding = Embedding(eng_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(fr_max_length,))
    decoder_embedding = Embedding(fr_vocab_size, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_outputs = Dropout(dropout_rate)(decoder_outputs)
    decoder_dense = Dense(fr_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model
