<a href="https://colab.research.google.com/github/Sri-Pooja00/2203A51341_NLP/blob/main/2203A51341_07_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Small dataset of English-French sentence pairs
data = [
    ("hello", "bonjour"),
    ("how are you", "comment ça va"),
    ("I am fine", "je vais bien"),
    ("what is your name", "comment tu t'appelles"),
    ("my name is", "je m'appelle"),
    ("thank you", "merci"),
    ("goodbye", "au revoir")
]

# Separate English and French sentences
english_sentences, french_sentences = zip(*data)

# Tokenization
eng_tokenizer = Tokenizer()
fr_tokenizer = Tokenizer()

eng_tokenizer.fit_on_texts(english_sentences)
fr_tokenizer.fit_on_texts(french_sentences)

# Convert to sequences
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
fr_sequences = fr_tokenizer.texts_to_sequences(french_sentences)

# Padding
max_eng_len = max(len(seq) for seq in eng_sequences)
max_fr_len = max(len(seq) for seq in fr_sequences)

eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')


In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Define the LSTM encoder
latent_dim = 256  # Number of units in the LSTM layer
vocab_size = len(eng_tokenizer.word_index) + 1 # Get vocabulary size
embedding_dim = 128 # Define embedding dimension


encoder_inputs = Input(shape=(None,))
# Add Embedding layer to handle variable-length input and map words to vectors
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(None,))
# Add Embedding layer for the decoder
decoder_embedding = Embedding(input_dim=len(fr_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(len(fr_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [5]:
# Preparing the target data by shifting it for teacher forcing
target_data = np.zeros((len(french_sentences), max_fr_len, len(fr_tokenizer.word_index) + 1), dtype="float32")

for i, sequence in enumerate(fr_sequences):
    for t, word in enumerate(sequence):
        if t > 0:
            target_data[i, t - 1, word] = 1


In [6]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit([eng_padded, fr_padded], target_data, batch_size=64, epochs=100, validation_split=0.2)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.2000 - loss: 1.2323 - val_accuracy: 0.0000e+00 - val_loss: 0.4403
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 0.1333 - loss: 1.2238 - val_accuracy: 0.0000e+00 - val_loss: 0.4419
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.2667 - loss: 1.2165 - val_accuracy: 0.0000e+00 - val_loss: 0.4435
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.2667 - loss: 1.2094 - val_accuracy: 0.0000e+00 - val_loss: 0.4453
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - accuracy: 0.3333 - loss: 1.2021 - val_accuracy: 0.0000e+00 - val_loss: 0.4473
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.3333 - loss: 1.1941 - val_accuracy: 0.0000e+00 - val_loss: 0.4497
Epoch 7/100


In [7]:
# Define inference models
encoder_model = Model(encoder_inputs, [state_h, state_c])

# Decoder setup
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [8]:
def translate_sentence(input_seq):
    # Encode the input sequence to get the states
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first token of target sequence with the start token
    target_seq[0, 0] = fr_tokenizer.word_index['<start>']

    translated_sentence = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = fr_tokenizer.index_word[sampled_token_index]
        translated_sentence.append(sampled_word)

        # Exit condition: either hit max length or find stop token.
        if sampled_word == '<end>' or len(translated_sentence) > max_fr_len:
            break

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return ' '.join(translated_sentence)


In [20]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to preprocess the input sentence
def preprocess_sentence(sentence, tokenizer, max_len):
    sentence_seq = tokenizer.texts_to_sequences([sentence])
    padded_sentence = pad_sequences(sentence_seq, maxlen=max_len, padding='post')
    return padded_sentence

# Function to translate an English sentence to French
def translate_sentence(sentence):
    # Preprocess input sentence
    input_seq = preprocess_sentence(sentence, eng_tokenizer, max_eng_len)

    # Encode the input sentence to get initial states
    states_value = encoder_model.predict(input_seq)

    # Initialize the target sequence with the start token
    start_token_index = fr_tokenizer.word_index.get('<start>', 1)  # 1 if '<start>' not found
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token_index

    # Store the translation result
    translated_sentence = []

    # Generate translation word by word
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the index of the most likely word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = fr_tokenizer.index_word.get(sampled_token_index, '')

        # Exit if end token is found or max length is reached
        if sampled_word == '<end>' or len(translated_sentence) > max_fr_len:
            break

        translated_sentence.append(sampled_word)

        # Update the target sequence and states for the next iteration
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    # Join the list of words into a single string
    return ' '.join(translated_sentence)

# Taking input from the user
english_sentence = input("Enter an English sentence to translate: ")
french_translation = translate_sentence(english_sentence)

print("English:", english_sentence)
print("French:", french_translation)


Enter an English sentence to translate: What are you doing?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
English: What are you doing?
French: ça va va bien
