<a href="https://colab.research.google.com/github/Srivatsav515/NLP_3/blob/main/Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Toy English -> French parallel corpus used for the whole notebook.
# Written as an ordered mapping (English phrase -> French phrase) and then
# flattened into a list of (english, french) tuples; English phrases must
# therefore stay unique.
data = list(
    {
        "good morning": "bonjour",
        "good night": "bonne nuit",
        "see you later": "à plus tard",
        "have a nice day": "bonne journée",
        "thank you very much": "merci beaucoup",
        "excuse me": "excusez-moi",
        "I'm sorry": "je suis désolé",
    }.items()
)


In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Unzip the (english, french) pairs into two parallel tuples of phrases.
english_sentences, french_sentences = zip(*data)

# --- English side: vocabulary, integer ids, right-padded matrix -------------
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_token_ids = eng_tokenizer.texts_to_sequences(english_sentences)
eng_seq = pad_sequences(eng_token_ids, padding='post')
# +1 leaves room for id 0, which pad_sequences uses as the padding value.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)

# --- French side: same pipeline with its own tokenizer ----------------------
# NOTE: the French phrases are tokenized as-is; no start/end-of-sequence
# markers are added to them here.
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(french_sentences)
fr_token_ids = fr_tokenizer.texts_to_sequences(french_sentences)
fr_seq = pad_sequences(fr_token_ids, padding='post')
fr_vocab_size = 1 + len(fr_tokenizer.word_index)


In [14]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Attention

# Encoder
# Variable-length sequence of English token ids -> 64-dim embeddings ->
# bidirectional LSTM (64 units per direction) that returns both the full
# output sequence (used by attention) and the final states of each direction.
# Neither Embedding layer in this cell sets mask_zero, so padded positions
# (id 0) are processed like ordinary tokens.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(eng_vocab_size, 64)(encoder_inputs)
encoder_bi_lstm = Bidirectional(LSTM(64, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bi_lstm(encoder_embedding)
# Forward and backward final states are concatenated (64 + 64 = 128) so they
# can initialise the 128-unit decoder LSTM below.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder
# Variable-length sequence of French token ids -> 64-dim embeddings -> LSTM
# started from the encoder's final states. The layer objects are kept in named
# variables so the same trained weights can be reused by the inference models.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(fr_vocab_size, 64)(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
# The returned decoder states are discarded here; only the output sequence is used.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention
# Keras `Attention` takes [query, value]: each decoder timestep (query)
# attends over all encoder timesteps (value).
attention_layer = Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

# Concatenate Attention and Decoder outputs
# The per-timestep decoder output and its attention context are joined on the
# feature axis, then projected to a softmax over the French vocabulary.
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_outputs])
decoder_dense = Dense(fr_vocab_size, activation='softmax')
# NOTE: `decoder_outputs` is rebound here from the LSTM output sequence to the
# softmax probabilities.
decoder_outputs = decoder_dense(decoder_concat_input)

# Seq2Seq model
# Training model: (english ids, french decoder-input ids) -> per-timestep
# distribution over French tokens. Targets are integer ids, hence the sparse
# categorical cross-entropy loss.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [15]:
# Decoder input/target pairs for next-token prediction: the target at step t
# is the token that follows the decoder input at step t.
# Because fr_seq carries no start marker, the first French word only ever
# appears as an input, never as a target.
decoder_input_data = fr_seq[:, :-1]              # every column except the last
decoder_target_data = fr_seq[:, 1:, np.newaxis]  # every column except the first, plus a trailing axis


In [16]:
# Fit on the 7-pair toy corpus, one sample per gradient step.
# The decoder is fed the reference French tokens (decoder_input_data) and is
# trained to predict the shifted sequence (decoder_target_data).
history = model.fit(
    x=[eng_seq, decoder_input_data],
    y=decoder_target_data,
    epochs=100,
    batch_size=1,
)


Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.3420 - loss: 2.7054
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2726 - loss: 2.6605     
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4071 - loss: 2.6011 
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3946 - loss: 2.5228 
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6249 - loss: 2.2384 
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3082 - loss: 2.1134     
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3082 - loss: 1.9470     
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5624 - loss: 1.3815
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━

In [17]:
# Encoder inference model
# Maps a padded English id sequence to [encoder_outputs, state_h, state_c].
# The states are taken from `encoder_states` rather than from the bare
# `state_h` / `state_c` names, and nothing below rebinds any training-graph
# name, so this cell can be re-run safely (it is idempotent).
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

# Decoder inference model
# Placeholders for everything the decoder needs at each decoding step:
# the previous LSTM states and the encoder output sequence to attend over.
decoder_state_input_h = Input(shape=(decoder_lstm.units,))
decoder_state_input_c = Input(shape=(decoder_lstm.units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Dedicated Input for the encoder outputs, instead of reusing the training
# graph's intermediate tensor as a model input.
decoder_encoder_outputs_input = Input(shape=(None, encoder_outputs.shape[-1]))

# Reuse the trained layers (same weights) on the new inputs. The result names
# are inference-specific so they do not shadow the training tensors.
decoder_inference_seq, decoder_inference_h, decoder_inference_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
attention_inference = attention_layer([decoder_inference_seq, decoder_encoder_outputs_input])
decoder_inference_concat = Concatenate(axis=-1)([decoder_inference_seq, attention_inference])
decoder_inference_probs = decoder_dense(decoder_inference_concat)

# Input order is [token ids, state_h, state_c, encoder outputs];
# output order is [token probabilities, new state_h, new state_c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs + [decoder_encoder_outputs_input],
    [decoder_inference_probs, decoder_inference_h, decoder_inference_c],
)


In [19]:
def translate_sentence(sentence, start_word='bonjour', end_word=None, max_length=None):
    """Greedily decode a French translation of ``sentence``.

    The sentence is encoded once with ``encoder_model``; ``decoder_model`` is
    then called one token at a time, feeding back the most probable token and
    the updated LSTM states.

    Args:
        sentence: English text to translate.
        start_word: French vocabulary word fed to the decoder as its first
            input. The training data in this notebook has no dedicated
            start-of-sequence marker, so an ordinary word is used; the default
            keeps the notebook's original seed. The seed word itself is not
            part of the returned text.
        end_word: Optional vocabulary word that terminates decoding when it is
            predicted (it is not included in the result). ``None`` disables it.
        max_length: Maximum number of words to emit. Defaults to the padded
            French sequence length, ``fr_seq.shape[1]``.

    Returns:
        The decoded words joined by single spaces (possibly an empty string).

    Raises:
        KeyError: if ``start_word`` is not in the French tokenizer vocabulary.
    """
    try:
        start_index = fr_tokenizer.word_index[start_word]
    except KeyError:
        raise KeyError(
            f"start_word {start_word!r} is not in the French vocabulary"
        ) from None

    if max_length is None:
        max_length = fr_seq.shape[1]

    # Encode the input sentence, padded to the length used during training.
    input_seq = pad_sequences(
        eng_tokenizer.texts_to_sequences([sentence]),
        maxlen=eng_seq.shape[1],
        padding='post',
    )
    # verbose=0: avoid one progress bar per predict() call in the output.
    encoder_out, state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    translated_words = []
    # Checking the cap before each step guarantees at most `max_length` words.
    while len(translated_words) < max_length:
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq, state_h, state_c, encoder_out], verbose=0
        )

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 (padding) has no entry in index_word and maps to ''.
        sampled_word = fr_tokenizer.index_word.get(sampled_token_index, '')

        # Stop on padding or on the explicit end marker, without emitting it.
        if sampled_word == '' or sampled_word == end_word:
            break
        translated_words.append(sampled_word)

        # Feed the chosen token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return " ".join(translated_words)

# Quick check on a phrase that appears in the training data
sample_phrase = "thank you very much"
sample_translation = translate_sentence(sample_phrase)
print("Translation of 'thank you very much':", sample_translation)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Translation of 'thank you very much': beaucoup
