In [3]:
data = [
      ("hello", "bonjour"),
      ("how are you", "comment ça va"),
      ("I am fine", "je vais bien"),
      ("what is your name", "comment tu t'appelles"),
      ("my name is", "je m'appelle"),
      ("thank you", "merci"),
      ("goodbye", "au revoir")
       ]



In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer for English (input) sentences
english_tokenizer = Tokenizer()
english_sentences = [pair[0] for pair in data]
english_tokenizer.fit_on_texts(english_sentences)
input_sequences = english_tokenizer.texts_to_sequences(english_sentences)
input_sequences = pad_sequences(input_sequences, padding='post')

# Tokenizer for French (target) sentences
french_tokenizer = Tokenizer()
french_sentences = [pair[1] for pair in data]
french_tokenizer.fit_on_texts(french_sentences)
target_sequences = french_tokenizer.texts_to_sequences(french_sentences)
target_sequences = pad_sequences(target_sequences, padding='post')

# Vocabulary sizes
input_vocab_size = len(english_tokenizer.word_index) + 1
target_vocab_size = len(french_tokenizer.word_index) + 1


In [5]:
import numpy as np

decoder_input_data = target_sequences[:, :-1]
decoder_target_data = target_sequences[:, 1:]


In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_vocab_size, 64)(encoder_inputs)
encoder_lstm = LSTM(64, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(target_vocab_size, 64)(decoder_inputs)
decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')


In [7]:
model.fit(
      [input_sequences, decoder_input_data],
       np.expand_dims(decoder_target_data, -1),
       batch_size=16,
       epochs=100
         )



Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 2.6377
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 2.6095
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 2.5864
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 2.5635
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 2.5388
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2.5114
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2.4801
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 2.4440
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2.4023
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 2.3543
Epoch 11/10

<keras.src.callbacks.history.History at 0x7b3e41853c40>

In [8]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(64,))
decoder_state_input_c = Input(shape=(64,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states
  )


In [12]:
def translate_sentence(input_sentence):
      input_seq = english_tokenizer.texts_to_sequences([input_sentence])
      input_seq = pad_sequences(input_seq, maxlen=input_sequences.shape[1], padding='post')
      states_value = encoder_model.predict(input_seq)

      target_seq = np.zeros((1, 1))
      target_seq[0, 0] = french_tokenizer.word_index['<start>']  # Assuming <start> is the start token

      stop_condition = False
      translated_sentence = ""
      while not stop_condition:
         output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

         sampled_token_index = np.argmax(output_tokens[0, -1, :])
         sampled_word = french_tokenizer.index_word.get(sampled_token_index, '')

         if sampled_word == '<end>' or len(translated_sentence.split()) > target_sequences.shape[1]:
            stop_condition = True
         else:
              translated_sentence += ' ' + sampled_word

         target_seq = np.zeros((1, 1))
         target_seq[0, 0] = sampled_token_index
         states_value = [h, c]

         return translated_sentence.strip()
