#### A Seq2Seq (Sequence-to-Sequence) model using LSTM networks for English-to-French translation. 

In [1]:
# Import Required Libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import numpy as np

2025-02-02 09:31:47.643083: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-02 09:31:52.937573: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-02 09:31:56.103418: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738468918.425242    4920 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738468918.740551    4920 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-02 09:32:02.338805: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
# Sample English-French Sentences
english_sentences = ['hello', 'how are you', 'good morning']
french_sentences = ['bonjour', 'comment ça va', 'bonjour']

In [3]:
# Tokenization and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [4]:
# Function to Tokenize and Pad Sequences
def tokenize_and_pad(sentences, num_words=1000, max_len=10):
    tokenizer = Tokenizer(num_words=num_words, filters='')
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')
    return tokenizer, padded


In [5]:
# Applying Tokenization

eng_tokenizer, eng_input_data = tokenize_and_pad(english_sentences)
french_tokenizer, french_target_data = tokenize_and_pad(french_sentences)

In [6]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1
max_len = 10  # Maximum sequence length


In [7]:
# Building the Encoder
def build_encoder(vocab_size, embedding_dim=256, lstm_units=256):
    encoder_inputs = Input(shape=(max_len,))
    enc_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
    encoder_lstm = LSTM(lstm_units, return_state=True)
    _, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c]
    return encoder_inputs, encoder_states


In [8]:
# Create Encoder
encoder_inputs, encoder_states = build_encoder(eng_vocab_size)

2025-02-02 09:33:30.468375: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [9]:
# Building the Decoder
def build_decoder(vocab_size, embedding_dim=256, lstm_units=256):
    decoder_inputs = Input(shape=(max_len,))
    dec_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    return decoder_inputs, decoder_outputs


In [10]:
# Create Decoder
decoder_inputs, decoder_outputs = build_decoder(french_vocab_size)


In [11]:
# Creating the Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [12]:
# Preparing Training Data
y_train = np.expand_dims(french_target_data, axis=-1)


In [13]:
# Training the Model
model.fit([eng_input_data, french_target_data], y_train, batch_size=32, epochs=10)


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15s/step - accuracy: 0.0333 - loss: 1.6100
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - accuracy: 0.1667 - loss: 1.5732
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - accuracy: 0.1667 - loss: 1.5358
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.1667 - loss: 1.4964
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.1667 - loss: 1.4535
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - accuracy: 0.1667 - loss: 1.4060
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - accuracy: 0.1667 - loss: 1.3525
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.1667 - loss: 1.2920
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

<keras.src.callbacks.history.History at 0x7f0d8c3a3b20>

In [18]:
# Inferencing for translation
def translate_sentence(sentence):
    sequence = eng_tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    states_value = model.layers[2].predict(padded_sequence)

    target_seq = np.zeros((1, max_len))
    target_seq[0, 0] = french_tokenizer.word_index['start'] if 'start' in french_tokenizer.word_index else 1

    decoded_sentence = ''
    for _ in range(max_len):
        output_tokens, h, c = model.layers[3].predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = french_tokenizer.index_word.get(sampled_token_index, '')
        if sampled_word == 'end':
            break
        decoded_sentence += ' ' + sampled_word
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

In [19]:
print(translate_sentence('hello'))

AttributeError: 'Embedding' object has no attribute 'predict'

In [20]:
try:
    import tensorflow as tf
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
    import numpy as np
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
except ModuleNotFoundError:
    print("TensorFlow is not installed. Running in a non-TensorFlow environment.")
    tf = None

# Sample Data (English-French pairs)
english_sentences = ['hello', 'how are you', 'good morning']
french_sentences = ['bonjour', 'comment ça va', 'bonjour']

def tokenize_and_pad(sentences, num_words=1000, max_len=10):
    tokenizer = Tokenizer(num_words=num_words, filters='')
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')
    return tokenizer, padded

# Tokenize English and French
eng_tokenizer, eng_input_data = tokenize_and_pad(english_sentences)
french_tokenizer, french_target_data = tokenize_and_pad(french_sentences)

eng_vocab_size = len(eng_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1
max_len = 10  # Maximum sequence length

if tf:
    def build_encoder(vocab_size, embedding_dim=256, lstm_units=256):
        encoder_inputs = Input(shape=(max_len,))
        enc_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
        encoder_lstm = LSTM(lstm_units, return_state=True)
        encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
        encoder_states = [state_h, state_c]
        encoder_model = Model(encoder_inputs, encoder_states)
        return encoder_inputs, encoder_states, encoder_model

    encoder_inputs, encoder_states, encoder_model = build_encoder(eng_vocab_size)

    def build_decoder(vocab_size, embedding_dim=256, lstm_units=256):
        decoder_inputs = Input(shape=(max_len,))
        dec_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
        decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
        decoder_outputs, state_h, state_c = decoder_lstm(dec_emb, initial_state=encoder_states)
        decoder_dense = Dense(vocab_size, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)
        decoder_model = Model([decoder_inputs] + encoder_states, [decoder_outputs, state_h, state_c])
        return decoder_inputs, decoder_outputs, decoder_model

    decoder_inputs, decoder_outputs, decoder_model = build_decoder(french_vocab_size)

    try:
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        y_train = np.expand_dims(french_target_data, axis=-1)
        model.fit([eng_input_data, french_target_data], y_train, batch_size=32, epochs=10)
    except Exception as e:
        print(f"Error occurred during model compilation or training: {e}")

    def translate_sentence(sentence):
        sequence = eng_tokenizer.texts_to_sequences([sentence])
        padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
        states_value = encoder_model.predict(padded_sequence)

        target_seq = np.zeros((1, max_len))
        target_seq[0, 0] = french_tokenizer.word_index['start'] if 'start' in french_tokenizer.word_index else 1

        decoded_sentence = ''
        for _ in range(max_len):
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = french_tokenizer.index_word.get(sampled_token_index, '')
            if sampled_word == 'end':
                break
            decoded_sentence += ' ' + sampled_word
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

        return decoded_sentence.strip()

    # Example usage
    print(translate_sentence('hello'))
else:
    print("Skipping model creation and training due to missing TensorFlow.")


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14s/step - accuracy: 0.1000 - loss: 1.6016
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step - accuracy: 0.1667 - loss: 1.5656
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step - accuracy: 0.1667 - loss: 1.5289
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - accuracy: 0.1667 - loss: 1.4901
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 229ms/step - accuracy: 0.1667 - loss: 1.4480
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step - accuracy: 0.1667 - loss: 1.4016
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step - accuracy: 0.1667 - loss: 1.3497
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step - accuracy: 0.1667 - loss: 1.2914
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
bonjour bonjour bonjour bonjour bonjour bonjour bonjour bonjour bonjour bonjour
