In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

2025-07-08 20:06:12.715053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752005172.985070      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752005173.059954      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
data = pd.read_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv')

data = data[['en', 'fr']].iloc[:100]

In [4]:
data.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [5]:
# Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = text.strip()
    return text

# Correct the column names here
data['en'] = data['en'].apply(preprocess_text)
data['fr'] = data['fr'].apply(preprocess_text)

In [41]:
# Prepare inputs and outputs
eng_texts = data['en'].tolist()
fr_texts_in = ['<start> ' + text for text in data['fr'].tolist()]   # decoder input
fr_texts_out = [text + ' <end>' for text in data['fr'].tolist()]    # decoder output

# Tokenizers
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(eng_texts)
eng_vocab_size = len(eng_tokenizer.word_index) + 1

fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(fr_texts_in + fr_texts_out)
fr_vocab_size = len(fr_tokenizer.word_index) + 1

In [26]:
# Convert to sequences
encoder_input_seq = eng_tokenizer.texts_to_sequences(eng_texts)
decoder_input_seq = fr_tokenizer.texts_to_sequences(fr_texts_in)
decoder_output_seq = fr_tokenizer.texts_to_sequences(fr_texts_out)

# Padding
max_encoder_seq_length = max([len(seq) for seq in encoder_input_seq])
max_decoder_seq_length = max([len(seq) for seq in decoder_input_seq])

encoder_input_seq = pad_sequences(encoder_input_seq, maxlen=max_encoder_seq_length, padding='post')
decoder_input_seq = pad_sequences(decoder_input_seq, maxlen=max_decoder_seq_length, padding='post')
decoder_output_seq = pad_sequences(decoder_output_seq, maxlen=max_decoder_seq_length, padding='post')

# Decoder output needs to be one-hot encoded
decoder_output_data = tf.keras.utils.to_categorical(decoder_output_seq, num_classes=fr_vocab_size)


In [27]:
# Build Encoder-Decoder Model

latent_dim = 256  # Size of LSTM hidden states

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(fr_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [28]:
# Final Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
# Train the Model

history = model.fit(
    [encoder_input_seq, decoder_input_seq],
    decoder_output_data,
    batch_size=64,
    epochs=30,
    validation_split=0.2
)

Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - accuracy: 0.0907 - loss: 6.4379 - val_accuracy: 0.7762 - val_loss: 5.6347
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 359ms/step - accuracy: 0.7199 - loss: 5.5691 - val_accuracy: 0.7762 - val_loss: 3.5131
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 372ms/step - accuracy: 0.7264 - loss: 3.5950 - val_accuracy: 0.7762 - val_loss: 1.9196
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 370ms/step - accuracy: 0.7238 - loss: 2.2252 - val_accuracy: 0.7762 - val_loss: 1.6451
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 401ms/step - accuracy: 0.7240 - loss: 2.0650 - val_accuracy: 0.7762 - val_loss: 1.6370
Epoch 6/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 380ms/step - accuracy: 0.7261 - loss: 2.0858 - val_accuracy: 0.7762 - val_loss: 1.5562
Epoch 7/30
[1m2/2[0m [32m━━━━━━━━━━━━━━

In [30]:
# Build Inference Models

# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse decoder embedding layer
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


In [31]:
# Function for Translation

reverse_eng_word_index = {idx: word for word, idx in eng_tokenizer.word_index.items()}
reverse_fr_word_index = {idx: word for word, idx in fr_tokenizer.word_index.items()}

def decode_sequence(input_seq):
    # Encode input and get initial decoder states
    states_value = encoder_model.predict(input_seq)

    # Initialize target sequence with <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_tokenizer.word_index['<start>']

    decoded_sentence = ''
    stop_condition = False

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_fr_word_index.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence.split()) > max_decoder_seq_length:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update target sequence and states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()
    

In [44]:
# Test Translation

def translate_sentence(sentence):
    sentence = preprocess_text(sentence)
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')
    translation = decode_sequence(seq)
    print(f"Input sentence: {sentence}")
    print(f"Translated sentence: {translation}")


In [43]:
# Example
translate_sentence("i am very happy today")
translate_sentence("where are you going")
translate_sentence("this is a beautiful city")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step


KeyError: '<start>'