In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/language-translation-englishfrench/eng_-french.csv


In [11]:
import pandas as pd
import numpy as np
import string
from string import digits
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

     
latent_dim = 256     
num_samples = 20000  

data_path = '/kaggle/input/language-translation-englishfrench/eng_-french.csv'

In [12]:
lines = pd.read_csv(data_path)
lines.columns = ['english_sentence', 'french_sentence'] 


lines = shuffle(lines)
lines = lines.iloc[:num_samples]


In [13]:

def clean_text(text):
    text = text.lower() 
    text = re.sub("'", "", text) 
    exclude = set(string.punctuation) 
    text = ''.join(ch for ch in text if ch not in exclude) 
    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits) 
    text = text.strip()
    text = re.sub(" +", " ", text) 
    return text


lines['english_sentence'] = lines['english_sentence'].apply(clean_text)
lines['french_sentence'] = lines['french_sentence'].apply(clean_text)


lines['french_sentence'] = lines['french_sentence'].apply(lambda x : 'START_ '+ x + ' _END')


print(lines.head())

                                         english_sentence  \
164705  the opposite sides of a parallelogram are para...   
79229                          tom ran the fastest of all   
89402                          i didnt think youd tell me   
164766  they had been saving money for the trip for a ...   
71343                           the old man sat all alone   

                                          french_sentence  
164705  START_ les côtés opposés dun parallélogramme s...  
79229        START_ tom a couru le plus vite de tous _END  
89402   START_ je ne pensais pas que tu me laurais dit...  
164766  START_ elles économisaient de largent pour le ...  
71343    START_ le vieil homme était assis tout seul _END  


In [14]:

all_eng_words = set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)



all_french_words = set()
for fr in lines['french_sentence']:
    for word in fr.split():
        if word not in all_french_words:
            all_french_words.add(word)

In [15]:



input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_french_words))


num_encoder_tokens = len(all_eng_words) + 1
num_decoder_tokens = len(all_french_words) + 1


input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())


max_length_src = max(lines['english_sentence'].apply(lambda x: len(x.split())))
max_length_tar = max(lines['french_sentence'].apply(lambda x: len(x.split())))

print('Num English words:', num_encoder_tokens)
print('Num French words:', num_decoder_tokens)
print('Max Eng Length:', max_length_src)
print('Max Fr Length:', max_length_tar)

Num English words: 6373
Num French words: 10914
Max Eng Length: 31
Max Fr Length: 32


In [34]:

X, y = lines['english_sentence'], lines['french_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

def generate_batch(X, y, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')
            

            decoder_target_data[:, :, 0] = 1.
            
            batch_X = X[j:j+batch_size]
            batch_y = y[j:j+batch_size]
            
            for i, (input_text, target_text) in enumerate(zip(batch_X, batch_y)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index.get(word, 0)
                
                for t, word in enumerate(target_text.split()):
                    if t < len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index.get(word, 0)
                    
                    if t > 0:
                  
                        decoder_target_data[i, t - 1, 0] = 0.
                       
                        decoder_target_data[i, t - 1, target_token_index.get(word, 0)] = 1.
            
            yield ((encoder_input_data, decoder_input_data), decoder_target_data)

In [35]:
# Encoder 
encoder_inputs = Input(shape=(None,))

enc_emb = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs) 

encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))

dec_emb_layer = Embedding(num_decoder_tokens, latent_dim) 
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [36]:
import tensorflow as tf


output_signature = (
    (
        tf.TensorSpec(shape=(None, max_length_src), dtype=tf.float32), # Encoder Input
        tf.TensorSpec(shape=(None, max_length_tar), dtype=tf.float32)  # Decoder Input
    ),
    tf.TensorSpec(shape=(None, max_length_tar, num_decoder_tokens), dtype=tf.float32) # Target Output
)


train_dataset = tf.data.Dataset.from_generator(
    lambda: generate_batch(X_train, y_train, batch_size=batch_size),
    output_signature=output_signature
)


val_dataset = tf.data.Dataset.from_generator(
    lambda: generate_batch(X_test, y_test, batch_size=batch_size),
    output_signature=output_signature
)




In [37]:
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

batch_size = 64      
epochs = 50

model.fit(
    train_dataset,
    epochs=epochs,
    steps_per_epoch=len(X_train) // batch_size,
    validation_data=val_dataset,
    validation_steps=len(X_test) // batch_size
)

Epoch 1/50
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 148ms/step - accuracy: 0.7645 - loss: 2.5520 - val_accuracy: 0.8018 - val_loss: 1.4124
Epoch 2/50
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 147ms/step - accuracy: 0.8081 - loss: 1.3604 - val_accuracy: 0.8026 - val_loss: 1.3995
Epoch 3/50
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 144ms/step - accuracy: 0.8081 - loss: 1.3495 - val_accuracy: 0.8041 - val_loss: 1.3862
Epoch 4/50
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 144ms/step - accuracy: 0.8090 - loss: 1.3344 - val_accuracy: 0.8035 - val_loss: 1.3763
Epoch 5/50
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 147ms/step - accuracy: 0.8098 - loss: 1.3203 - val_accuracy: 0.8062 - val_loss: 1.3534
Epoch 6/50
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 151ms/step - accuracy: 0.8124 - loss: 1.2910 - val_accuracy: 0.8099 - val_loss: 1.3216
Epoch 7/50

<keras.src.callbacks.history.History at 0x7cd6c7a36350>

In [38]:
# Encoder Inference Model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder Inference Model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs) 
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [44]:
def decode_sequence(input_seq):

    states_value = encoder_model.predict(input_seq, verbose=0)
    

    target_seq = np.zeros((1,1))

    target_seq[0, 0] = target_token_index['START_']
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
 
        if sampled_token_index in reverse_target_char_index:
            sampled_char = reverse_target_char_index[sampled_token_index]
        else:
            sampled_char = '?' 

        decoded_sentence += ' ' + sampled_char
        
 
        if (sampled_char == '_END' or len(decoded_sentence) > 50):
            stop_condition = True
            
      
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
        
    return decoded_sentence






In [47]:
for i in range(10): 
    seq = X_test.iloc[i]
    input_seq = np.zeros((1, max_length_src))
    for t, word in enumerate(seq.split()):
        input_seq[0, t] = input_token_index.get(word, 0)
        
    translation = decode_sequence(input_seq)
    print(f"English: {seq}")
 
    print(f"French: {translation.replace('_END', '')}")
    print("-" * 30)

English: you made it possible
French:  vous avez lair très très bon 
------------------------------
English: it is necessary for you to start now
French:  cest un peu plus de temps 
------------------------------
English: i am eating a sandwich
French:  je suis en train de vous voir 
------------------------------
English: how much does she spend per month
French:  combien de temps je suis en train de faire ça 
------------------------------
English: i havent read any of his letters
French:  je ne suis pas le plus de mal à ce sujet 
------------------------------
English: ive made up my mind to learn how to play the harp
French:  je suis désolé de me voir quelque chose à ma question
------------------------------
English: a mere glance is not enough for us to tell one from the other
French:  un homme de la guerre a été plus de temps à la maison
------------------------------
English: i must decline
French:  jai été en train de faire 
------------------------------
English: everyone was