**Data Preprocessing**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the parallel corpus, then pull each language column out as a plain Python list.
data = pd.read_csv("eng_-french.csv")
engDF, freDF = (
    data[column].tolist()
    for column in ("English words/sentences", "French words/sentences")
)

**Tokenizing the data**

In [None]:
def _tokenize_corpus(sentences):
    """Fit a fresh Tokenizer on `sentences`; return it together with the encoded sequences."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    return tokenizer, tokenizer.texts_to_sequences(sentences)


# One independent tokenizer (and id sequence list) per language.
engTokenizer, engSeq = _tokenize_corpus(engDF)
freTokenizer, freSeq = _tokenize_corpus(freDF)

In [None]:
# Vocabulary size per language: number of distinct tokens known to each tokenizer, plus one.
# These values are used below as the Embedding `input_dim` (and the output Dense width).
# NOTE(review): the +1 presumably accounts for id 0, which is not in word_index — confirm.
engEmbeddingsize, freEmbeddingsize = (
    len(tokenizer.word_index) + 1 for tokenizer in (engTokenizer, freTokenizer)
)

In [None]:
# Pad both languages (at the end of each sequence) to one shared length:
# the longest tokenized sentence found in either corpus.
maxLength = max(map(len, engSeq + freSeq))
engPadded, frePadded = (
    pad_sequences(sequences, maxlen=maxLength, padding='post')
    for sequences in (engSeq, freSeq)
)

**Building the model**

In [None]:
embedding_dim = 256  # size of each token embedding vector
units = 512  # width of the LSTM state (shared by the encoder and decoder LSTMs)

# Building the encoder
# Input layer: one row of maxLength English token ids per sample.
encoder_inputs = Input(shape=(maxLength,))
# Embedding layer: maps each token id to an embedding_dim-sized vector.
# mask_zero=True marks id 0 (the filler appended by the post-padding step) as
# padding so the LSTM skips those timesteps. Without the mask, the final state
# of a short sentence would be computed only after stepping through every
# trailing pad position, diluting the information handed to the decoder.
enc_emb = Embedding(input_dim=engEmbeddingsize, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
# LSTM layer: return_state=True exposes the final hidden and cell states.
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Only the final states are kept; they become the decoder LSTM's initial state.
encoder_states = [state_h, state_c]

In [None]:
# Building the Decoder
# Input layer: one row of maxLength token ids per sample.
# NOTE(review): the training and inference cells pass the padded *English*
# sequences to this input rather than a shifted French target, and those ids
# are looked up in an embedding sized for the French vocabulary — confirm
# whether teacher forcing with French inputs was intended.
decoder_inputs = Input(shape=(maxLength,))
# Embedding layer kept as a named object, separate from its application below.
dec_emb_layer = Embedding(input_dim=freEmbeddingsize, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
# return_sequences=True yields one output vector per timestep; the two returned
# states are discarded here.
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final hidden/cell states.
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Softmax over freEmbeddingsize classes, applied to the LSTM output sequence.
decoder_dense = Dense(freEmbeddingsize, activation='softmax')
output = decoder_dense(decoder_outputs)

In [None]:
# Assemble the model graph (nothing is run yet): encoder + decoder inputs -> softmax output
modelRnn = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

In [None]:
# Configure training: Adam optimizer, sparse (integer-label) cross-entropy loss, accuracy metric
modelRnn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model (output shapes, parameter counts, connections)
modelRnn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 55)]                 0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 55)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 55, 256)              3720192   ['input_1[0][0]']             
                                                                                                  
 embedding_2 (Embedding)     (None, 55, 256)              7849216   ['input_3[0][0]']             
                                                                                              

**Training the Model**

In [None]:
# Hold out 20% of the sentence pairs for validation. The fixed random_state makes
# the split identical on every Restart & Run All, so runs are comparable.
x_train, x_val, y_train, y_val = train_test_split(engPadded, frePadded, test_size=0.2, random_state=42)

# NOTE(review): the padded English sequences are passed to BOTH model inputs
# (encoder and decoder); the French targets are used only as labels. Confirm
# whether the decoder was meant to receive shifted French sequences instead.
# The History object is captured so the loss/accuracy curves can be inspected
# later instead of only appearing as the cell's repr.
history = modelRnn.fit(
    [x_train, x_train],
    y_train,
    validation_data=([x_val, x_val], y_val),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f1bcbf2e860>

**Testing Outputs**

In [None]:
def translate_sentence(sentence):
    """Translate one English sentence into French with the trained model.

    The sentence is encoded with ``engTokenizer``, post-padded to ``maxLength``
    and passed to ``modelRnn`` as both the encoder and the decoder input, which
    mirrors how the training cell feeds the model. The highest-scoring French
    token id at each position is then mapped back to a word.

    Args:
        sentence: English text to translate.

    Returns:
        The predicted French words joined by single spaces. Positions whose
        predicted id has no entry in ``freTokenizer.index_word`` (unknown or
        padding ids) are omitted, so the result carries no filler whitespace.
    """
    seq = engTokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=maxLength, padding='post')

    # Scores have one vector per position; argmax over the last axis picks the
    # best token id for each position. [0] selects the single sentence in the batch.
    scores = modelRnn.predict([padded, padded])
    token_ids = np.argmax(scores, axis=-1)[0]

    # Skip ids the French tokenizer does not know instead of emitting a ' '
    # placeholder for each one, which used to pad the result with long runs
    # of spaces.
    index_word = freTokenizer.index_word
    words = [index_word[i] for i in map(int, token_ids) if i in index_word]
    return ' '.join(words)

In [None]:
# Smoke test: show the source sentence next to the model's prediction.
input_sentence = "Hi! I am learning french."
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}", f"Translated: {translated_sentence}", sep="\n")

Input: Hi! I am learning french.
Translated: salut je suis à français                                                                                                    


In [None]:
# Smoke test on a second short sentence.
input_sentence = "Hi! I am good."
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}", f"Translated: {translated_sentence}", sep="\n")

Input: Hi! I am good.
Translated: salut je suis bonne                                                                                                      


In [None]:
# Smoke test on a sentence without punctuation.
input_sentence = "She ordered him to do it"
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}", f"Translated: {translated_sentence}", sep="\n")

Input: She ordered him to do it
Translated: elle a ordonna de le faire                                                                                                  
