# PART 1: Code of Model for Translation Part

In [None]:
###################
#Libraries imports#
###################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, Dense, Dropout, LSTM, TimeDistributed, Bidirectional
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import pickle

In [None]:
#################
#Data processing#
#################

#Load the parallel corpus and split it into train / validation / test sets.
#The rest of this file reads the columns 'English words/sentences' and
#'French words/sentences' from this DataFrame.
full_dataset = pd.read_csv('eng_french.csv')
#Hold out 20% of the rows for the final test evaluation (fixed seed for reproducibility).
train_data, test_data = train_test_split(full_dataset, test_size=0.2, random_state=42)
#Take 10% of the remaining 80% for validation (about 8% of the full dataset),
#which leaves about 72% of the full dataset for training.
train_data, validation_data = train_test_split(train_data, test_size=0.1, random_state=42)  # 10% of the training split for validation


#Function to calculate the max length for padding
def calculate_max_length(sentences, percentile=95):
    """Return a percentile of the sentence lengths, truncated to an int.

    The length of a sentence is its number of whitespace-separated tokens.

    Args:
        sentences: Iterable of strings.
        percentile: Percentile (0-100) of the length distribution to return.

    Returns:
        The value computed by ``np.percentile`` converted with ``int``.
    """
    word_counts = [len(text.split()) for text in sentences]
    cutoff = np.percentile(word_counts, percentile)
    return int(cutoff)

#Choose one padding length for both languages: the larger of the two
#per-language 95th-percentile sentence lengths (95 is the default percentile of
#calculate_max_length). Despite their names, eng_lengths and fr_lengths each
#hold a single integer. The lengths are measured on the full dataset, test rows included.
eng_lengths = calculate_max_length(full_dataset['English words/sentences'])
fr_lengths = calculate_max_length(full_dataset['French words/sentences'])
max_length = max(eng_lengths, fr_lengths)
print(f"Chosen max_length: {max_length}")

#Tokenizer fitting: one Tokenizer per language, created with default settings
eng_tokenizer = Tokenizer()
fr_tokenizer = Tokenizer()
#English tokenizer
#NOTE(review): both tokenizers are fitted on full_dataset, so their vocabularies
#also cover the validation and test rows - confirm this is intended.
eng_tokenizer.fit_on_texts(full_dataset['English words/sentences'])
#French tokenizer
fr_tokenizer.fit_on_texts(full_dataset['French words/sentences'])

#Tokenization and padding functions
def text_to_sequences(tokenizer, text):
    """Encode texts as sequences of token indices.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (here a fitted Tokenizer).
        text: Iterable of strings to encode.

    Returns:
        Whatever ``tokenizer.texts_to_sequences(text)`` returns.
    """
    encoded = tokenizer.texts_to_sequences(text)
    return encoded

#Padding
def sequence_padding(sequences, length=None):
    """Pad sequences at their end so that they share a common length.

    Args:
        sequences: Sequences of token indices.
        length: Target length, forwarded as ``maxlen``; ``None`` leaves the
            choice to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    #NOTE(review): ``truncating`` is left at the pad_sequences default, so check
    #the Keras documentation for which end of an over-long sequence is cut.
    padded = pad_sequences(sequences, maxlen=length, padding='post')
    return padded

#Data Generator function
#A generator is used here although it is not strictly necessary; it makes it possible to train the model batch by batch on a very large dataset (millions of rows)

def data_generator(data, batch_size, tokenizer_en, tokenizer_fr, max_length):
    """Yield padded (French, English) batches forever, for Keras training.

    Each pass walks ``data`` in order, in slices of ``batch_size`` rows; the
    last slice of a pass may be shorter. When a pass ends, the next one starts
    again from the first row.

    Args:
        data: Table with the columns 'English words/sentences' and
            'French words/sentences' that supports ``len`` and row slicing.
        batch_size: Number of rows per batch; must be positive.
        tokenizer_en: Tokenizer used to encode the English sentences.
        tokenizer_fr: Tokenizer used to encode the French sentences.
        max_length: Length every encoded sentence is padded to.

    Yields:
        Tuple ``(fr_sequences, eng_sequences)``: the padded French batch
        (model input) followed by the padded English batch (target).

    Raises:
        ValueError: If ``batch_size`` is not positive or ``data`` is empty.
    """
    if batch_size <= 0:
        #A negative step would give an empty range and an endless loop below
        raise ValueError("batch_size must be a positive integer")
    total_rows = len(data)
    if total_rows == 0:
        #Without this guard the ``while True`` loop would spin forever
        #without ever yielding a batch
        raise ValueError("data must contain at least one row")

    while True:
        for start in range(0, total_rows, batch_size):
            chunk = data[start:start + batch_size]
            english_sentences = chunk['English words/sentences']
            french_sentences = chunk['French words/sentences']

            eng_sequences = text_to_sequences(tokenizer_en, english_sentences)
            fr_sequences = text_to_sequences(tokenizer_fr, french_sentences)

            eng_sequences = sequence_padding(eng_sequences, max_length)
            fr_sequences = sequence_padding(fr_sequences, max_length)

            #French is the model input, English the target
            yield fr_sequences, eng_sequences


In [None]:
####################################
#Creation and training of the model#
####################################

#Model architecture
def create_translation_model(input_dim, output_dim, eng_vocab_size, fr_vocab_size):
    """Build and compile the French-to-English sequence model.

    Args:
        input_dim: Length of the padded input sequences, passed to the
            Embedding layer as ``input_length``.
        output_dim: Not used by this function.
        eng_vocab_size: Number of units of the final softmax layer.
        fr_vocab_size: Vocabulary size given to the Embedding layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    #Same stack as adding the layers one by one, declared as a single list
    layer_stack = [
        Embedding(input_dim=fr_vocab_size, output_dim=256, input_length=input_dim),
        Bidirectional(LSTM(128, return_sequences=True)),
        TimeDistributed(Dense(512, activation='relu')),
        Dropout(0.4),
        #One softmax over the English vocabulary per time step
        TimeDistributed(Dense(eng_vocab_size, activation='softmax')),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(0.001),
        metrics=['accuracy'],
    )
    return network

#Initialize and summarize the model.
#Vocabulary sizes are the number of known words plus one; the extra slot
#presumably accounts for index 0, which translate() treats as padding - confirm.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1
#max_length is passed for both input_dim and output_dim (the function ignores output_dim)
translation_model = create_translation_model(max_length, max_length, eng_vocab_size, fr_vocab_size)
translation_model.summary()

#Training the model with the batch generators, including validation data.
#Model.fit accepts generators directly; fit_generator is deprecated and is no
#longer available in recent Keras releases.
batch_size = 32
#Round the step counts up: data_generator also yields the final, shorter batch
#of each pass, so one epoch is then exactly one pass over the data, and a
#validation set smaller than one batch still gets one validation step.
train_steps = (len(train_data) + batch_size - 1) // batch_size
validation_steps = (len(validation_data) + batch_size - 1) // batch_size
history = translation_model.fit(
    data_generator(train_data, batch_size, eng_tokenizer, fr_tokenizer, max_length),
    steps_per_epoch=train_steps,
    validation_data=data_generator(validation_data, batch_size, eng_tokenizer, fr_tokenizer, max_length),
    validation_steps=validation_steps,
    epochs=10)

#Plot the training and validation curves side by side:
#accuracy in the left panel, loss in the right panel
plt.figure(figsize=(12, 4))

#One entry per panel: (training key, validation key, training label, validation label, title)
_panels = (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Training and Validation Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Training and Validation Loss'),
)
for _position, (_train_key, _val_key, _train_label, _val_label, _title) in enumerate(_panels, start=1):
    plt.subplot(1, 2, _position)
    plt.plot(history.history[_train_key], label=_train_label)
    plt.plot(history.history[_val_key], label=_val_label, color='orange')
    plt.title(_title)
    plt.legend()

plt.show()


In [None]:
###############
#Model Testing#
###############

#Raw test sentences of each language
test_english = test_data['English words/sentences']
test_french = test_data['French words/sentences']

#Tokenize and pad each language in one step, with the same tokenizers and
#padding length as for training
test_eng_sequences = sequence_padding(
    text_to_sequences(eng_tokenizer, test_english), max_length)
test_fr_sequences = sequence_padding(
    text_to_sequences(fr_tokenizer, test_french), max_length)

#Evaluate the model on the test data: French is the input, English the target
test_loss, test_accuracy = translation_model.evaluate(
    test_fr_sequences, test_eng_sequences, verbose=0)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


In [None]:
################################################
#Saving the model and tokenizers for prediction#
################################################

from keras.models import load_model
import pickle

#Save the trained model to a single HDF5 file
translation_model.save('translation_model4.h5')

#Save both tokenizers with pickle; translate() reads these exact file names
for _pickle_path, _tokenizer in (
        ('eng_tokenizer4.pkl', eng_tokenizer),
        ('fr_tokenizer4.pkl', fr_tokenizer)):
    with open(_pickle_path, 'wb') as handle:
        pickle.dump(_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# PART 2: This part is for prediction and can be used in another file

In [None]:
###################
#Libraries imports#
###################

from keras.models import load_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, Dense, Dropout, LSTM, TimeDistributed, Bidirectional
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import pickle

#################################
#Tokenizer and padding functions#
#################################

def text_to_sequences(tokenizer, text):
    """Encode texts as sequences of token indices.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (here a fitted Tokenizer).
        text: Iterable of strings to encode.

    Returns:
        Whatever ``tokenizer.texts_to_sequences(text)`` returns.
    """
    encoded = tokenizer.texts_to_sequences(text)
    return encoded
def sequence_padding(sequences, length=None):
    """Pad sequences at their end so that they share a common length.

    Args:
        sequences: Sequences of token indices.
        length: Target length, forwarded as ``maxlen``; ``None`` leaves the
            choice to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    padded = pad_sequences(sequences, maxlen=length, padding='post')
    return padded
#Padding length used at prediction time.
#NOTE(review): hard-coded; it presumably has to equal the max_length printed by
#the training code in PART 1 (the model's Embedding layer is built with
#input_length=max_length) - confirm before reusing with another model.
max_length = 12

#################################
#Translation prediction function#
#################################

#Lazily filled cache for the artifacts used by translate(), so that the model
#and the tokenizers are read from disk once instead of on every call.
_TRANSLATION_ARTIFACTS = {}


def _load_translation_artifacts():
    """Return ``(model, eng_tokenizer, fr_tokenizer)``, loading them on first use."""
    if not _TRANSLATION_ARTIFACTS:
        model = load_model('translation_model4.h5')
        #NOTE: pickle.load can execute arbitrary code; only load tokenizer
        #files that were written by the training part of this project.
        with open('eng_tokenizer4.pkl', 'rb') as handle:
            eng_tokenizer = pickle.load(handle)
        with open('fr_tokenizer4.pkl', 'rb') as handle:
            fr_tokenizer = pickle.load(handle)
        #Fill the cache only once everything has loaded, so that a failed
        #load is retried on the next call instead of leaving it half-filled
        _TRANSLATION_ARTIFACTS.update(
            model=model, eng_tokenizer=eng_tokenizer, fr_tokenizer=fr_tokenizer)
    return (
        _TRANSLATION_ARTIFACTS['model'],
        _TRANSLATION_ARTIFACTS['eng_tokenizer'],
        _TRANSLATION_ARTIFACTS['fr_tokenizer'],
    )


def translate(input_text):
    """Translate a French sentence to English with the saved model.

    The saved model and tokenizers are loaded from 'translation_model4.h5',
    'eng_tokenizer4.pkl' and 'fr_tokenizer4.pkl' on the first call and reused
    afterwards; clear ``_TRANSLATION_ARTIFACTS`` to pick up re-saved files.

    Args:
        input_text: The French sentence to translate.

    Returns:
        The predicted English words joined by single spaces, one per
        non-padding input position. When the predicted index has no English
        word, the French input word is used instead.
    """
    model, eng_tokenizer, fr_tokenizer = _load_translation_artifacts()

    #Preprocess the input text exactly like the training data
    sequences = text_to_sequences(fr_tokenizer, [input_text])
    padded_sequences = sequence_padding(sequences, max_length)

    #Make a prediction and keep the most likely English index per position
    prediction = model.predict(padded_sequences)
    predicted_sequence = np.argmax(prediction, axis=-1)[0]

    #Reverse word indices (index -> word) for both languages
    reverse_fr_word_index = {index: word for word, index in fr_tokenizer.word_index.items()}
    reverse_eng_word_index = {index: word for word, index in eng_tokenizer.word_index.items()}

    #Convert the predicted sequence to text with fallback to the French word
    translated_text = []
    for source_idx, predicted_idx in zip(padded_sequences[0], predicted_sequence):
        if source_idx > 0:  # Ignore padding
            french_word = reverse_fr_word_index.get(source_idx, '')
            translated_text.append(reverse_eng_word_index.get(predicted_idx, french_word))

    return ' '.join(translated_text)

In [None]:
#########
#example#
#########

#Translate one French sentence and print the English result.
#The trailing spaces inside the literal are part of the example input.
input_text = "le ciel est bleu  "   #Example French sentence
translated_text = translate(input_text)
print(translated_text)