Importing All Required Dependencies

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Attention, Concatenate, Input
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from nltk.translate.bleu_score import corpus_bleu

Data Loading and Cleaning

In [None]:
def load_and_clean_data(file_path):
    """Load tab-separated English/French sentence pairs and normalise the text.

    Only the first two columns of the file are read (English, French); any
    further columns are ignored. Rows in which either sentence is missing are
    dropped, because the cleaning step can only work on text.

    Cleaning keeps letters and whitespace only, lower-cases the text and
    collapses runs of whitespace to a single space. Letters are matched in a
    Unicode-aware way so accented characters (e.g. "é", "ç", "à") survive;
    an ASCII-only filter would turn "très" into "trs" and make the French
    vocabulary unusable.

    Args:
        file_path: Path to a headerless, tab-separated text file.

    Returns:
        pandas.DataFrame with two string columns, "English" and "French".
    """
    # Read the data from the file
    df = pd.read_csv(file_path, sep="\t", usecols=[0, 1], names=["English", "French"])

    # A missing cell is parsed as NaN (a float), which re.sub cannot process
    df = df.dropna(subset=["English", "French"]).reset_index(drop=True)

    def clean_text(text):
        # `[^\w\s]` drops punctuation and symbols, `[\d_]` drops digits and
        # underscores. For str patterns `\w` is Unicode-aware, so accented
        # letters are kept.
        text = re.sub(r"[^\w\s]|[\d_]", '', str(text)).lower()
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    # Apply the cleaning function to both English and French columns
    df['English'] = df['English'].apply(clean_text)
    df['French'] = df['French'].apply(clean_text)

    return df

# Load and clean the tab-separated English/French sentence pairs into `df`.
# NOTE(review): main() further down reads "/kaggle/input/dataset-1/fra.txt" instead — confirm which path is correct.
df = load_and_clean_data("/kaggle/input/fra.txt") # Replace with the path of the file holding English sentences and their French translations


Data Splitting and Tokenization

In [None]:
def split_and_tokenize_data(df, max_len=30):
    """Split the sentence pairs and turn them into fixed-length id sequences.

    The frame is split 80/20 into (train + validation) and test, and the first
    part is split 80/20 again into train and validation, both with
    random_state=42. One tokenizer per language is fitted on the training
    portion only, so words that appear only in validation/test map to '<UNK>'.

    Args:
        df: DataFrame with 'English' and 'French' text columns.
        max_len: Length every sequence is padded (at the end) to.

    Returns:
        An 8-tuple: train English, train French, validation English,
        validation French, test English, test French, English tokenizer,
        French tokenizer. English arrays have shape (n, max_len); French
        arrays are reshaped to (n, max_len, 1).
    """
    english, french = df['English'], df['French']

    # First cut off the test set, then carve the validation set out of the rest
    X_trainval, X_test, y_trainval, y_test = train_test_split(english, french, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

    # Vocabulary comes from the training split only
    tokenizer_English = Tokenizer(oov_token='<UNK>')
    tokenizer_English.fit_on_texts(X_train)
    tokenizer_French = Tokenizer(oov_token='<UNK>')
    tokenizer_French.fit_on_texts(y_train)

    def encode_source(texts):
        # English side: (n, max_len) integer ids, zero-padded at the end
        ids = tokenizer_English.texts_to_sequences(texts)
        return pad_sequences(ids, maxlen=max_len, padding='post')

    def encode_target(texts):
        # French side: same padding, plus a trailing axis of size 1
        ids = tokenizer_French.texts_to_sequences(texts)
        padded = pad_sequences(ids, maxlen=max_len, padding='post')
        return padded.reshape((-1, max_len, 1))

    return (encode_source(X_train), encode_target(y_train),
            encode_source(X_val), encode_target(y_val),
            encode_source(X_test), encode_target(y_test),
            tokenizer_English, tokenizer_French)

# Split the cleaned pairs into train/validation/test sets and encode them as padded id sequences.
# max_len is left at its default of 30, the same value passed to build_model later in the notebook.
(padded_sequences_train_English, padded_sequences_train_French,
 padded_sequences_val_English, padded_sequences_val_French,
 padded_sequences_test_English, padded_sequences_test_French,
 tokenizer_English, tokenizer_French) = split_and_tokenize_data(df)


Embedding Preparation

In [None]:
def load_embeddings(glove_path, embedding_dim):
    """Read a text word-vector file into a ``{word: vector}`` dictionary.

    Each usable line holds a word followed by ``embedding_dim`` numbers
    separated by whitespace. The line is split from the right, so exactly
    ``embedding_dim`` trailing fields form the vector and whatever precedes
    them is the word; this keeps tokens that themselves contain whitespace
    intact. Lines with a different number of fields are skipped: this covers
    blank lines and a leading "<count> <dim>" header line.

    Args:
        glove_path: Path to the UTF-8 encoded vector file.
        embedding_dim: Number of vector components expected per word.

    Returns:
        dict mapping each word to a float32 numpy array of length
        ``embedding_dim``.

    Raises:
        ValueError: If lines after the first one were present but none matched
            ``embedding_dim`` (most likely a wrong dimension for this file),
            or if a vector field is not numeric.
    """
    embedding_index = {}
    mismatched_lines = 0
    with open(glove_path, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            # Split off at most `embedding_dim` fields from the right; the
            # remainder on the left is the word
            fields = line.rstrip().rsplit(None, embedding_dim)
            if len(fields) != embedding_dim + 1:
                # The first line may be a header; blank lines carry no data
                if fields and line_number > 0:
                    mismatched_lines += 1
                continue
            embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

    # Nothing matched although data lines were present: report it instead of
    # silently returning an empty index
    if mismatched_lines and not embedding_index:
        raise ValueError(
            f"No {embedding_dim}-dimensional vectors found in {glove_path!r}; "
            "check that embedding_dim matches the file."
        )
    return embedding_index

def create_embedding_matrix(embedding_index, tokenizer, embedding_dim):
    """Assemble the pre-trained vectors into a matrix indexed by token id.

    Row ``i`` holds the vector of the word whose id in
    ``tokenizer.word_index`` is ``i``. Words without a pre-trained vector,
    and row 0 (no word has id 0 when ids start at 1), stay all-zero.

    Args:
        embedding_index: Mapping from word to its vector.
        tokenizer: Object exposing a ``word_index`` dict (word -> integer id).
        embedding_dim: Length of every vector.

    Returns:
        numpy array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    word_to_row = tokenizer.word_index
    # One extra row so that the largest id is a valid row index
    matrix = np.zeros((len(word_to_row) + 1, embedding_dim))

    for token, row in word_to_row.items():
        vector = embedding_index.get(token)
        if vector is None:
            # No pre-trained vector for this word: leave its row at zero
            continue
        matrix[row] = vector

    return matrix

# Load the 50-dimensional GloVe vectors for English
embedding_index_English = load_embeddings("/kaggle/input/glove6b50dtxt/glove.6B.50d.txt", 50) # Replace with the path of the English GloVe file
# Build the English embedding matrix: one row per id in tokenizer_English.word_index
embedding_matrix_English = create_embedding_matrix(embedding_index_English, tokenizer_English, 50)

# Load the 300-dimensional vectors for French
# NOTE(review): the file name "cc.fr.300.vec" looks like a fastText Common Crawl file rather than GloVe — confirm the source
embedding_index_French = load_embeddings("/kaggle/input/cc.fr.300.vec", 300) # Replace with the path of the French word-vector file
# Build the French embedding matrix: one row per id in tokenizer_French.word_index
embedding_matrix_French = create_embedding_matrix(embedding_index_French, tokenizer_French, 300)


Model Building

In [None]:
def build_model(vocab_size_English, vocab_size_French, embedding_matrix_English, embedding_matrix_French, max_len):
    """Build the (uncompiled) encoder-decoder translation model with attention.

    Encoder: frozen English embeddings -> two stacked bidirectional LSTMs with
    128 units per direction. The final forward/backward states of the second
    layer are concatenated into 256-wide states that initialise the decoder.

    Decoder: frozen French embeddings -> LSTM(256) -> Keras ``Attention`` over
    the encoder outputs -> per-timestep softmax over the French vocabulary.

    The decoder embeddings are shifted one step to the right (a zero vector is
    inserted at step 0 and the last step is dropped) before they reach the
    LSTM. The French sequence is therefore supplied unshifted, both as the
    second model input and as the label, and the output at step t only depends
    on target tokens before t. Without the shift the decoder would be shown the
    exact token it has to predict and would merely learn to copy its input.

    Args:
        vocab_size_English: Number of rows of the English embedding table.
        vocab_size_French: Number of rows of the French embedding table and
            size of the output softmax.
        embedding_matrix_English: Array (vocab_size_English, dim) of
            pre-trained English vectors.
        embedding_matrix_French: Array (vocab_size_French, dim) of
            pre-trained French vectors.
        max_len: Sequence length of both inputs; must be at least 2 so that
            one step can be dropped for the shift.

    Returns:
        A Keras ``Model`` mapping ``[encoder_input, decoder_input]`` (each of
        shape (batch, max_len)) to (batch, max_len, vocab_size_French).
    """
    # Take the embedding widths from the matrices instead of hard-coding them
    embedding_dim_English = np.shape(embedding_matrix_English)[1]
    embedding_dim_French = np.shape(embedding_matrix_French)[1]

    # Define embedding layers for English and French using pre-trained embedding matrices
    embedding_layer_English = Embedding(input_dim=vocab_size_English, output_dim=embedding_dim_English,
                                        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_English),
                                        trainable=False)
    embedding_layer_French = Embedding(input_dim=vocab_size_French, output_dim=embedding_dim_French,
                                       embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_French),
                                       trainable=False)

    # Encoder
    encoder_input = Input(shape=(max_len,))
    encoder_embedding = embedding_layer_English(encoder_input)
    encoder_bi_lstm = Bidirectional(LSTM(units=128, return_sequences=True))(encoder_embedding)
    encoder_output, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(units=128, return_sequences=True, return_state=True))(encoder_bi_lstm)
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    # Decoder
    decoder_input = Input(shape=(max_len,))
    decoder_embedding = embedding_layer_French(decoder_input)
    # Teacher forcing: drop the last step and prepend a zero step, so step t
    # sees the embedding of target token t-1 and never the token it predicts
    decoder_shifted = tf.keras.layers.Cropping1D(cropping=(0, 1))(decoder_embedding)
    decoder_shifted = tf.keras.layers.ZeroPadding1D(padding=(1, 0))(decoder_shifted)
    decoder_lstm = LSTM(units=256, return_sequences=True)(decoder_shifted, initial_state=[state_h, state_c])
    attention = Attention()([decoder_lstm, encoder_output])
    decoder_concat = Concatenate()([decoder_lstm, attention])
    decoder_output = TimeDistributed(Dense(vocab_size_French, activation='softmax'))(decoder_concat)

    # Create the model
    model = Model([encoder_input, decoder_input], decoder_output)
    return model

# Build the model
model = build_model(len(tokenizer_English.word_index) + 1, len(tokenizer_French.word_index) + 1,
                    embedding_matrix_English, embedding_matrix_French, max_len=30)

# Compile the model.
# The optimizer is referenced through the imported `tf` namespace because the
# bare name `Adam` is never imported in this notebook and would raise NameError.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])


Model Training

In [None]:
def train_model(model, padded_sequences_train_English, padded_sequences_train_French,
                padded_sequences_val_English, padded_sequences_val_French, epochs=20, batch_size=64):
    """Fit `model` on the training pairs, with early stopping on validation loss.

    The French arrays are used twice: as the second model input and as the
    labels, for both the training and the validation data.

    Args:
        model: Compiled model exposing ``fit``.
        padded_sequences_train_English: Training inputs for the first model input.
        padded_sequences_train_French: Training inputs for the second model
            input, also used as training labels.
        padded_sequences_val_English: Validation counterpart of the English inputs.
        padded_sequences_val_French: Validation counterpart of the French inputs/labels.
        epochs: Maximum number of epochs.
        batch_size: Number of samples per gradient update.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    train_inputs = [padded_sequences_train_English, padded_sequences_train_French]
    val_inputs = [padded_sequences_val_English, padded_sequences_val_French]

    # Give up after 5 epochs in which val_loss did not improve by at least
    # 0.001, and restore the best weights seen so far
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, min_delta=0.001)

    return model.fit(
        train_inputs,
        padded_sequences_train_French,
        validation_data=(val_inputs, padded_sequences_val_French),
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        callbacks=[stopper],
    )

# Train the model on the training split, validating on the validation split.
# epochs and batch_size are left at train_model's defaults (20 and 64).
history = train_model(model, padded_sequences_train_English, padded_sequences_train_French,
                      padded_sequences_val_English, padded_sequences_val_French)


Model Evaluation

In [None]:
def evaluate_model(model, padded_sequences_test_English, padded_sequences_test_French):
    """Score `model` on the test split and print the result.

    The French test array is passed both as the second model input and as the
    labels. ``model.evaluate`` is expected to return exactly two values (loss
    and accuracy).

    Args:
        model: Compiled model exposing ``evaluate``.
        padded_sequences_test_English: Test inputs for the first model input.
        padded_sequences_test_French: Test inputs for the second model input,
            also used as labels.

    Returns:
        Tuple ``(test_loss, test_accuracy)``.
    """
    test_inputs = [padded_sequences_test_English, padded_sequences_test_French]
    scores = model.evaluate(test_inputs, padded_sequences_test_French)

    # Exactly two metrics are expected: the loss followed by the accuracy
    test_loss, test_accuracy = scores
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

    return test_loss, test_accuracy

# Evaluate the trained model on the held-out test split; evaluate_model also prints both values
test_loss, test_accuracy = evaluate_model(model, padded_sequences_test_English, padded_sequences_test_French)


Translation and BLEU Score Calculation

In [None]:
def translate_sentence(model, tokenizer_English, tokenizer_French, sentence, max_len):
    """Translate one English sentence into French by greedy decoding.

    The source sentence is encoded and padded the same way as the training
    data. The French side starts empty (all padding id 0) and is filled one
    position per step: the model is run, the most probable token at the
    current position is taken, written into the decoder input and used for the
    following steps. Decoding stops at the first predicted padding id (0) or
    after ``max_len`` tokens.

    This assumes the model predicts position t from the target tokens at
    positions before t only, i.e. it is never shown the token it is asked to
    predict. The English ids must not be used as decoder input: they belong to
    a different vocabulary.

    Padding ids are never passed to ``sequences_to_texts``, because a tokenizer
    configured with an OOV token renders unknown ids (including 0) as that
    token, which would pad every translation with '<UNK>'.

    Args:
        model: Trained model exposing ``predict([encoder_input, decoder_input])``
            and returning probabilities of shape (1, max_len, vocab).
        tokenizer_English: Fitted source-language tokenizer.
        tokenizer_French: Fitted target-language tokenizer.
        sentence: English sentence to translate.
        max_len: Sequence length the model was built with.

    Returns:
        The translated sentence as a single string (possibly empty).
    """
    # Encode the source sentence and pad it to the model's input length
    source_ids = tokenizer_English.texts_to_sequences([sentence])
    encoder_input = pad_sequences(source_ids, maxlen=max_len, padding='post')

    # All zeros = nothing generated yet (0 is the padding id)
    decoder_input = np.zeros((1, max_len), dtype='int32')
    generated_ids = []
    for step in range(max_len):
        probabilities = model.predict([encoder_input, decoder_input], verbose=0)
        token_id = int(np.argmax(probabilities[0][step]))
        if token_id == 0:
            # Padding predicted: the sentence is complete
            break
        generated_ids.append(token_id)
        # Make the new token visible to the following decoding steps
        decoder_input[0, step] = token_id

    return tokenizer_French.sequences_to_texts([generated_ids])[0]

def calculate_bleu_score(model, X_test, y_test, tokenizer_English, tokenizer_French, max_len=30):
    """Translate every source sentence and report the corpus-level BLEU score.

    Hypotheses and references are tokenised by splitting on whitespace; each
    source sentence has exactly one reference, taken from the same position in
    ``y_test``.

    Args:
        model: Trained translation model, passed on to ``translate_sentence``.
        X_test: Iterable of English sentences.
        y_test: Iterable of reference French sentences, aligned with ``X_test``.
        tokenizer_English: Fitted source-language tokenizer.
        tokenizer_French: Fitted target-language tokenizer.
        max_len: Sequence length the model was built with.

    Returns:
        The BLEU score computed by ``nltk``'s ``corpus_bleu`` (also printed).
    """
    # One token list per translated sentence
    hypotheses = [
        translate_sentence(model, tokenizer_English, tokenizer_French, source, max_len).split()
        for source in X_test
    ]

    # corpus_bleu expects a list of references per hypothesis; there is one each
    references = [[target.split()] for target in y_test]

    bleu_score = corpus_bleu(references, hypotheses)
    print("BLEU Score:", bleu_score)
    return bleu_score


Main Code

In [None]:
def main():
    """Run the whole pipeline: load, encode, build, train, save, evaluate, BLEU.

    Steps:
      1. Load and clean the sentence pairs.
      2. Split them and encode them as padded id sequences.
      3. Load the pre-trained word vectors and build both embedding matrices.
      4. Build and compile the model, then train it with early stopping.
      5. Save the model to "translation_model.h5".
      6. Report loss/accuracy on the test split and BLEU on 100 held-out
         sentences. ``evaluate_model`` and ``calculate_bleu_score`` print
         their own results, so nothing is printed a second time here.
    """
    max_len = 30  # sequence length shared by the encoding, the model and decoding

    # Load and clean data
    file_path = "/kaggle/input/dataset-1/fra.txt"  # Path to the dataset file
    df = load_and_clean_data(file_path)

    # Split and tokenize data
    (padded_sequences_train_English, padded_sequences_train_French,
     padded_sequences_val_English, padded_sequences_val_French,
     padded_sequences_test_English, padded_sequences_test_French,
     tokenizer_English, tokenizer_French) = split_and_tokenize_data(df, max_len=max_len)

    # Load embeddings
    glove_path_English = "/kaggle/input/glove6b50dtxt/glove.6B.50d.txt"  # Path to the English word vectors
    glove_path_French = "/kaggle/input/cc.fr.300.vec"  # Path to the French word vectors
    embedding_index_English = load_embeddings(glove_path_English, 50)
    embedding_index_French = load_embeddings(glove_path_French, 300)
    embedding_matrix_English = create_embedding_matrix(embedding_index_English, tokenizer_English, 50)
    embedding_matrix_French = create_embedding_matrix(embedding_index_French, tokenizer_French, 300)

    # Build and compile the model.
    # The optimizer is referenced through `tf` because the bare name `Adam` is
    # never imported in this notebook and would raise NameError.
    model = build_model(len(tokenizer_English.word_index) + 1, len(tokenizer_French.word_index) + 1,
                        embedding_matrix_English, embedding_matrix_French, max_len=max_len)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['accuracy'])

    # Train the model
    train_model(model, padded_sequences_train_English, padded_sequences_train_French,
                padded_sequences_val_English, padded_sequences_val_French, epochs=20, batch_size=64)

    # Save the trained model
    model_save_path = "translation_model.h5"  # Define the path where the model will be saved
    model.save(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Evaluate the model on the testing data (evaluate_model prints the result)
    evaluate_model(model, padded_sequences_test_English, padded_sequences_test_French)

    # BLEU must be measured on sentences the model was not trained on. The
    # first rows of `df` are mostly training data, so reproduce the test split
    # used by split_and_tokenize_data (same test_size and random_state) to
    # recover the raw held-out sentences.
    _, english_test_text, _, french_test_text = train_test_split(
        df['English'], df['French'], test_size=0.2, random_state=42)
    calculate_bleu_score(model, english_test_text.iloc[:100], french_test_text.iloc[:100],
                         tokenizer_English, tokenizer_French, max_len=max_len)

# Run the entire pipeline when this file is executed as a script.
# NOTE(review): in a notebook kernel __name__ is presumably "__main__" as well, so this would
# repeat the load/train/evaluate steps already executed by the cells above — confirm this is intended.
if __name__ == "__main__":
    main()
