<a href="https://colab.research.google.com/github/SOBIKA-G/Machine-Translation/blob/main/machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Load the dataset (replace 'your_dataset.csv' with the actual dataset path)
data = pd.read_csv('/content/drive/MyDrive/eng_-french.csv', header=None, names=['English', 'French'])

# Extract English and French sentences from the dataset (first 1000 rows)
english_sentences = data['English'][:5000]
french_sentences = data['French'][:5000]
print(data.head())

                   English                  French
0  English words/sentences  French words/sentences
1                      Hi.                  Salut!
2                     Run!                 Cours !
3                     Run!                Courez !
4                     Who?                   Qui ?


In [20]:
data.isna().sum()

Unnamed: 0,0
English,0
French,0


In [21]:
data.describe()

Unnamed: 0,English,French
count,175622,175622
unique,123101,165976
top,I can't tell you how happy I am that you've co...,Comment cela se peut-il ?
freq,32,9


In [22]:
data.shape

(175622, 2)

In [23]:
# Preprocess the data
source_tokenizer = tf.keras.preprocessing.text.Tokenizer()
target_tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Fit the tokenizers on the data
source_tokenizer.fit_on_texts(english_sentences)
target_tokenizer.fit_on_texts(french_sentences)

# Convert sentences to sequences
source_sequences = source_tokenizer.texts_to_sequences(english_sentences)
target_sequences = target_tokenizer.texts_to_sequences(french_sentences)

# Pad the sequences to the same length
max_source_length = max(len(seq) for seq in source_sequences)
max_target_length = max(len(seq) for seq in target_sequences)

source_padded = pad_sequences(source_sequences, maxlen=max_source_length, padding='post')
target_padded = pad_sequences(target_sequences, maxlen=max_target_length, padding='post')

# Define the model architecture (Seq2Seq with LSTM)
embedding_dim = 256
latent_dim = 512
vocab_size_source = len(source_tokenizer.word_index) + 1
vocab_size_target = len(target_tokenizer.word_index) + 1

# Encoder
encoder_input = Input(shape=(max_source_length,))
encoder_embedding = Embedding(vocab_size_source, embedding_dim)(encoder_input)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_input = Input(shape=(max_target_length-1,))  # Adjust target length for input to decoder
decoder_embedding = Embedding(vocab_size_target, embedding_dim)(decoder_input)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_input, decoder_input], decoder_outputs)

# Compile the model with SparseCategoricalCrossentropy
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare shifted target sequences for training
target_padded_shifted = target_padded[:, :-1]
target_padded_labels = target_padded[:, 1:]

# Train the model
model.fit([source_padded, target_padded_shifted], np.expand_dims(target_padded_labels, -1), batch_size=8, epochs=5)

accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy()


source_test_padded = source_padded[:1000]
target_test_padded_shifted = target_padded[:1000, :-1]
target_test_labels = target_padded[:1000, 1:]

# Create test dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "source": source_test_padded,
        "target": target_test_padded_shifted,
    },
    np.expand_dims(target_test_labels, -1)
)).batch(8)

# Evaluate the model on the test data
for inputs, labels in test_dataset:
    # Make predictions
    predictions = model([inputs["source"], inputs["target"]], training=False)

    # Update the accuracy metric
    accuracy_metric.update_state(labels, predictions)

# Get the accuracy result
accuracy = accuracy_metric.result().numpy()
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 251ms/step - accuracy: 0.7612 - loss: 2.1419
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 250ms/step - accuracy: 0.7987 - loss: 1.3004
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 255ms/step - accuracy: 0.8234 - loss: 1.0400
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 245ms/step - accuracy: 0.8421 - loss: 0.8241
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 248ms/step - accuracy: 0.8641 - loss: 0.6291
Test Accuracy: 0.9163


In [24]:
model.summary()


In [25]:
model.save('translation_model.h5')



In [26]:
import pickle

# Save the source tokenizer
with open("source_tokenizer.pkl", "wb") as source_file:
    pickle.dump(source_tokenizer, source_file)

# Save the target tokenizer
with open("target_tokenizer.pkl", "wb") as target_file:
    pickle.dump(target_tokenizer, target_file)



In [27]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the model and tokenizers
def load_model_and_tokenizers():
    # Load the trained model
    model = tf.keras.models.load_model("/content/drive/MyDrive/translation_model.h5")

    # Load the tokenizers
    with open("/content/source_tokenizer.pkl", "rb") as source_file:
        source_tokenizer = pickle.load(source_file)

    with open("/content/target_tokenizer.pkl", "rb") as target_file:
        target_tokenizer = pickle.load(target_file)

    return model, source_tokenizer, target_tokenizer

# Load the model and tokenizers globally
model, source_tokenizer, target_tokenizer = load_model_and_tokenizers()

# Define max lengths (adjust according to model's expected length)
max_source_length = 4  # Adjusted to match the model's expected input length
max_target_length = 10  # Adjust as per your training setup

# Translation function
def translate_sentence(input_sentence):
    # Tokenize and pad the input sentence to max_length = 4 (as per model requirement)
    input_sequence = source_tokenizer.texts_to_sequences([input_sentence])
    input_padded = pad_sequences(input_sequence, maxlen=max_source_length, padding='post')

    # Initialize target sequence for decoding (with the <start> token)
    target_sequence = np.zeros((1, max_target_length - 1))  # excluding the <start> token
    start_token = target_tokenizer.word_index.get('<start>', 1)
    end_token = target_tokenizer.word_index.get('<end>', 0)

    target_sequence[0, 0] = start_token

    # Prepare to generate translation
    predicted_sequence = []
    for i in range(1, max_target_length):
        # The model expects both the source and target input sequences
        output = model.predict([input_padded, target_sequence], verbose=0)
        predicted_id = np.argmax(output[0, i - 1, :])

        # Stop if the <end> token is predicted
        if predicted_id == end_token:
            break

        predicted_sequence.append(predicted_id)
        target_sequence[0, i] = predicted_id

    # Convert predicted token IDs to words
    translated_sentence = ' '.join(target_tokenizer.index_word.get(id, '') for id in predicted_sequence if id > 0)
    return translated_sentence

# Get input from the user
input_sentence = input("Enter an English sentence: ")

# Translate and print the output
translated_sentence = translate_sentence(input_sentence)
print(f"Translated Sentence: {translated_sentence}")




Enter an English sentence: hello
Translated Sentence: fume


In [34]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import tensorflow as tf

nltk.download('punkt')
nltk.download('punkt_tab')

def compute_bleu(reference, predicted):
    reference_tokens = nltk.word_tokenize(reference)
    predicted_tokens = nltk.word_tokenize(predicted)

    if len(predicted_tokens) == 0:
        return 0.0

    return sentence_bleu([reference_tokens], predicted_tokens)


def evaluate_bleu(input_sentences, target_sentences, model, source_tokenizer, target_tokenizer, max_source_length, max_target_length):

    def translate_sentence(input_sentence, model, source_tokenizer, target_tokenizer, max_source_length, max_target_length):
        input_sequence = source_tokenizer.texts_to_sequences([input_sentence])
        input_padded = pad_sequences(input_sequence, maxlen=max_source_length, padding='post')

        target_sequence = np.zeros((1, max_target_length - 1))
        start_token = target_tokenizer.word_index.get('<start>', 1)
        end_token = target_tokenizer.word_index.get('<end>', 0)

        target_sequence[0, 0] = start_token

        predicted_sequence = []
        for i in range(1, max_target_length):
            output = model.predict([input_padded, target_sequence], verbose=0)
            predicted_id = np.argmax(output[0, i - 1, :])

            if predicted_id == end_token:
                break

            predicted_sequence.append(predicted_id)
            target_sequence[0, i] = predicted_id

        translated_sentence = ' '.join(target_tokenizer.index_word.get(id, '') for id in predicted_sequence if id > 0)
        return translated_sentence

    total_bleu_score = 0.0
    for i in range(len(input_sentences)):
        input_sentence = input_sentences[i]
        reference_translation = target_sentences[i]

        predicted_translation = translate_sentence(input_sentence, model, source_tokenizer, target_tokenizer, max_source_length, max_target_length)

        bleu_score = compute_bleu(reference_translation, predicted_translation)
        total_bleu_score += bleu_score

        print(f"Input: {input_sentence}")
        print(f"Reference: {reference_translation}")
        print(f"Translated: {predicted_translation}")
        print(f"BLEU: {bleu_score}")
        print("-" * 20)

    average_bleu_score = total_bleu_score / len(input_sentences)
    return average_bleu_score


# Example usage (in your new cell):

model_path = "/content/drive/MyDrive/translation_model.h5"  # Replace with your actual path
source_tokenizer_path = "/content/source_tokenizer.pkl"  # Replace with your actual path
target_tokenizer_path = "/content/target_tokenizer.pkl"  # Replace with your actual path
test_data_path = '/content/drive/MyDrive/eng_-french.csv' # Replace with your actual path

model = tf.keras.models.load_model(model_path)
with open(source_tokenizer_path, "rb") as source_file:
    source_tokenizer = pickle.load(source_file)
with open(target_tokenizer_path, "rb") as target_file:
    target_tokenizer = pickle.load(target_file)

max_source_length = 4  # Replace with your actual value
max_target_length = 10  # Replace with your actual value

test_data = pd.read_csv(test_data_path, header=None, names=['English', 'French'])

test_data_sliced = test_data.iloc[5000:5100] # or whatever your test range is
test_data_sliced = test_data_sliced.reset_index(drop=True) # Important: Reset the index!

test_english_sentences = test_data_sliced['English']
test_french_sentences = test_data_sliced['French']

average_bleu = evaluate_bleu(test_english_sentences, test_french_sentences, model, source_tokenizer, target_tokenizer, max_source_length, max_target_length)
print(f"Average BLEU Score on Test Data: {average_bleu}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Input: Don't deny it.
Reference: Ne le nie pas !
Translated: le lirai
BLEU: 3.418291552750845e-232
--------------------
Input: Don't deny it.
Reference: Ne le niez pas !
Translated: le lirai
BLEU: 3.418291552750845e-232
--------------------
Input: Don't despair.
Reference: Ne désespérez pas !
Translated: te prie
BLEU: 0
--------------------
Input: Don't despair.
Reference: Ne désespère pas !
Translated: te prie
BLEU: 0
--------------------
Input: Don't do that.
Reference: Ne fais pas cela.
Translated: le déteste
BLEU: 0
--------------------
Input: Don't do that.
Reference: Ne fais pas ça.
Translated: le déteste
BLEU: 0
--------------------
Input: Don't do that.
Reference: Ne faites pas ça.
Translated: le déteste
BLEU: 0
--------------------
Input: Don't do this.
Reference: Ne fais pas ça !
Translated: le déteste
BLEU: 0
--------------------
Input: Don't do this.
Reference: Ne faites pas ça !
Translated: le déteste
BLEU: 0
--------------------
Input: Don't get fat.
Reference: Ne deviens