In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = "drive/My Drive/Project/EngToFrench.csv"

In [None]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm
!pip install pandas
!pip install textblob
!pip install langdetect

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.p

In [None]:
import pandas as pd
import re
from langdetect import detect
from textblob import TextBlob

# Load the data
file_path = 'EngToFrench.csv'  # Replace with your file path
data = pd.read_csv(path, encoding='latin-1')

# Display first few rows to inspect
print(data.head())


  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!              Cours??!
2                    Run!             Courez??!
3                    Who?                  Qui ?
4                    Wow!          ??a alors??!


In [None]:
def normalize_text(text):
    return text.lower().strip()  # Convert to lowercase and remove extra spaces


In [None]:
def simple_tokenize(text):
    return text.split()  # Tokenize by splitting on whitespace


In [None]:
def remove_special_characters(text):
    return re.sub(r'[^\w\s]', '', text)  # Remove special characters except spaces


In [None]:
def handle_numbers(text):
    return re.sub(r'\d+', '<number>', text)  # Replace numbers with a placeholder


In [None]:
# Apply text normalization
data['English Normalized'] = data['English words/sentences'].apply(normalize_text)
data['French Normalized'] = data['French words/sentences'].apply(normalize_text)

# Apply tokenization
data['English Tokenized'] = data['English Normalized'].apply(simple_tokenize)
data['French Tokenized'] = data['French Normalized'].apply(simple_tokenize)

# Remove special characters
data['English Cleaned'] = data['English Normalized'].apply(remove_special_characters)
data['French Cleaned'] = data['French Normalized'].apply(remove_special_characters)

# Handle numbers
data['English Numbers Handled'] = data['English Cleaned'].apply(handle_numbers)
data['French Numbers Handled'] = data['French Cleaned'].apply(handle_numbers)


In [None]:
import pickle
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer  # Updated import

# Initialize Tokenizers if they are not already defined
try:
    # Load existing tokenizers
    with open('/content/drive/MyDrive/Project/eng_tokenizer.pkl', 'rb') as f:
        eng_tokenizer = pickle.load(f)
    with open('/content/drive/MyDrive/Project/fr_tokenizer.pkl', 'rb') as f:
        fr_tokenizer = pickle.load(f)
    print("Tokenizers loaded successfully.")
except (FileNotFoundError, EOFError):
    print("Tokenizer files not found or corrupted. Initializing new tokenizers.")
    eng_tokenizer = Tokenizer()
    fr_tokenizer = Tokenizer()
    # Add tokenizer fitting logic if required, e.g.:
    # eng_tokenizer.fit_on_texts(english_sentences)
    # fr_tokenizer.fit_on_texts(french_sentences)

# Save Tokenizers
try:
    with open('/content/drive/MyDrive/Project/eng_tokenizer.pkl', 'wb') as f:
        pickle.dump(eng_tokenizer, f)
    with open('/content/drive/MyDrive/Project/fr_tokenizer.pkl', 'wb') as f:
        pickle.dump(fr_tokenizer, f)
    print("Tokenizers saved successfully.")
except Exception as e:
    print(f"Error saving tokenizers: {e}")

# Ensure `data` is preprocessed and defined
try:
    # Simulate preprocessed data creation if `data` is undefined
    if 'data' not in locals():
        # Example of creating dummy data (replace with your actual preprocessing logic)
        data = pd.DataFrame({
            "English": ["Hello", "How are you?", "Goodbye"],
            "French": ["Bonjour", "Comment ça va?", "Au revoir"]
        })
        print("Dummy data created for demonstration.")

    # Save the preprocessed data to a new file
    data.to_csv('/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv', index=False)

    # Print a sample of the preprocessed data
    print(data.head())
except NameError as e:
    print(f"Error: {e}. Make sure 'data' is defined and contains your preprocessed data.")
except Exception as e:
    print(f"Error with data processing: {e}")



Tokenizers loaded successfully.
Tokenizers saved successfully.
  English words/sentences French words/sentences English Normalized  \
0                     Hi.                 Salut!                hi.   
1                    Run!              Cours??!               run!   
2                    Run!             Courez??!               run!   
3                    Who?                  Qui ?               who?   
4                    Wow!          ??a alors??!               wow!   

  French Normalized English Tokenized  French Tokenized English Cleaned  \
0            salut!             [hi.]          [salut!]              hi   
1         cours??!            [run!]       [cours??!]             run   
2        courez??!            [run!]      [courez??!]             run   
3             qui ?            [who?]          [qui, ?]             who   
4     ??a alors??!            [wow!]  [??a, alors??!]             wow   

  French Cleaned English Numbers Handled French Numbers Han

In [None]:
!python -m textblob.download_corpora


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [None]:
def remove_duplicates(df):
    # If your column contains lists, you can convert them into strings
    for col in df.columns:
        df[col] = df[col].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

    return df.drop_duplicates()

In [None]:
def detect_language(text):
    try:
        return detect(text)
    except:
        return None


In [None]:
def filter_sentence_length(text, min_len=2, max_len=100):
    words = text.split()
    return min_len <= len(words) <= max_len


In [None]:
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())


In [None]:
def check_consistency(row):
    # Ensure both sentences have similar lengths
    eng_len = len(row['English words/sentences'].split())
    fr_len = len(row['French words/sentences'].split())
    return abs(eng_len - fr_len) <= 3


In [None]:
def remove_noise(text):
    # Example: Remove placeholder tokens or extra whitespace
    return text.replace('<number>', '').strip()


In [None]:
# 1. Remove duplicates
data = remove_duplicates(data)


# Convert lists back to strings if needed
data['English words/sentences'] = data['English words/sentences'].apply(
    lambda x: ' '.join(x) if isinstance(x, list) else x
)
data['French words/sentences'] = data['French words/sentences'].apply(
    lambda x: ' '.join(x) if isinstance(x, list) else x
)


# Apply cleaning functions
data['English Normalized'] = data['English words/sentences'].apply(normalize_text)
data['French Normalized'] = data['French words/sentences'].apply(normalize_text)
data['French words/sentences'] = data['French words/sentences'].apply(
    lambda x: ' '.join(x) if isinstance(x, list) else x
)

# 2. Detect language
data['English Language'] = data['English words/sentences'].apply(detect_language)
data['French Language'] = data['French words/sentences'].apply(detect_language)

# Filter out rows where language detection fails
data = data[(data['English Language'] == 'en') & (data['French Language'] == 'fr')]

# 3. Filter short/long sentences
data = data[data['English words/sentences'].apply(lambda x: filter_sentence_length(x))]
data = data[data['French words/sentences'].apply(lambda x: filter_sentence_length(x))]

# 4. Correct spelling errors
data['English Corrected'] = data['English words/sentences'].apply(correct_spelling)
data['French Corrected'] = data['French words/sentences']  # Skip for non-English

# 5. Contextual relevance (skipped as it requires domain knowledge)

# 6. Check consistency
data['Consistent'] = data.apply(check_consistency, axis=1)
data = data[data['Consistent']]

# 7. Remove noise
data['English Cleaned'] = data['English Corrected'].apply(remove_noise)
data['French Cleaned'] = data['French Corrected'].apply(remove_noise)

# 8. Quality control (skipped as it depends on specific requirements)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['English Cleaned'] = data['English Corrected'].apply(remove_noise)


In [None]:
# Save the cleaned data
#data.to_csv('Cleaned_EngToFrench.csv', index=False)
# Save preprocessed data

#data.to_csv('/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv', index=False)
preprocessed_path = '/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv'
data.to_csv(preprocessed_path, index=False)
print(f"Preprocessed data saved to {preprocessed_path}")

# Display a sample of the cleaned data
print(data.head())


Preprocessed data saved to /content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv
    English words/sentences French words/sentences English Normalized  \
78                 Be cool.        Sois d??tendu !           be cool.   
99                 Come in.                Entre !           come in.   
125                Go home.  Rentrez ?ÿ la maison.           go home.   
126                Go home.   Rentre ?ÿ la maison.           go home.   
146                Hold on.        Ne quittez pas.           hold on.   

         French Normalized English Tokenized       French Tokenized  \
78         sois d??tendu !          be cool.        sois d??tendu !   
99                 entre !          come in.                entre !   
125  rentrez ?ÿ la maison.          go home.  rentrez ?ÿ la maison.   
126   rentre ?ÿ la maison.          go home.   rentre ?ÿ la maison.   
146        ne quittez pas.          hold on.        ne quittez pas.   

    English Cleaned         French Cleaned Engl

In [None]:
# Save Preprocessed Data
preprocessed_path = '/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv'
data.to_csv(preprocessed_path, index=False)
print(f"Preprocessed data saved to {preprocessed_path}")

#Load Data
preprocessed_path = '/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv'
data = pd.read_csv(preprocessed_path)
print("Preprocessed data loaded successfully.")


Preprocessed data saved to /content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv
Preprocessed data loaded successfully.


In [None]:
# Install necessary libraries
!pip install nltk spacy pandas textblob langdetect
!python -m spacy download en_core_web_sm
print("Environment setup completed.")

# Mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive')


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Environment setup completed.


In [None]:
data.tail()

Unnamed: 0,English words/sentences,French words/sentences,English Normalized,French Normalized,English Tokenized,French Tokenized,English Cleaned,French Cleaned,English Numbers Handled,French Numbers Handled,English Language,French Language,English Corrected,French Corrected,Consistent
38095,"Give me a coffee, please.","Donnez-moi un caf??, s'il vous plait.","give me a coffee, please.","donnez-moi un caf??, s'il vous plait.","give me a coffee, please.","donnez-moi un caf??, s'il vous plait.","Give me a coffee, please.","Donnez-moi un caf??, s'il vous plait.",give me a coffee please,donnezmoi un caf sil vous plait,en,fr,"Give me a coffee, please.","Donnez-moi un caf??, s'il vous plait.",True
38096,"Give me a coffee, please.","Donnez-moi un caf??, je vous prie.","give me a coffee, please.","donnez-moi un caf??, je vous prie.","give me a coffee, please.","donnez-moi un caf??, je vous prie.","Give me a coffee, please.","Donnez-moi un caf??, je vous prie.",give me a coffee please,donnezmoi un caf je vous prie,en,fr,"Give me a coffee, please.","Donnez-moi un caf??, je vous prie.",True
38097,Give me a glass of water.,"Donnez-moi un verre d'eau, s'il vous pla??t.",give me a glass of water.,"donnez-moi un verre d'eau, s'il vous pla??t.",give me a glass of water.,"donnez-moi un verre d'eau, s'il vous pla??t.",Give me a glass of water.,"Donnez-moi un verre d'eau, s'il vous pla??t.",give me a glass of water,donnezmoi un verre deau sil vous plat,en,fr,Give me a glass of water.,"Donnez-moi un verre d'eau, s'il vous pla??t.",True
38098,Give me a hand with this.,Donne-moi un coup de main pour ??a.,give me a hand with this.,donne-moi un coup de main pour ??a.,give me a hand with this.,donne-moi un coup de main pour ??a.,Give me a hand with this.,Donne-moi un coup de main pour ??a.,give me a hand with this,donnemoi un coup de main pour a,en,fr,Give me a hand with this.,Donne-moi un coup de main pour ??a.,True
38099,Give me a hand with this.,Donnez-moi un coup de main pour ceci.,give me a hand with this.,donnez-moi un coup de main pour ceci.,give me a hand with this.,donnez-moi un coup de main pour ceci.,Give me a hand with this.,Donnez-moi un coup de main pour ceci.,give me a hand with this,donnezmoi un coup de main pour ceci,en,fr,Give me a hand with this.,Donnez-moi un coup de main pour ceci.,True


In [None]:
from google.colab import drive
drive.mount('/content/drive')
#Prepare Data for traning

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer  # Import Tokenizer

# Load the preprocessed data
#data = pd.read_csv('/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv')
data = pd.read_csv('/content/drive/MyDrive/Project/Preprocessed_EngToFrench.csv', encoding='latin-1')

# Initialize the tokenizers (this is the crucial part)
eng_tokenizer = Tokenizer()
fr_tokenizer = Tokenizer()

# Tokenize English and French text
eng_tokenizer.fit_on_texts(data['English Cleaned'])
fr_tokenizer.fit_on_texts(data['French Cleaned'])

# Convert text to sequences
eng_sequences = eng_tokenizer.texts_to_sequences(data['English Cleaned'])
fr_sequences = fr_tokenizer.texts_to_sequences(data['French Cleaned'])

# Padding the sequences
max_len = 20  # Maximum length of sequences (adjust as needed)
X_data = pad_sequences(eng_sequences, maxlen=max_len, padding='post')
y_data = pad_sequences(fr_sequences, maxlen=max_len, padding='post')

# Split into training and testing sets (70% for training, 30% for testing)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

# Model parameters
latent_dim = 256  # Latent dimension size
vocab_size = len(eng_tokenizer.word_index) + 1  # Encoder vocab size (English)
target_vocab_size = len(fr_tokenizer.word_index) + 1  # Decoder vocab size (French)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Build the NMT model.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.optimizers import Adam

# Encoder
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=target_vocab_size, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs_final = decoder_dense(decoder_outputs)

# Build the full model
nmt_model = Model([encoder_inputs, decoder_inputs], decoder_outputs_final)

# Compile the model with Adam optimizer and sparse categorical crossentropy loss
optimizer = Adam(clipvalue=1.0)  # Gradient clipping
nmt_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
#Prepare Decoding data for Traning
# Prepare decoder input and target output (shifted) with padding
import tensorflow as tf # Import TensorFlow

decoder_input_data = pad_sequences(y_train[:, :-1], maxlen=max_len, padding='post')
decoder_target_data = pad_sequences(y_train[:, 1:], maxlen=max_len, padding='post')

# Train the model
nmt_model.fit(
    [X_train, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),  # Add an extra dimension for the target labels
    epochs=10,  # You can adjust the epochs
    batch_size=64,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]  # Stop early if no improvement
)

Epoch 1/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m919s[0m 3s/step - accuracy: 0.7818 - loss: 2.5242 - val_accuracy: 0.8031 - val_loss: 1.3495
Epoch 2/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m913s[0m 3s/step - accuracy: 0.8063 - loss: 1.3015 - val_accuracy: 0.8128 - val_loss: 1.2545
Epoch 3/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m938s[0m 3s/step - accuracy: 0.8168 - loss: 1.1911 - val_accuracy: 0.8229 - val_loss: 1.1615
Epoch 4/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m926s[0m 3s/step - accuracy: 0.8268 - loss: 1.0811 - val_accuracy: 0.8328 - val_loss: 1.0748
Epoch 5/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m948s[0m 3s/step - accuracy: 0.8395 - loss: 0.9636 - val_accuracy: 0.8424 - val_loss: 0.9948
Epoch 6/10
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m972s[0m 3s/step - accuracy: 0.8495 - loss: 0.8646 - val_accuracy: 0.8501 - val_loss: 0.9341
Epoch 7/10
[1m334/334

<keras.src.callbacks.history.History at 0x79206790ee30>

In [None]:
# Prepare test decoder input and target output
from tensorflow.keras.preprocessing.sequence import pad_sequences
print("Preparing test decoder input and target output...")

decoder_input_test = pad_sequences(y_test[:, :-1], maxlen=max_len, padding='post')  # Decoder inputs
decoder_target_test = pad_sequences(y_test[:, 1:], maxlen=max_len, padding='post')  # Decoder outputs

print("Test data prepared successfully!")


Preparing test decoder input and target output...
Test data prepared successfully!


In [None]:
# Evaluate the model on the test set
print("Evaluating the model on the test data...")

loss, accuracy = nmt_model.evaluate(
    [X_test, decoder_input_test],
    np.expand_dims(decoder_target_test, -1),  # Expand dimension for compatibility
    batch_size=64
)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Evaluating the model on the test data...
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 742ms/step - accuracy: 0.8716 - loss: 0.7620
Test Loss: 0.7658
Test Accuracy: 0.8714


In [None]:
# Save the trained model to a file
model_path = '/content/drive/MyDrive/Project/nmt_model.h5'

nmt_model.save(model_path)  # Save the model in H5 format
print(f"Model saved successfully to {model_path}")




Model saved successfully to /content/drive/MyDrive/Project/nmt_model.h5


In [3]:
# Load the saved model
from tensorflow.keras.models import load_model

print("Loading the model...")
loaded_model = load_model(model_path)

print("Model loaded successfully!")
loaded_model.summary()  # Display the model structure


Loading the model...


NameError: name 'model_path' is not defined

In [4]:
#Define Inference Models
# Extract encoder layers
encoder_inputs = loaded_model.input[0]  # Encoder input
encoder_outputs, state_h, state_c = loaded_model.layers[4].output  # Encoder LSTM outputs and states
encoder_states = [state_h, state_c]

# Define encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Extract decoder layers
latent_dim = 256  # Latent dimension size (same as training)
decoder_inputs = loaded_model.input[1]  # Decoder input
decoder_embedding = loaded_model.layers[3](decoder_inputs)  # Embedding layer
decoder_lstm = loaded_model.layers[5]  # Decoder LSTM layer
decoder_dense = loaded_model.layers[6]  # Dense output layer

# Define placeholders for decoder states
decoder_state_input_h = Input(shape=(latent_dim,), name='input_h')
decoder_state_input_c = Input(shape=(latent_dim,), name='input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse decoder LSTM and Dense layers
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Define decoder model for inference
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

print("Inference models created successfully!")


NameError: name 'loaded_model' is not defined

In [None]:
#Prepare Reverse Tokenizer for Decoding
# Reverse-lookup dictionaries for decoding sequences back to words
reverse_eng_index = {v: k for k, v in eng_tokenizer.word_index.items()}  # English
reverse_fr_index = {v: k for k, v in fr_tokenizer.word_index.items()}  # French


In [None]:
#Define the Translation Function
def translate_sentence(input_sentence, max_len=20):
    """
    Translate an input English sentence to French using the trained NMT model.
    """
    # Preprocess the input sentence
    input_sentence = input_sentence.lower().strip()  # Normalize input
    input_sequence = eng_tokenizer.texts_to_sequences([input_sentence])  # Convert to sequence
    input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='post')  # Pad the sequence

    # Encode the input sentence to get states
    states_value = encoder_model.predict(input_sequence)

    # Initialize target sequence with <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_tokenizer.word_index['je']  # Replace 'start' with your actual start token

    # Generate the translated sentence
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        # Predict the next word and update states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the word index with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_fr_index.get(sampled_token_index, '')

        # Stop if <end> token or max length reached
        if sampled_word == "salutations" or len(decoded_sentence.split()) >= max_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update the target sequence and states
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()


In [1]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout
import numpy as np
import os

# Tokenizers for English and French
eng_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
french_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

# Check if tokenizer files exist
eng_tokenizer_path = "/content/drive/MyDrive/Project/eng_tokenizer.pkl"
fr_tokenizer_path = "/content/drive/MyDrive/Project/french_tokenizer.pkl"

if os.path.exists(eng_tokenizer_path) and os.path.exists(fr_tokenizer_path):
    print("Tokenizer files found. Loading...")
    # Load tokenizers using pickle
    with open(eng_tokenizer_path, "rb") as eng_file, open(fr_tokenizer_path, "rb") as fr_file:
        eng_tokenizer = pickle.load(eng_file)
        french_tokenizer = pickle.load(fr_file)
else:
    print("Tokenizer files not found in the specified location.")
    print("Please make sure the paths are correct and the files exist.")
    # Handle the case where files are not found (e.g., exit or raise an exception)
    # ... your error handling code here ...


# Load tokenizers (replace these paths with actual tokenizer files)
# Assume you have saved tokenizer configs during training
# Load tokenizers
# Update file paths to your Google Drive location
with open("/content/drive/MyDrive/Project/eng_tokenizer.pkl", "rb") as eng_file, open("/content/drive/MyDrive/Project/fr_tokenizer.pkl", "rb") as fr_file:
    eng_tokenizer = pickle.load(eng_file)  # Load using pickle
    french_tokenizer = pickle.load(fr_file)  # Load using pickle

# ... other code ...

# Create a target sequence with the start token
# Check if 'je' is in word_index, if not, add it
start_token = 'je' # Set start_token to 'je'
if start_token not in french_tokenizer.word_index:
    # Get the next available index (max existing index + 1)
    next_index = max(french_tokenizer.word_index.values(), default=0) + 1
    french_tokenizer.word_index[start_token] = next_index

target_seq = np.zeros((1, 1))
target_seq[0, 0] = french_tokenizer.word_index[start_token]

# Example sentences to translate
example_sentences = [
    "hello",
    "What is your name?",
    "I love learning languages.",
    "Have a great day!"
]

# Function to prepare input sequence for the model
def prepare_sequence(input_sentence, tokenizer, max_length):
    input_sequence = tokenizer.texts_to_sequences([input_sentence])
    padded_sequence = pad_sequences(input_sequence, maxlen=max_length, padding="post")
    return padded_sequence

# Function to decode prediction back to text
def decode_sequence(predicted_sequence, tokenizer):
    word_index = {idx: word for word, idx in tokenizer.word_index.items()}
    return " ".join([word_index.get(idx, "") for idx in predicted_sequence if idx != 0])

# Translate each sentence
max_input_length = 20  # Replace with the actual max length used in training
max_output_length = 20  # Replace with the actual max length used in training

for sentence in example_sentences:
    prepared_input = prepare_sequence(sentence.lower(), eng_tokenizer, max_input_length)

    # Get the initial state for the decoder using the encoder model
    initial_state = encoder_model.predict(prepared_input)

    # Create a target sequence with the start token
    # Check if 'start' is in word_index, if not use 'je' as start token
    start_token = '<start>' if '<start>' in french_tokenizer.word_index else 'je'
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_tokenizer.word_index[start_token]

    # Generate the translation
    decoded_sentence = ''
    for _ in range(max_output_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + initial_state)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_fr_index.get(sampled_token_index, '')

        # Check for end token or max length
        end_token = '<end>' if '<end>' in french_tokenizer.word_index else 'salutations'
        if sampled_word == end_token or len(decoded_sentence.split()) >= max_output_length:
            break

        decoded_sentence += ' ' + sampled_word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        initial_state = [h, c]

    translation = decoded_sentence.strip()
    print(f"English: {sentence}")
    print(f"French: {translation}\n")

Tokenizer files not found in the specified location.
Please make sure the paths are correct and the files exist.


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Project/eng_tokenizer.pkl'