In [2]:
# Load the tab-separated parallel corpus into `pairs`, a list of
# [first_column, second_column] sentence pairs with both sides cleaned.
# NOTE: clean_text() must already be defined in the kernel when this cell runs.
# The encoding is given explicitly so accented characters decode the same way on
# every platform instead of depending on the locale default
# (assumes the corpus file is UTF-8 encoded — TODO confirm).
with open("french-english.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Split every line into its tab-separated fields.
pairs = [line.strip().split("\t") for line in lines]
# Drop blank lines and lines with fewer than two fields; indexing pair[1] on
# those would raise IndexError.
pairs = [pair for pair in pairs if len(pair) >= 2]
# clean the text in each pair (only the first two fields are kept)
pairs = [[clean_text(pair[0]), clean_text(pair[1])] for pair in pairs]


FileNotFoundError: [Errno 2] No such file or directory: 'french-english.txt'

In [1]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Split the data into train and test sets (80% / 20%, fixed seed so the split
# is reproducible across runs)
train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)

# Create a tokenizer for the source language, fitted on the training split only.
# The vocabulary size is word_index + 1 because word indices start at 1 and
# index 0 is the value pad_sequences uses for padding.
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts([pair[0] for pair in train_pairs])
source_vocab_size = len(source_tokenizer.word_index) + 1

# Create a tokenizer for the target language (same scheme as the source side)
target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts([pair[1] for pair in train_pairs])
target_vocab_size = len(target_tokenizer.word_index) + 1

# Define the maximum sequence length for input and output sequences, measured in
# whitespace-separated words over the training split
max_length_input = max(len(pair[0].split()) for pair in train_pairs)
max_length_output = max(len(pair[1].split()) for pair in train_pairs)

# Encode and pad the input sequences to a fixed length
train_input_sequences = source_tokenizer.texts_to_sequences([pair[0] for pair in train_pairs])
train_input_data = pad_sequences(train_input_sequences, maxlen=max_length_input)

# Encode and pad the output sequences to a fixed length
train_output_sequences = target_tokenizer.texts_to_sequences([pair[1] for pair in train_pairs])
train_output_padded = pad_sequences(train_output_sequences, maxlen=max_length_output)

# One-hot encode the output sequences. num_classes is passed explicitly so the
# last dimension always equals target_vocab_size instead of being inferred from
# the largest index that happens to be present in the data.
train_output_data = to_categorical(train_output_padded, num_classes=target_vocab_size)


ModuleNotFoundError: No module named 'keras'

In [None]:
# Sample code for step 2


from keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from keras.models import Sequential

# Define the model: an encoder LSTM compresses the source sentence into one
# vector, which is repeated for every output position and decoded by a second
# LSTM into a per-position softmax over the target vocabulary.
model = Sequential()

# Add the encoder layers
model.add(Embedding(source_vocab_size, 128, input_length=max_length_input))
# return_sequences is left at its default (False) so the encoder emits a single
# 2D (batch, 256) vector; RepeatVector only accepts 2D input and would reject
# the 3D per-timestep output produced with return_sequences=True.
model.add(LSTM(256))

# Add the repeat vector layer: one copy of the encoding per output time step
model.add(RepeatVector(max_length_output))

# Add the decoder layers; return_sequences=True keeps one output per time step
# so TimeDistributed can apply the softmax classifier at every position
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(target_vocab_size, activation='softmax')))

# Compile the model (categorical cross-entropy expects one-hot encoded targets)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_input_data, train_output_data, epochs=50, batch_size=32)



In [None]:
# Sample code for step 3
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm

# Turn the held-out source sentences into padded index sequences, using the same
# tokenizer and maximum length as the training inputs.
test_input_sequences = source_tokenizer.texts_to_sequences(
    [entry[0] for entry in test_pairs]
)
test_input_data = pad_sequences(test_input_sequences, maxlen=max_length_input)

# Run the model, take the arg-max index along the last axis of its output, and
# map the resulting index sequences back to text with the target tokenizer.
predicted_ids = model.predict(test_input_data).argmax(axis=-1)
predictions = target_tokenizer.sequences_to_texts(predicted_ids)

# Accumulate the sentence-level BLEU of each prediction against its single
# reference translation (both sides split on whitespace).
bleu_score = 0
for idx, entry in enumerate(tqdm(test_pairs)):
    bleu_score += sentence_bleu([entry[1].split()], predictions[idx].split())

# Report the mean over the test set.
bleu_score /= len(test_pairs)
print("BLEU Score: ", bleu_score)


In [None]:
import re
def clean_text(text):
    """Normalise a sentence to lowercase ASCII letters and whitespace.

    Accented letters are folded to their base letter ("é" -> "e", "ç" -> "c")
    and the ligatures œ/æ are expanded, rather than being deleted outright,
    so that French words keep all of their letters. Digits, punctuation and
    any remaining non-ASCII characters are removed.

    Args:
        text: The raw sentence as a str.

    Returns:
        The cleaned, lowercased string. Whitespace is kept as-is (it is not
        collapsed or stripped).
    """
    import unicodedata  # local import keeps this cell self-contained

    # Expand ligatures that Unicode normalisation does not decompose.
    for ligature, expansion in (
        ("\u0153", "oe"),  # œ
        ("\u0152", "OE"),  # Œ
        ("\u00e6", "ae"),  # æ
        ("\u00c6", "AE"),  # Æ
    ):
        text = text.replace(ligature, expansion)

    # Decompose accented characters into base letter + combining mark, then
    # drop the combining marks so only the base letter survives.
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # remove any remaining non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # remove punctuations and non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # convert to lowercase
    text = text.lower()
    return text