<a href="https://colab.research.google.com/github/Sunday-Okey/NLP_Project_Machine_Translation/blob/master/Project_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the libraries
import collections
import re
import numpy as np


from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout
from tensorflow.keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import os
import string
import requests
from pickle import load
from pickle import dump
from collections import Counter
# import project_tests as tests
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# load a clean dataset
def load_clean_sentences(filename):
    """
    Load a pickled dataset of cleaned sentences from disk.
    :param filename: Path of the pickle file to read
    :return: The object that was stored in the pickle file
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file handle even if unpickling raises.
    with open(filename, 'rb') as pickle_file:
        return load(pickle_file)

In [None]:
# Load the French and English sentence collections from their pickle files
french_sentences, english_sentences = (
    load_clean_sentences(pickle_path) for pickle_path in ('french.pkl', 'english.pkl')
)

In [None]:
len(french_sentences)

328245

In [None]:
len(english_sentences)

328245

In [None]:
# Show the first two sentence pairs, English line followed by its French counterpart
for line_number in (1, 2):
    position = line_number - 1
    print('small_vocab_en Line {}:  {}'.format(line_number, english_sentences[position]))
    print('small_vocab_fr Line {}:  {}'.format(line_number, french_sentences[position]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn and it is snowy in april
small_vocab_fr Line 1:  new jersey est parfois calme pendant l automne et il est neigeux en avril
small_vocab_en Line 2:  the united states is usually chilly during july and it is usually freezing in november
small_vocab_fr Line 2:  les etatsunis est generalement froid en juillet et il gele habituellement en novembre


In [None]:
# Count word frequencies once per language; every figure printed below is derived from these counters
english_words_counter = collections.Counter(word for sentence in english_sentences for word in sentence.split())
french_words_counter = collections.Counter(word for sentence in french_sentences for word in sentence.split())

# sum(counter.values()) is the total number of tokens, so the corpus is not flattened a second time
print('{} English words.'.format(sum(english_words_counter.values())))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
# most_common yields (word, count) pairs; only the words are shown (also safe for an empty counter)
print('"' + '" "'.join(word for word, _count in english_words_counter.most_common(10)) + '"')
print()
print('{} French words.'.format(sum(french_words_counter.values())))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(word for word, _count in french_words_counter.most_common(10)) + '"')

2744485 English words.
21553 unique English words.
10 Most common words in the English dataset:
"is" "the" "it" "in" "during" "and" "but" "never" "sometimes" "usually"

2961971 French words.
31712 unique French words.
10 Most common words in the French dataset:
"est" "en" "il" "la" "les" "le" "et" "mais" "de" "a"


In [None]:
def tokenize(x):
    """
    Turn sentences into sequences of integer word ids.
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    # Fit a fresh tokenizer so that its word -> index vocabulary is built from x
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)

    # Encode every sentence with that vocabulary and return both pieces
    return fitted_tokenizer.texts_to_sequences(x), fitted_tokenizer



# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .',
]
text_tokenized, text_tokenizer = tokenize(text_sentences)

# Vocabulary learned from the three example sentences
print(text_tokenizer.word_index)
print()

# Each raw sentence next to its encoded form
for sample_i, sent in enumerate(text_sentences):
    token_sent = text_tokenized[sample_i]
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [None]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # Identity comparison is the correct test for None (PEP 8); `== None` can be
    # intercepted by objects that overload __eq__.
    if length is None:
        # default=0 keeps an empty x from raising ValueError inside max()
        length = max((len(sequence) for sequence in x), default=0)

    # Zeros are appended after each sequence ('post') until it is `length` long
    return pad_sequences(sequences=x, maxlen=length, padding='post', value=0)



# Pad Tokenized output
test_pad = pad(text_tokenized)

# Each encoded sentence next to its zero-padded row
for sample_i, token_sent in enumerate(text_tokenized):
    pad_sent = test_pad[sample_i]
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [None]:
def preprocess(x, y):
    """
    Tokenize and pad a pair of parallel sentence lists.
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    # Each side gets its own tokenizer, fit on that side's sentences
    x_sequences, x_tk = tokenize(x)
    y_sequences, y_tk = tokenize(y)

    # Each side is padded to its own longest sequence
    preprocess_x = pad(x_sequences)
    padded_y = pad(y_sequences)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions,
    # so a trailing axis of size 1 is appended to the labels
    preprocess_y = np.expand_dims(padded_y, axis=-1)

    return preprocess_x, preprocess_y, x_tk, y_tk

# Tokenize and pad the complete corpus (both languages)
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

# Vocabulary size = number of distinct words each tokenizer learned
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# Axis 1 of each padded array is the padded sentence length
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

print('Data Preprocessed')
for label, value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(label, value)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 95
English vocabulary size: 21553
French vocabulary size: 31712


In [None]:
# Hold out 20% of the sentence pairs for testing; the fixed random_state makes the split repeatable
(english_sentences_train, english_sentences_test,
 french_sentences_train, french_sentences_test) = train_test_split(
    english_sentences,
    french_sentences,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Calculate the training set stats

# Count word frequencies once per language for the training split
english_train_words_counter = collections.Counter(word for sentence in english_sentences_train for word in sentence.split())
french_train_words_counter = collections.Counter(word for sentence in french_sentences_train for word in sentence.split())

# sum(counter.values()) is the total number of tokens, so the split is not flattened a second time
print('{} English words in the training set.'.format(sum(english_train_words_counter.values())))
print('{} unique English words in the training set.'.format(len(english_train_words_counter)))
print('10 Most common words in the English training dataset:')
# most_common yields (word, count) pairs; only the words are shown (also safe for an empty counter)
print('"' + '" "'.join(word for word, _count in english_train_words_counter.most_common(10)) + '"')
print()
print('{} French words in the training set.'.format(sum(french_train_words_counter.values())))
print('{} unique French words in the training set.'.format(len(french_train_words_counter)))
print('10 Most common words in the French training dataset:')
print('"' + '" "'.join(word for word, _count in french_train_words_counter.most_common(10)) + '"')

2195514 English words in the training set.
19768 unique English words in the training set.
10 Most common words in the English training dataset:
"is" "the" "it" "in" "during" "and" "but" "sometimes" "never" "usually"

2368521 French words in the training set.
28904 unique French words in the training set.
10 Most common words in the French training dataset:
"est" "en" "il" "la" "les" "le" "et" "mais" "de" "a"


In [None]:
# Calculate the test set stats

# Count word frequencies once per language for the test split
english_test_words_counter = collections.Counter(word for sentence in english_sentences_test for word in sentence.split())
french_test_words_counter = collections.Counter(word for sentence in french_sentences_test for word in sentence.split())

# sum(counter.values()) is the total number of tokens, so the split is not flattened a second time
print('{} English words in the test set.'.format(sum(english_test_words_counter.values())))
print('{} unique English words in the test set.'.format(len(english_test_words_counter)))
print('10 Most common words in the English test dataset:')
# most_common yields (word, count) pairs; only the words are shown (also safe for an empty counter)
print('"' + '" "'.join(word for word, _count in english_test_words_counter.most_common(10)) + '"')
print()
print('{} French words in the test set.'.format(sum(french_test_words_counter.values())))
print('{} unique French words in the test set.'.format(len(french_test_words_counter)))
print('10 Most common words in the French test dataset:')
print('"' + '" "'.join(word for word, _count in french_test_words_counter.most_common(10)) + '"')

548971 English words in the test set.
10959 unique English words in the test set.
10 Most common words in the English test dataset:
"is" "the" "it" "in" "during" "and" "but" "usually" "never" "sometimes"

593450 French words in the test set.
15521 unique French words in the test set.
10 Most common words in the French test dataset:
"est" "en" "il" "la" "les" "le" "et" "mais" "de" "a"


In [None]:
# Preprocess the training split of the data

# Fit tokenizers on the training sentences and pad the encoded sequences
(preproc_english_sentences_train,
 preproc_french_sentences_train,
 english_tokenizer_train,
 french_tokenizer_train) = preprocess(english_sentences_train, french_sentences_train)

# Vocabulary size = number of distinct words each training tokenizer learned
english_vocab_size_train = len(english_tokenizer_train.word_index)
french_vocab_size_train = len(french_tokenizer_train.word_index)

# Axis 1 of each padded array is the padded sentence length
max_english_sequence_length_train = preproc_english_sentences_train.shape[1]
max_french_sequence_length_train = preproc_french_sentences_train.shape[1]

print('Training Data Preprocessed')
for label, value in (
    ("Max English train dataset sentence length:", max_english_sequence_length_train),
    ("Max French train dataset sentence length:", max_french_sequence_length_train),
    ("English train dataset vocabulary size:", english_vocab_size_train),
    ("French train dataset vocabulary size:", french_vocab_size_train),
):
    print(label, value)

Training Data Preprocessed
Max English train dataset sentence length: 15
Max French train dataset sentence length: 95
English train dataset vocabulary size: 19768
French train dataset vocabulary size: 28904


In [None]:
# Preprocess the test split of the data 

# Encode the held-out sentences with the tokenizers that were fit on the TRAINING split.
# Calling preprocess() here would fit brand-new tokenizers on the test data, which assigns
# different integer ids to the same words; the model would then be evaluated on inputs and
# labels whose ids mean something else than the ids it was trained on.
# The *_tokenizer_test names are kept as aliases so later cells that reference them still run.
english_tokenizer_test = english_tokenizer_train
french_tokenizer_test = french_tokenizer_train

# NOTE(review): no oov_token is configured in tokenize(), so words that never appeared in
# the training split are presumably dropped by texts_to_sequences - confirm against the Keras docs.
english_sequences_test = english_tokenizer_test.texts_to_sequences(english_sentences_test)
french_sequences_test = french_tokenizer_test.texts_to_sequences(french_sentences_test)

preproc_english_sentences_test = pad(english_sequences_test)
preproc_french_sentences_test = pad(french_sequences_test)

# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
preproc_french_sentences_test = preproc_french_sentences_test.reshape(*preproc_french_sentences_test.shape, 1)

max_english_sequence_length_test = preproc_english_sentences_test.shape[1]
max_french_sequence_length_test = preproc_french_sentences_test.shape[1]

# Number of distinct training-vocabulary word ids that actually occur in the test split
english_vocab_size_test = len({token for sequence in english_sequences_test for token in sequence})
french_vocab_size_test = len({token for sequence in french_sequences_test for token in sequence})

print('Test Data Preprocessed')
print('Max English test dataset sentence length:', max_english_sequence_length_test)
print('Max French test dataset sentence length:', max_french_sequence_length_test)
print('English test datset vocab size:', english_vocab_size_test)
print('French test dataset vocab size', french_vocab_size_test)
print(preproc_french_sentences_test.shape)

Test Data Preprocessed
Max English test dataset sentence length: 15
Max French test dataset sentence length: 91
English test datset vocab size: 10959
French test dataset vocab size 15521
(65649, 91, 1)


In [None]:
def logits_to_text(logits, tokenizer):
    """
    Decode a network's per-position scores into a space-separated sentence.
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    # Invert the tokenizer's word -> index mapping; index 0 is rendered as a padding marker
    vocabulary = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    vocabulary[0] = '<PAD>'

    # The highest-scoring index along axis 1 is the predicted word for each row
    words = []
    for best_index in np.argmax(logits, 1):
        words.append(vocabulary[best_index])
    return ' '.join(words)

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [None]:
### Train the encoder-decoder model on the training split of the dataset ###

def encdec_model_train(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile (but do not train) an encoder-decoder model.
    :param input_shape: Tuple of input shape; the leading (batch) dimension is dropped
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
        (not used by this architecture; kept so existing callers keep working)
    :param french_vocab_size: Number of unique French words in the dataset;
        sets the width of the per-timestep softmax output
    :return: Keras model built, but not trained
    """

    # Encoder: a bidirectional GRU that returns only its final state for the whole input
    input_layer = Input(shape=input_shape[1:])
    encoder = Bidirectional(GRU(256, recurrent_dropout=0.2))(input_layer)

    # Repeat the encoder summary once per output timestep so the decoder can unroll over it
    repeat_vector = RepeatVector(output_sequence_length)(encoder)
    decoder = Bidirectional(GRU(256, return_sequences=True, recurrent_dropout=0.2))(repeat_vector)

    # Per-timestep distribution over the French vocabulary.
    # (The previous extra relu Dense layer was never connected to the output, so it was
    # not part of the model; it has been removed rather than wired in.)
    output_layer = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(decoder)

    model = Model(inputs=input_layer, outputs=output_layer)

    # Compile the model
    learning_rate = 0.01
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
    # rejected by newer Keras releases
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])

    return model

In [None]:
# Stop training when the validation loss has not decreased for 5 consecutive epochs
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)

In [None]:
# Pad the English input to the French sequence length and add a trailing feature axis
padded_english_train = pad(preproc_english_sentences_train, max_french_sequence_length_train)
tmp_x = np.expand_dims(padded_english_train, axis=-1)

# Build the model; each vocabulary size is passed with one extra slot (index 0 is the padding value)
encoder_decoder_model_train = encdec_model_train(
    tmp_x.shape,
    max_french_sequence_length_train,
    english_vocab_size_train + 1,
    french_vocab_size_train + 1,
)

# Train the model, holding back 20% of the training data for validation
encoder_decoder_model_train.fit(
    tmp_x,
    preproc_french_sentences_train,
    batch_size=200,
    epochs=2,
    validation_split=0.2,
    callbacks=[es],
)

# Print the prediction(s)
first_prediction = encoder_decoder_model_train.predict(tmp_x[:5])[0]
print(logits_to_text(first_prediction, french_tokenizer_train))



Epoch 1/2


  super().__init__(name, **kwargs)


Epoch 2/2
ils a a a <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
# Pad the encoded English test sentences to the longest French *test* sentence length.
# NOTE(review): test_x is reassigned in the "Shape the data to fit" cell below (padded to the
# training length instead) before it is used anywhere - confirm whether this cell is still needed.
test_x = pad(preproc_english_sentences_test, max_french_sequence_length_test)

In [None]:
preproc_english_sentences_test.shape

(65649, 15)

In [None]:
preproc_french_sentences_train.shape

(262596, 95, 1)

In [None]:
# Shape the data to fit: inputs and labels are padded to the sequence length used in training
model_sequence_length = max_french_sequence_length_train
model_ready_shape = (-1, model_sequence_length, 1)

test_x = pad(preproc_english_sentences_test, model_sequence_length).reshape(model_ready_shape)
preproc_french_sentences_test = pad(preproc_french_sentences_test, model_sequence_length).reshape(model_ready_shape)

# Confirm that test inputs, training labels and test labels share the same trailing dimensions
for shaped_array in (test_x, preproc_french_sentences_train, preproc_french_sentences_test):
    print(shaped_array.shape)

# Evaluate the model
encoder_decoder_model_score = encoder_decoder_model_train.evaluate(
    test_x,
    preproc_french_sentences_test,
    verbose=0,
)

# Index 1 of the score list is the 'accuracy' metric requested when the model was compiled
print("Encoder-decoder model accuracy on the unseen test data: {0:.2f}%".format(encoder_decoder_model_score[1]*100))


(65649, 95, 1)
(262596, 95, 1)
(65649, 95, 1)
Encoder-decoder model accuracy on the unseen test data: 92.34%


In [None]:
# Decode the model's output for the fourth test sentence and show it above the reference translation
sample_logits = encoder_decoder_model_train.predict(test_x[:4])
print(logits_to_text(sample_logits[3], french_tokenizer_train))
print('\n')
print(french_sentences_test[3])

je crois est <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


cette proposition est a la fois complexe et sujette a controverse


In [None]:
# Save the trained encoder-decoder model to 'my_model.h5' (path is relative to the current working directory)
encoder_decoder_model_train.save('my_model.h5')

65649

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Number of test sentences to score; hypotheses and references must cover the same sentences
bleu_sample_count = 10

# Generate predictions using the encoder-decoder model
predictions = encoder_decoder_model_train.predict(test_x[:bleu_sample_count])

# Collapse each timestep's softmax distribution to the most likely word id
predicted_indices = np.argmax(predictions, axis=2)

# word_index maps word -> id, so it has to be inverted before ids can be decoded.
# The model's output ids are those of french_tokenizer_train (the tokenizer its labels
# were encoded with), not those of the tokenizer fit on the full corpus.
index_to_word = {index: word for word, index in french_tokenizer_train.word_index.items()}

# corpus_bleu scores token lists, not strings, so every hypothesis is kept as a list of words
predicted_sentences = [
    [index_to_word[i] for i in indices if i > 0]  # Exclude padding (id 0)
    for indices in predicted_indices
]

# Use the original held-out French sentences as references. The padded label array holds
# integer ids with a trailing axis of size 1 (it is not one-hot), so argmax over it is always 0.
target_sentences = [sentence.split() for sentence in french_sentences_test[:bleu_sample_count]]

# Calculate BLEU score: one list of references (here a single reference) per hypothesis
bleu_score = corpus_bleu([[reference] for reference in target_sentences], predicted_sentences)
print("BLEU score: {0:.4f}".format(bleu_score))




KeyError: ignored