# CAPSTONE PROJECT 2

## NLP - ENG TO FRE TRANSLATION

## 3) Modeling

In [2]:
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [3]:
# Now I can unpickle the preprocessed sentences from the 2nd Jupyter Notebook for this project: 
#   (CAPSTONE PROJECT 2 - NLP - 2) Processing the data)
import pickle

# Read and unpickle each artefact. Every file is opened in its own `with`
# block so the handle is closed even if `open` or `pickle.load` raises.
# The handle names are kept so later cells that reference them still resolve
# (they are bound to closed files, exactly as before).
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('preproc_english_sentences.pkl', 'rb') as eng2_picklefile:
    preproc_english_sentences = pickle.load(eng2_picklefile)

with open('preproc_french_sentences.pkl', 'rb') as fre2_picklefile:
    preproc_french_sentences = pickle.load(fre2_picklefile)

with open('max_fre_seq_len.pkl', 'rb') as mfsl_picklefile:
    max_french_sequence_length = pickle.load(mfsl_picklefile)

with open('eng_vocab_size.pkl', 'rb') as evs_picklefile:
    english_vocab_size = pickle.load(evs_picklefile)

with open('fre_vocab_size.pkl', 'rb') as fvs_picklefile:
    french_vocab_size = pickle.load(fvs_picklefile)

with open('fre_tokenizer.pkl', 'rb') as ft_picklefile:
    french_tokenizer = pickle.load(ft_picklefile)

with open('eng_tokenizer.pkl', 'rb') as et_picklefile:
    english_tokenizer = pickle.load(et_picklefile)

with open('fre_sentences_subset.pkl', 'rb') as fss_picklefile:
    french_sentences_subset = pickle.load(fss_picklefile)

with open('eng_sentences_subset.pkl', 'rb') as ess_picklefile:
    english_sentences_subset = pickle.load(ess_picklefile)

### RNN with Embedding

In [4]:
def logits_to_text(logits, tokenizer):
    """
    Decode per-timestep network scores into a space-separated sentence.

    For every row of `logits` the highest-scoring column is taken as the
    predicted word id and looked up in the tokenizer's vocabulary.

    :param logits: 2-D array-like of scores, one row per timestep
    :param tokenizer: object exposing `word_index` (word -> integer id)
    :return: String of the predicted words joined by single spaces; id 0 is rendered as '<PAD>'
    """
    vocabulary = tokenizer.word_index
    # Invert word -> id into id -> word; id 0 is shown as the padding marker.
    id_to_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    id_to_word[0] = '<PAD>'

    words = []
    for word_id in np.argmax(logits, 1):
        words.append(id_to_word[word_id])
    return ' '.join(words)

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [9]:
# Importing my_pad module from NLP helper functions
from nlp_helper_functions import my_pad

from termcolor import colored

# Pad the English sequences (my_pad.pad presumably pads to the given length --
# confirm in nlp_helper_functions) and reshape to 2-D, using the second-to-last
# dimension of the French array as the row width.
tmp_x = my_pad.pad(preproc_english_sentences, max_french_sequence_length).reshape(
    (-1, preproc_french_sentences.shape[-2])
)

# Report the values that will be passed to the model builder.
for label, value in (
    ('\ninput_shape:', tmp_x.shape),
    ('output_sequence_length:', max_french_sequence_length),
    ('\nenglish_vocab_size:', english_vocab_size),
    ('french_vocab_size:', french_vocab_size),
):
    print(colored(label, 'green', attrs=['bold']), value)

[1m[32m
input_shape:[0m (6000, 138)
[1m[32moutput_sequence_length:[0m 138
[1m[32m
english_vocab_size:[0m 8753
[1m[32mfrench_vocab_size:[0m 12185


### Embedding Model

In [6]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
                learning_rate=1e-3, embedding_dim=512, rnn_units=64):
    """
    Build and compile (but do not train) an RNN model using word embedding.

    Architecture: Embedding -> GRU (one output per timestep) -> TimeDistributed
    Dense over the French vocabulary -> softmax.

    :param input_shape: Tuple of input shape; the first element (number of samples) is dropped
    :param output_sequence_length: Length of output sequence. Currently unused: the model
        emits one prediction per input timestep, so the output length equals the input
        length. Kept so existing callers keep working.
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :param learning_rate: Learning rate passed to the Adam optimizer
    :param embedding_dim: Size of each word embedding vector
    :param rnn_units: Number of units in the GRU layer
    :return: Keras model built and compiled, but not trained
    """
    input_seq = Input(input_shape[1:])
    embedding = Embedding(english_vocab_size, embedding_dim)(input_seq)

    # return_sequences=True keeps the GRU output for every timestep so that
    # each position gets its own prediction over the French vocabulary.
    rnn = GRU(rnn_units, return_sequences=True)(embedding)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

# Prepare the model input: pad the English sequences and reshape to a 2-D array
tmp_x = my_pad.pad(preproc_english_sentences, max_french_sequence_length).reshape(
    (-1, preproc_french_sentences.shape[-2])
)

# Build the embedding RNN, then fit it with 20% of the data held out for validation
embed_rnn_model = embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size,
)
embed_rnn_model.fit(
    tmp_x,
    preproc_french_sentences,
    batch_size=1,
    epochs=10,
    validation_split=0.2,
)

# Decode and print the model's output for the first input sentence
first_prediction = embed_rnn_model.predict(tmp_x[:1])[0]
print(logits_to_text(first_prediction, french_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
reprise de de session <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [10]:
# Display the raw French sentence at index 8 of the loaded subset
# (presumably the counterpart of english_sentences_subset[8] -- confirm the two subsets are aligned).
french_sentences_subset[8]

'Vous avez probablement appris par la presse et par la télévision que plusieurs attentats à la bombe et crimes ont été perpétrés au Sri Lanka.'

In [11]:
# Display the raw English sentence at index 8 of the loaded subset
# (presumably the counterpart of french_sentences_subset[8] -- confirm the two subsets are aligned).
english_sentences_subset[8]

'You will be aware from the press and television that there have been a number of bomb explosions and killings in Sri Lanka.'

### Final Predictions

In [9]:
def final_predictions(x, y, x_tk, y_tk):
    """
    Train a fresh embedding model on (x, y) and print two sample translations.

    Sample 1 is a hard-coded English sentence printed next to its reference
    French translation; sample 2 is the first training example printed next
    to its French label sequence.

    :param x: Preprocessed English data
    :param y: Preprocessed French data; y.shape[1] is used as the padded sequence length
    :param x_tk: English tokenizer
    :param y_tk: French tokenizer
    """

    # Pad the English sequences to the French length, then build and train a new model
    x = pad_sequences(x, y.shape[1], padding='post')
    model = embed_model(x.shape, y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)
    model.fit(x, y, batch_size=1, epochs=10, validation_split=0.2)

    # French id -> word lookup; id 0 is shown as the padding marker
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'You have requested a debate on this subject in the course of the next few days'
    # Encode word by word. Words missing from the English vocabulary are
    # skipped (and reported) instead of aborting the cell with a KeyError.
    words = [word.lower() for word in sentence.split()]
    unknown_words = [word for word in words if word not in x_tk.word_index]
    if unknown_words:
        print('Skipping words missing from the English vocabulary:', ', '.join(unknown_words))
    sentence_ids = [x_tk.word_index[word] for word in words if word in x_tk.word_index]
    sentence_ids = pad_sequences([sentence_ids], maxlen=x.shape[-1], padding='post')

    # Predict on the custom sentence and on the first training example together
    sentences = np.array([sentence_ids[0], x[0]])
    print(len(sentences), sentences.shape)
    predictions = model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[0]]))
    print('Vous avez souhaité un débat à ce sujet dans les prochains jours')
    print('\nSample 2:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[1]]))
    # Each entry of y[0] holds the label id for one timestep
    print(' '.join([y_id_to_word[np.max(label)] for label in y[0]]))
    
# Train a new model on the preprocessed data and print the two sample translations.
final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2 (2, 138)
Sample 1:
vous vous de un débat sur ce sujet sujet sujet ce <PAD> ce de de de de <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Vous avez souhaité un débat à ce sujet dans les prochains jours

Sample

## CONCLUSION:

Although I was able to use more processing power (a GPU and a better CPU setup) through a test account I created on Google Cloud, the processing speed was not much of an improvement over running this project on my own machine. The translation model performs well on short sentences but has difficulty with longer ones. This could be because I had to limit each language file to a subset of 6,000 records (sentences). I expect that with more processing power, and the ability to train on a substantially larger number of sentences, I would get better results than what I am currently seeing. I may also be able to improve performance by tuning the modeling parameters.