# CAPSTONE PROJECT 2

## NLP - ENG TO FRE TRANSLATION

## 2) Processing the data

In [22]:
import numpy as np

In [1]:
# Load the English and French sentence lists that were pickled by the 1st
# Jupyter Notebook for this project:
#   (CAPSTONE PROJECT 2 - NLP - 1) Environment and Data Set Up
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must only ever come from the trusted notebook named above.

# Each file is opened in a `with` block so it is closed even if
# pickle.load raises (the manual open()/close() pairs leaked the handles
# in that case).
with open('eng_sentences', 'rb') as eng_picklefile:
    eng_sentences = pickle.load(eng_picklefile)

with open('fre_sentences', 'rb') as fre_picklefile:
    fre_sentences = pickle.load(fre_picklefile)

In [2]:
from termcolor import colored

# Preview the first three English sentences, each followed by its French counterpart
for sentence_idx in (0, 1, 2):
    sentence_no = sentence_idx + 1

    english_line = 'English Sentence {} :  {}'.format(sentence_no, eng_sentences[sentence_idx])
    print(colored(english_line, 'green', attrs=['bold']))

    # The trailing newline leaves a blank line between sentence pairs
    french_line = 'French Sentence {}  :  {}\n'.format(sentence_no, fre_sentences[sentence_idx])
    print(colored(french_line, 'blue', attrs=['bold']))

[1m[32mEnglish Sentence 1 :  Resumption of the session[0m
[1m[34mFrench Sentence 1  :  Reprise de la session
[0m
[1m[32mEnglish Sentence 2 :  I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.[0m
[1m[34mFrench Sentence 2  :  Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.
[0m
[1m[32mEnglish Sentence 3 :  Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.[0m
[1m[34mFrench Sentence 3  :  Comme vous avez pu le constater, le grand "bogue de l'an 2000" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont 

### 2.1) Subsetting the Data

In [48]:
# Subsetting the lists of sentences for ease of processing.
# A single named constant is used for both slices so the English and French
# subsets are always cut at the same index and stay aligned sentence by sentence.
SUBSET_SIZE = 6000

eng_sentences_subset = eng_sentences[:SUBSET_SIZE]
fre_sentences_subset = fre_sentences[:SUBSET_SIZE]

# Display the first two English sentences as a quick check of the subset
eng_sentences_subset[:2]

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.']

### 2.2) Tokenizing

In [20]:
# Tokenizer demonstration on a tiny hand-written corpus

# my_tokenize lives in the project's NLP helper functions package
from nlp_helper_functions import my_tokenize

text_sentences = [
    'I cannot believe what you went through .',
    'He who is without sin cast the first stone .',
    'This is a short sentence .',
]

# tokenize returns the tokenized sentences together with the tokenizer object
text_tokenized, text_tokenizer = my_tokenize.tokenize(text_sentences)

# Show the word index dictionary built from the three sentences above
index_heading = colored('\nWord index dictionary:', 'green', attrs=['bold'])
print(index_heading, text_tokenizer.word_index)

# For every sentence, show the raw text followed by the tokens for its words
for seq_no, (raw_text, token_ids) in enumerate(zip(text_sentences, text_tokenized), start=1):
    seq_heading = colored('\nSequence {} in x'.format(seq_no), 'green', attrs=['bold'])
    print(seq_heading)
    print('  Input:  {}'.format(raw_text))
    print('  Output: {}'.format(token_ids))

[1m[32m
Word index dictionary:[0m {'is': 1, 'i': 2, 'cannot': 3, 'believe': 4, 'what': 5, 'you': 6, 'went': 7, 'through': 8, 'he': 9, 'who': 10, 'without': 11, 'sin': 12, 'cast': 13, 'the': 14, 'first': 15, 'stone': 16, 'this': 17, 'a': 18, 'short': 19, 'sentence': 20}
[1m[32m
Sequence 1 in x[0m
  Input:  I cannot believe what you went through .
  Output: [2, 3, 4, 5, 6, 7, 8]
[1m[32m
Sequence 2 in x[0m
  Input:  He who is without sin cast the first stone .
  Output: [9, 10, 1, 11, 12, 13, 14, 15, 16]
[1m[32m
Sequence 3 in x[0m
  Input:  This is a short sentence .
  Output: [17, 1, 18, 19, 20]


### 2.3) Padding

In [25]:
# Padding demonstration on the tokenized example sentences

# my_pad lives in the project's NLP helper functions package
from nlp_helper_functions import my_pad

test_pad = my_pad.pad(text_tokenized)

# For every sequence, show the tokens before padding and the padded result
for seq_no, (token_ids, padded_ids) in enumerate(zip(text_tokenized, test_pad), start=1):
    seq_heading = colored('\nSequence {} in x'.format(seq_no), 'green', attrs=['bold'])
    print(seq_heading)
    # Wrapped in np.array so the unpadded tokens print in the same layout as the padded row
    print('  Input:  {}'.format(np.array(token_ids)))
    print('  Output: {}'.format(padded_ids))

[1m[32m
Sequence 1 in x[0m
  Input:  [2 3 4 5 6 7 8]
  Output: [2 3 4 5 6 7 8 0 0]
[1m[32m
Sequence 2 in x[0m
  Input:  [ 9 10  1 11 12 13 14 15 16]
  Output: [ 9 10  1 11 12 13 14 15 16]
[1m[32m
Sequence 3 in x[0m
  Input:  [17  1 18 19 20]
  Output: [17  1 18 19 20  0  0  0  0]


### 2.4) Generating a preprocess pipeline

In [44]:
# Run the preprocessing pipeline on the sentence subsets.
# my_preprocess lives in the project's NLP helper functions package.
from nlp_helper_functions import my_preprocess

(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = my_preprocess.preprocess(eng_sentences_subset, fre_sentences_subset)

# The second dimension of each preprocessed array is taken as the max sequence length
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

# Vocabulary size is the number of indexed words plus one
# (presumably the extra slot is for index 0, which the padded output uses - TODO confirm)
english_vocab_size = 1 + len(english_tokenizer.word_index)
french_vocab_size = 1 + len(french_tokenizer.word_index)

print(colored('\nData Preprocessed\n', 'green', attrs=['bold']))

# Label/value pairs are printed in a fixed order, one per line
summary_rows = (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("\nEnglish vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
)
for row_label, row_value in summary_rows:
    print(colored(row_label, 'blue', attrs=['bold']), row_value)

[1m[32m
Data Preprocessed
[0m
[1m[34mMax English sentence length:[0m 126
[1m[34mMax French sentence length:[0m 138
[1m[34m
English vocabulary size:[0m 8753
[1m[34mFrench vocabulary size:[0m 12185


In [12]:
# We will now pickle some of the objects to reuse later
import pickle

# (output file name, object to store in it) pairs, written in this order
objects_to_pickle = [
    ('preproc_english_sentences.pkl', preproc_english_sentences),
    ('preproc_french_sentences.pkl', preproc_french_sentences),
    ('max_fre_seq_len.pkl', max_french_sequence_length),
    ('eng_vocab_size.pkl', english_vocab_size),
    ('fre_vocab_size.pkl', french_vocab_size),
    ('fre_tokenizer.pkl', french_tokenizer),
    ('eng_tokenizer.pkl', english_tokenizer),
    ('fre_sentences_subset.pkl', fre_sentences_subset),
    ('eng_sentences_subset.pkl', eng_sentences_subset),
]

# Each file is opened in its own `with` block, so it is flushed and closed even
# if pickle.dump raises. Opening one file at a time also means a failure no
# longer leaves the remaining, not-yet-written files truncated to zero bytes.
for pickle_path, obj_to_store in objects_to_pickle:
    with open(pickle_path, 'wb') as picklefile:
        pickle.dump(obj_to_store, picklefile)

## Looking at an example sentence's words and corresponding tokens and padding

### Sentence

In [50]:
# Pick one English sentence from the subset (index 8) and display it
example_idx = 8
eng_sentences_subset[example_idx]

'You will be aware from the press and television that there have been a number of bomb explosions and killings in Sri Lanka.'

### Sentence Tokens and Padding

In [14]:
# Display the preprocessed (tokenized and padded) row for the same sentence, index 8
example_idx = 8
preproc_english_sentences[example_idx]

array([  34,   22,   13,  391,   43,    1, 1212,    4, 2217,    6,   32,
         19,   40,    8,  266,    2, 2481, 5315,    4, 4011,    5, 2799,
       3281,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int32)