In [6]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import os
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import unicodedata 
import re
import time
import io
from tensorflow import keras 

In [7]:
#Download data

# get_file downloads the archive and, because extract=True, also unpacks it;
# the returned value is the local path of the downloaded file.
path_zip = keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [12]:
# Build the path with os.path.join so the separators are consistent on every
# platform; plain string concatenation with "/" produced a mixed
# "\" and "/" path on Windows.
path_to_file = os.path.join(os.path.dirname(path_zip), "spa-eng", "spa.txt")
print(path_to_file)

C:\Users\RKumarX0105498\.keras\datasets/spa-eng/spa.txt


In [33]:
# Strips accents (combining marks) from a unicode string
def convert_unicode_file(s):
    """Return ``s`` with its accents removed.

    The string is decomposed with Unicode NFD normalization, which splits an
    accented letter into a base letter followed by combining marks, and every
    character whose Unicode category is "Mn" (nonspacing mark) is dropped.
    Characters that do not decompose (for example "¿") are returned unchanged,
    so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

In [34]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

    The sentence is lowercased, trimmed and passed through
    ``convert_unicode_file``; punctuation is separated from words by spaces;
    everything other than letters and the kept punctuation becomes a space;
    finally the start and end markers are added.
    """
    cleaned = convert_unicode_file(w.lower().strip())

    # Put a space on both sides of each punctuation mark,
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)
    # Collapse runs of spaces into one space (the character class also
    # matches the double-quote character, so quotes are replaced too).
    cleaned = re.sub(r'[" "]+', " ", cleaned)

    # Replace every run of characters other than
    # (a-z, A-Z, ".", "?", "!", ",", "¿") with a single space.
    cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned)

    # Add a start and an end token to the trimmed sentence
    # so that the model knows when to start and stop predicting.
    return '<start> ' + cleaned.strip() + ' <end>'
        
    
    

In [35]:
# Quick check of the preprocessing on one English and one Spanish sentence.
en_sentence = "May I borrow this book?"
sp_sentence = "¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
# The Spanish result is printed as UTF-8 bytes.
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [47]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]

def create_dataset(path, no_example):
    """Read tab-separated sentence pairs from ``path`` and preprocess them.

    Args:
        path: Path to a UTF-8 text file with one tab-separated group of
            sentences per line.
        no_example: Maximum number of lines to use, counted from the start
            of the file; ``None`` uses every line.

    Returns:
        A ``zip`` over the columns of the preprocessed lines, i.e. one tuple
        per tab-separated field (all first fields, all second fields, ...).
        Every sentence has been passed through ``preprocess_sentence``.
    """
    # Read inside a context manager so the file handle is always closed,
    # instead of leaking it until garbage collection.
    with io.open(path, encoding="UTF-8") as dataset_file:
        lines = dataset_file.read().strip().split("\n")

    word_pair = [
        [preprocess_sentence(w) for w in l.split('\t')]
        for l in lines[:no_example]
    ]

    # Transpose [[a1, b1], [a2, b2], ...] into (a1, a2, ...), (b1, b2, ...).
    return zip(*word_pair)

In [48]:
# no_example=None loads every sentence pair in the file.
en, sp = create_dataset(path_to_file, no_example=None)

In [51]:
# Show the last preprocessed sentence of each language, one per line.
print(en[-1], sp[-1], sep="\n")

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [52]:
def max_tensor(tensor):
    """Return the length of the longest element of ``tensor``.

    ``tensor`` is iterated once and ``len`` is taken of each element; an
    empty ``tensor`` raises ``ValueError`` (from ``max``).
    """
    return max(map(len, tensor))

In [61]:
def tokenizer(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode it as padded ids.

    Args:
        lang: The sentences of one language.

    Returns:
        ``(tensor, fitted_tokenizer)``: the sentences converted to integer
        sequences and padded at the end (``padding="post"``), and the
        tokenizer that was fitted on them.
    """
    # filters="" overrides the tokenizer's default filter string.
    fitted_tokenizer = keras.preprocessing.text.Tokenizer(filters="")
    fitted_tokenizer.fit_on_texts(lang)

    sequences = fitted_tokenizer.texts_to_sequences(lang)
    padded = keras.preprocessing.sequence.pad_sequences(sequences, padding="post")
    return padded, fitted_tokenizer
    

In [62]:
def load_dataset(path, no_example = None):
    """Load the sentence pairs at ``path`` and tokenize both sides.

    Args:
        path: Path of the dataset file, forwarded to ``create_dataset``.
        no_example: Maximum number of lines to load; ``None`` loads all.

    Returns:
        ``(inp_tensor, tar_tensor, inp_long_tokenizer, tar_long_tokenizer)``.
    """
    # create_dataset yields the target language first, then the input language.
    target_sentences, input_sentences = create_dataset(path, no_example)

    (inp_tensor, inp_long_tokenizer), (tar_tensor, tar_long_tokenizer) = (
        tokenizer(input_sentences),
        tokenizer(target_sentences),
    )
    return inp_tensor, tar_tensor, inp_long_tokenizer, tar_long_tokenizer

In [68]:
# Try experimenting with the size of that dataset
no_example = 30000
inp_tensor, tar_tensor, inp_long_tokenizer, tar_long_tokenizer = load_dataset(path_to_file, no_example)

# Longest sequence length of the target and of the input tensor
max_len_tar = max_tensor(tar_tensor)
max_len_inp = max_tensor(inp_tensor)

# Show both tensors and both tokenizers, one per line
print(inp_tensor, tar_tensor, inp_long_tokenizer, tar_long_tokenizer, sep="\n")

[[   1  135    3 ...    0    0    0]
 [   1  293    3 ...    0    0    0]
 [   1  595    3 ...    0    0    0]
 ...
 [   1   18 9413 ...    0    0    0]
 [   1   63 2490 ...    0    0    0]
 [   1   23 2175 ...    0    0    0]]
[[ 1 36  3 ...  0  0  0]
 [ 1 36  3 ...  0  0  0]
 [ 1 36  3 ...  0  0  0]
 ...
 [ 1 16 38 ...  0  0  0]
 [ 1 16 38 ...  0  0  0]
 [ 1 16 38 ...  0  0  0]]
<keras_preprocessing.text.Tokenizer object at 0x00000221B8B25208>
<keras_preprocessing.text.Tokenizer object at 0x00000221BF129E10>


In [66]:
# Target length first, then input length, one per line.
print(max_len_tar, max_len_inp, sep="\n")

11
16


In [71]:
# Creating training and validation sets using an 80-20 split.
# train_test_split returns the pieces grouped per input array:
#   (inp_train, inp_test, tar_train, tar_test)
# so the validation inputs must be unpacked second and the training targets
# third. Unpacking as (x_train, y_train, x_test, y_test) silently stored the
# validation inputs in y_train and the training targets in x_test.
x_train, x_test, y_train, y_test = train_test_split(inp_tensor, tar_tensor, test_size=0.2)

# Show length: inputs and targets must have the same size within each split.
print(len(x_train), len(y_train), len(x_test), len(y_test))



24000 24000 6000 6000
