In [1]:
import tensorflow as tf

In [2]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [3]:
import unicodedata
import re
import numpy as np
import os
import io
import time


In [4]:
# Location of the tab-separated sentence file that create_dataset reads.
path_to_file = "spa.txt"

In [5]:
# Strip accents (combining marks) from a unicode string
def unicode_to_ascii(s):
    """Return ``s`` with its combining marks removed.

    The string is decomposed with NFD normalization, then every character
    whose Unicode category is ``Mn`` is dropped, so accented letters lose
    their accents. All other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [6]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    The sentence is lower-cased, trimmed and stripped of accents through
    ``unicode_to_ascii``; three regex passes then run in order, the result
    is trimmed again and the markers are added.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # put a space on both sides of each of ? . ! , ¿
        (r"([?.!,¿])", r" \1 "),
        # replace every double quote (or space) with a space
        (r'[" "]', " "),
        # collapse each run of anything other than a-z, A-Z, ? . ! , ¿ into one space
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # adding start and end tokens around the trimmed text
    return '<start> ' + cleaned.strip() + ' <end>'


In [7]:
# English sample used to try out preprocess_sentence.
en_sentence = u"May I borrow this book?"

In [8]:
# Sanity check: show the preprocessed English sample.
print(preprocess_sentence(en_sentence))

<start> may i borrow this book ? <end>


In [9]:
# Spanish sample (with accent-free text but an inverted question mark) for the same check.
sp_sentence = u"¿Puedo tomar prestado este libro?"

In [10]:
# Sanity check: show the preprocessed Spanish sample.
print(preprocess_sentence(sp_sentence))

<start> ¿ puedo tomar prestado este libro ? <end>


In [11]:
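# create_dataset (next cell) does the following for every line of the file:
# 1. removes accents
# 2. cleans the sentences
# 3. returns word pairs in the format: [english, spanish]
#    (any further tab-separated field on a line is kept as an extra entry)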

In [12]:
def create_dataset(path, num_examples):
    """Read a tab-separated sentence file and return its preprocessed rows.

    Args:
        path: path of a UTF-8 text file with one example per line and
            tab-separated fields.
        num_examples: number of leading lines to use; ``None`` uses every
            line (it is applied as the slice ``lines[:num_examples]``).

    Returns:
        A list with one entry per line; each entry is the list of that
        line's fields after ``preprocess_sentence``. Every field on the
        line is kept, not only the first two.
    """
    # `with` closes the file handle as soon as the text has been read
    # instead of leaving it open until garbage collection.
    with io.open(path, encoding='UTF-8') as source:
        lines = source.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]

    return word_pairs


In [13]:
# num_examples=None makes the slice lines[:None] keep every line of the file.
data = create_dataset(path_to_file, None)

In [14]:
# Number of rows read from the file.
len(data)

124547

In [17]:
# Inspect one row; its output shows a third field (attribution text) after the English and Spanish sentences.
data[70000]

['<start> tom and mary hugged each other . <end>',
 '<start> tom y mary se abrazaron . <end>',
 '<start> cc by . france attribution tatoeba . org spamster marcelostockle <end>']

In [18]:
# Keep only the first 70,001 rows (indices 0 through 70000) and drop the rest.
data = data[:70001]

In [19]:
# Start with empty English / Spanish sentence lists; the next cell populates them.
en, sp = [], []

In [20]:
# Column 0 of every row is the English sentence, column 1 the Spanish one.
# Building the lists in one pass avoids copying `data` on each iteration and
# keeps the cell idempotent: re-running it does not append duplicates.
en = [row[0] for row in data]
sp = [row[1] for row in data]

In [21]:
# Number of Spanish sentences collected.
len(sp)

70001

In [22]:
# Number of English sentences collected; should equal len(sp).
len(en)

70001

In [26]:
def tokenize(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and return padded id sequences.

    Args:
        lang: the texts to fit on and convert.

    Returns:
        ``(tensor, lang_tokenizer)``: the texts converted to integer
        sequences and padded with ``padding='post'``, plus the fitted
        tokenizer.
    """
    text_api = tf.keras.preprocessing.text
    sequence_api = tf.keras.preprocessing.sequence

    # An empty `filters` string means the tokenizer strips no characters.
    lang_tokenizer = text_api.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = sequence_api.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer

In [29]:
def load_dataset(path, num_examples=None):
    """Load, clean and tokenize the sentence pairs stored at ``path``.

    Args:
        path: file handed to ``create_dataset``.
        num_examples: passed through to ``create_dataset`` unchanged;
            ``None`` (the default) is forwarded as-is.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``. The input side is built from column 1 of
        each row and the target side from column 0, each through
        ``tokenize``.
    """
    # creating cleaned input, output pairs
    rows = create_dataset(path, num_examples)

    # Index each row directly: slicing `rows[:]` inside a loop would copy
    # the whole list on every iteration.
    targ_lang = [row[0] for row in rows]
    inp_lang = [row[1] for row in rows]

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [30]:
# Limit the run to the first 30,000 lines of the file.
num_examples = 30000
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(path_to_file, num_examples)
# The target tokenizer used to be stored under the misspelled name below;
# keep that name bound so cells that still refer to it continue to work.
targ_kang_tokenizer = targ_lang_tokenizer

In [31]:
# Padded sequence lengths: the second dimension of the target and input tensors.
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [32]:
# Padded length of the input sequences.
max_length_inp

16

In [33]:
# Padded length of the target sequences.
max_length_targ

11

In [34]:
# Hold out 20% of the pairs for validation. The seed is fixed so that the same
# rows land in each split on every run (Restart & Run All stays reproducible).
SPLIT_SEED = 42
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=SPLIT_SEED)

In [35]:
# Sizes of the training and validation splits (inputs and targets must match pairwise).
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

24000 24000 6000 6000


In [42]:
def convert(lang, tensor):
    """Print the ``id ------> word`` mapping for every non-zero id in ``tensor``.

    Args:
        lang: object whose ``index_word`` mapping translates an id to a word.
        tensor: iterable of ids; entries equal to 0 are skipped.
    """
    for t in tensor:
        if t == 0:
            # zero entries are not looked up
            continue
        print(f'{t} ------> {lang.index_word[t]}')

In [43]:
# Show the id -> word mapping for the first training input sequence.
convert(inp_lang_tokenizer, input_tensor_train[0])

1 ------> <start>
9 ------> el
16 ------> esta
20 ------> en
14 ------> la
628 ------> cocina
3 ------> .
2 ------> <end>


In [45]:
# Bind the target tokenizer to the correctly spelled name; `targ_kang_tokenizer` is the
# misspelled name it was given in the load_dataset cell.
targ_lang_tokenizer = targ_kang_tokenizer

In [47]:
# Show the id -> word mapping for the first training target sequence.
convert(targ_lang_tokenizer, target_tensor_train[0])

1 ------> <start>
14 ------> he
11 ------> s
36 ------> in
13 ------> the
1891 ------> kitchen
3 ------> .
2 ------> <end>


In [48]:
# Creating tf.data dataset

In [49]:
# Shuffle buffer size: as large as the training set (passed to Dataset.shuffle below).
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches in the training set (integer division drops the remainder).
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
# NOTE(review): embedding_dim and units are not used in the visible cells — presumably
# model hyperparameters consumed further down; confirm.
embedding_dim = 256
units = 1024

# Vocabulary sizes: number of entries in each tokenizer's word_index plus one.
# NOTE(review): the +1 presumably accounts for id 0, which is skipped as padding in
# convert(); confirm against the Keras Tokenizer docs.
vocab_inp_size = len(inp_lang_tokenizer.word_index) + 1
vocab_targ_size = len(targ_lang_tokenizer.word_index) + 1


In [50]:
# Build a tf.data pipeline of (input, target) training pairs and shuffle it with a
# buffer of BUFFER_SIZE elements.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)

In [51]:
# Group the pairs into batches of BATCH_SIZE, dropping any final partial batch.
# NOTE(review): this rebinds `dataset`, so re-running the cell alone would batch the
# already-batched dataset; re-run the previous cell first.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)