<a href="https://colab.research.google.com/github/Nvillaluenga/Neural-Translation-with-attention/blob/NachDev/Neural_Translation_with_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from __future__ import absolute_import, division, print_function
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import unicodedata
import re
import os
import time

print(tf.__version__) # TF 2.x (2.3.0 in the recorded output) enables eager execution by default

2.3.0


In [11]:
# Get our data: download the spa-eng archive and unpack it.
# get_file returns the local path of the downloaded zip; extract=True also
# unpacks the archive (next to the zip, as the printed paths show).
path_to_zip = tf.keras.utils.get_file(
    fname='spa-eng.zip',
    origin='http://download.tensorflow.org/data/spa-eng.zip',
    extract=True)
print(path_to_zip)

# Build the path with os.path.join instead of concatenating a hard-coded '/',
# so the separator is correct on every platform.
path_to_file = os.path.join(os.path.dirname(path_to_zip), 'spa-eng', 'spa.txt')
print(path_to_file)

/root/.keras/datasets/spa-eng.zip
/root/.keras/datasets/spa-eng/spa.txt


In [16]:
# Preprocessing functions

def unicode_to_ascii(string):
  """Remove combining marks (accents) from `string`.

  The text is decomposed with NFD normalization and every character whose
  Unicode category is 'Mn' is dropped; all other characters are kept as they
  are.
  """
  decomposed = unicodedata.normalize('NFD', string)
  kept_chars = []
  for char in decomposed:
    if unicodedata.category(char) == 'Mn':
      # combining mark produced by the decomposition -> skip it
      continue
    kept_chars.append(char)
  return ''.join(kept_chars)
  
def preprocess_sentence(string):
  """Normalize a sentence and wrap it in '<start>' / '<end>' tokens.

  The sentence is lowercased, trimmed and stripped of accents (via
  unicode_to_ascii). The punctuation marks ? . ! , ¿ are then surrounded by
  spaces so each becomes its own token, and every other character that is not
  an ASCII letter is replaced by a space.

  Args:
    string: raw sentence text.

  Returns:
    The cleaned sentence prefixed with '<start> ' and suffixed with ' <end>'.
  """
  string = unicode_to_ascii(string.lower().strip())
  # Separate words from symbols (?.!,¿). The mark is padded on BOTH sides so a
  # leading mark such as '¿' is not left glued to the following word
  # ('¿como' -> '¿ como'); padding only the left side missed that case.
  string = re.sub(r'([?.!,¿])', r' \1 ', string)
  # Collapse runs of spaces (and double quotes) into a single space.
  string = re.sub(r'[" "]+', ' ', string)
  # Replace every run of characters that are neither ASCII letters nor the
  # kept punctuation marks with a single space.
  string = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', string)
  string = string.strip()
  # Add a start and an end token to the sentence.
  return '<start> ' + string + ' <end>'

def create_dataset(path, num_examples):
  """Read tab-separated sentences from `path` and preprocess each field.

  Args:
    path: UTF-8 text file with one example per line, fields separated by tabs.
    num_examples: number of leading lines to use; None uses every line.

  Returns:
    A list with one entry per line used; each entry is the list of that line's
    fields after preprocess_sentence.
  """
  # Context manager so the file handle is closed as soon as the text is read
  # (the bare open(...).read() left it to the garbage collector).
  with open(path, encoding='UTF-8') as data_file:
    lines = data_file.read().strip().split('\n')

  word_pairs = [ [ preprocess_sentence(sentence) for sentence in line.split('\t') ]
                for line in lines[:num_examples] ]
  return word_pairs

# This class creates a word -> index mapping (e.g., "dad" -> 5) and vice-versa
class LanguageIndex():
  """Vocabulary built from an iterable of space-separated phrases.

  Attributes (filled in by create_index, which __init__ calls):
    lang: the iterable of phrases passed in. It is iterated once while
      building the index, so a generator will be exhausted afterwards.
    vocab: sorted list of the distinct words.
    word2idx: word -> integer index; '<pad>' is 0, real words start at 1.
    idx2word: the inverse mapping, integer index -> word.
  """

  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()

    self.create_index()

  def create_index(self):
    """Collect the vocabulary from self.lang and build both lookup tables."""
    # The method was previously spelled 'create_indec' and lacked 'self', so
    # the call in __init__ failed with AttributeError.
    for phrase in self.lang:
      self.vocab.update(phrase.split(' '))

    # Sorting makes the word -> index assignment deterministic.
    self.vocab = sorted(self.vocab)

    # Index 0 is reserved for the padding token.
    self.word2idx['<pad>'] = 0
    self.idx2word[0] = '<pad>'
    for index, word in enumerate(self.vocab):
      self.word2idx[word] = index+1
      self.idx2word[index+1] = word

def max_length(tensor):
  """Return the length of the longest element of `tensor`."""
  lengths = [len(sequence) for sequence in tensor]
  return max(lengths)

def load_dataset(path, num_examples):
  """Load sentence pairs from `path` and turn them into padded index tensors.

  Each pair from create_dataset is unpacked as (en, sp): the second field
  (sp) is used as the model input and the first field (en) as the target.

  Args:
    path: file passed on to create_dataset.
    num_examples: number of leading lines to use; None uses every line.

  Returns:
    A tuple (input_tensor, target_tensor, inp_lang, target_lang,
    max_length_inp, max_length_tar) where the tensors are the word-index
    sequences post-padded to the longest sequence of their side, the *_lang
    values are the LanguageIndex vocabularies, and the max_length_* values are
    those longest sequence lengths.
  """
  # Create input output pairs
  pairs = create_dataset(path, num_examples)

  # Lists rather than generators, so the phrases kept on each LanguageIndex
  # (its .lang attribute) are still iterable after the index is built.
  inp_lang = LanguageIndex([sp for en, sp in pairs])
  target_lang = LanguageIndex([en for en, sp in pairs])

  # Vectorize: replace every word by its vocabulary index
  input_tensor = [ [ inp_lang.word2idx[word] for word in sp.split(' ') ]
                  for en, sp in pairs ]
  target_tensor = [ [ target_lang.word2idx[word] for word in en.split(' ') ]
                  for en, sp in pairs ]

  # Padding input and output to the longest sequence of each side
  max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
  input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
      sequences = input_tensor,
      maxlen = max_length_inp,
      padding = 'post'
  )
  target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
      sequences = target_tensor,
      maxlen = max_length_tar,
      padding = 'post'
  )

  # The original returned the undefined name 'tar_lang', which raised
  # NameError; the vocabulary object is 'target_lang'.
  return input_tensor, target_tensor, inp_lang, target_lang, max_length_inp, max_length_tar

In [19]:
#Limit the size of the dataset to experiment faster (optional)

...