In [0]:
#Importing bunch of libraries and neural layers.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Embedding,GRU,Dense
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import os
from tensorflow.keras.optimizers import Adam
import time
import unicodedata
import re


TensorFlow 2.x selected.


In [0]:
!wget http://www.manythings.org/anki/spa-eng.zip

--2019-12-30 21:10:32--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:30::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4752884 (4.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2019-12-30 21:10:38 (12.2 MB/s) - ‘spa-eng.zip’ saved [4752884/4752884]



In [0]:
!unzip spa-eng

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [0]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())

  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

 
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.rstrip().strip()
  w = '<start> ' + w + ' <end>'
  return w



In [0]:
#Load the dataset
spa_sents=[]
eng_sents=[]


f=open('spa.txt','rt',encoding='utf-8')
allsents=f.read()
f.close()
#Seperate the sents and form language pair.
sents=allsents.strip().split('\n')
pair=[pr.split('\t') for pr in sents]

tempdf=pd.DataFrame(pair)
lang_pairs=tempdf.iloc[:,:2].values


#Removing punctuation.
lang_pairs[:,0] = [(s.translate(str.maketrans('', '', string.punctuation))).lower() for s in lang_pairs[:,0]]
lang_pairs[:,1] = [(s.translate(str.maketrans('', '', string.punctuation))).lower() for s in lang_pairs[:,1]]

eng_lang_words=lang_pairs[:,0].copy()
spa_lang_words=lang_pairs[:,1].copy()

eng_lang_words_t=[]
spa_lang_words_t=[]


# for i in eng_lang_words:
#   eng_lang_words_t.append('<start> ' + i + ' <end>')
# for i in spa_lang_words:
#   spa_lang_words_t.append('<start> ' + i + ' <end>')

for i in range(len(eng_lang_words)):
  eng_lang_words_t.append(preprocess_sentence(eng_lang_words[i]))

for i in range(len(spa_lang_words)):
  spa_lang_words_t.append(preprocess_sentence(spa_lang_words[i]))

eng_lang_words_t=np.array(eng_lang_words_t)
spa_lang_words_t=np.array(spa_lang_words_t)









    

    


    
    



In [0]:
eng_lang_words_t[200]

'<start> push it <end>'

In [0]:
#Creating dataframe for creating language pairs.
li=[eng_lang_words_t,spa_lang_words_t]
dataframe=pd.DataFrame(li)
dataframe_t=dataframe.T
new_lang_pairs=dataframe_t.iloc[:,:].values
new_lang_pairs=new_lang_pairs[:100000]

In [0]:
#Defining the tokenization funcntion and padding function.
def tokenization(lines):
      tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
      tokenizer.fit_on_texts(lines)
      return tokenizer
def encode_sequences(tokenizer, length, lines):
         seq = tokenizer.texts_to_sequences(lines)
         seq = pad_sequences(seq, maxlen=length, padding='post')
         return seq

In [0]:
#Finding the max length of the sentences in the given language.
eng_length=max([len(list(sen)) for sen in new_lang_pairs[:,0]])
spa_length=max([len(list(sen)) for sen in new_lang_pairs[:,1]])
print(eng_length)
print(spa_length)

55
87


In [0]:
#Spilliting the data into training and test set.
from sklearn.model_selection import train_test_split

train, test = train_test_split(new_lang_pairs, test_size=0.2, random_state = 12)

In [0]:
#Tokenize and encode the sentences.

# prepare english tokenizer
eng_tokenizer = tokenization(new_lang_pairs[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# prepare spasian tokenizer
spa_tokenizer = tokenization(new_lang_pairs[:, 1])
spa_vocab_size = len(spa_tokenizer.word_index) + 1

# prepare training data
trainX = encode_sequences(spa_tokenizer, spa_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

# prepare validation data
testX = encode_sequences(spa_tokenizer, spa_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])



In [0]:
#Define the layers of the custom Seq to Seq model with Attention
class Encoder(tf.keras.models.Model):
    def __init__(self,vocab_size,embedding_dim,enc_units,batch_size):
        super(Encoder,self).__init__()
        self.batch_size=batch_size
        self.enc_units=enc_units
        self.embedding=Embedding(vocab_size,embedding_dim)
        self.gru=GRU(self.enc_units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform') 
                                                                                                                      
    def call(self,x,hidden):
        x=self.embedding(x) #Input is feed to embedding which returns word vectors.
        output,state=self.gru(x,initial_state=hidden)
        return output,state
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size,self.enc_units))

#Attention layer
class Attention_layer(tf.keras.layers.Layer):
    def __init__(self,units):
        super(Attention_layer,self).__init__()
        self.W1=Dense(units)
        self.W2=Dense(units)
        self.V=Dense(1)
        
    def call(self,query,values):
        hidden_with_time_axis=tf.expand_dims(query,1)
        score=self.V(tf.nn.tanh(self.W1(values)+self.W2(hidden_with_time_axis)))
        attention_weights=tf.nn.softmax(score,axis=1)
        context_vector=attention_weights*values
        context_vector=tf.reduce_sum(context_vector,axis=1)
        return context_vector,attention_weights

#Decoder.
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,dec_units,batch_size):
        super(Decoder,self).__init__()
        self.batch_size=batch_size
        self.dec_units=dec_units
        self.embedding=Embedding(vocab_size,embedding_dim)
        self.gru=GRU(self.dec_units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform')
        self.fc=Dense(vocab_size)
        self.attention=Attention_layer(self.dec_units)
        
    def call(self,x,hidden,enc_output):
        context_vector,attention_weights=self.attention(hidden,enc_output)
        x=self.embedding(x)
        x=tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output,state=self.gru(x)
        output=tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state, attention_weights
    
    
    
   

In [0]:
#Defining the optimizers and the loss function.
optimizer=Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

#Creating objects of encoder,Attention layer and decoder.
encoder=Encoder(spa_vocab_size,256,1024,64)
decoder=Decoder(eng_vocab_size,256,1024,64)

@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([eng_tokenizer.word_index['<start>']] * 64, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss
    
    
        
        

In [0]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [0]:
dataset = tf.data.Dataset.from_tensor_slices((trainX, trainY)).shuffle(len(trainX))
dataset = dataset.batch(64, drop_remainder=True)

In [25]:
EPOCHS = 10


for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(len(trainX)//64)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / len(trainX)//64))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.0717
Epoch 1 Batch 100 Loss 0.5924
Epoch 1 Batch 200 Loss 0.5974
Epoch 1 Batch 300 Loss 0.5464
Epoch 1 Batch 400 Loss 0.5377
Epoch 1 Batch 500 Loss 0.4543
Epoch 1 Batch 600 Loss 0.4544
Epoch 1 Batch 700 Loss 0.4883
Epoch 1 Batch 800 Loss 0.4044
Epoch 1 Batch 900 Loss 0.4299
Epoch 1 Batch 1000 Loss 0.3789
Epoch 1 Batch 1100 Loss 0.3504
Epoch 1 Batch 1200 Loss 0.3803
Epoch 1 Loss 0.0000
Time taken for 1 epoch 785.7883410453796 sec

Epoch 2 Batch 0 Loss 0.3419
Epoch 2 Batch 100 Loss 0.3130
Epoch 2 Batch 200 Loss 0.3348
Epoch 2 Batch 300 Loss 0.2907
Epoch 2 Batch 400 Loss 0.2427
Epoch 2 Batch 500 Loss 0.2689
Epoch 2 Batch 600 Loss 0.2204
Epoch 2 Batch 700 Loss 0.2426
Epoch 2 Batch 800 Loss 0.2406
Epoch 2 Batch 900 Loss 0.2425
Epoch 2 Batch 1000 Loss 0.2096
Epoch 2 Batch 1100 Loss 0.1911
Epoch 2 Batch 1200 Loss 0.1936
Epoch 2 Loss 0.0000
Time taken for 1 epoch 737.3531439304352 sec

Epoch 3 Batch 0 Loss 0.1684
Epoch 3 Batch 100 Loss 0.1649
Epoch 3 Batch 200 Loss 0.181

In [0]:
def evaluate(sentence):

  sentence=preprocess_sentence(sentence)

  inputs = [spa_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=spa_length,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, 1024))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([eng_tokenizer.word_index['<start>']], 0)

  for t in range(spa_length):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

   
    predicted_id = tf.argmax(predictions[0]).numpy()

    result += eng_tokenizer.index_word[predicted_id] + ' '

    if eng_tokenizer.index_word[predicted_id] == '<end>':
      return result, sentence

   
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

def translate(sentence):
   result, sentence = evaluate(sentence)

   print('Input: %s' % (sentence))
   print('Predicted translation: {}'.format(result))


In [27]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f755c5c90b8>

In [28]:
translate(u'él es mi amigo')

Input: <start> el es mi amigo <end>
Predicted translation: hes my friend <end> 


In [29]:
translate(u'ella es mi novia')

Input: <start> ella es mi novia <end>
Predicted translation: shes my girlfriend <end> 


In [32]:
translate(u'Le encanta tocar la guitarra')

Input: <start> le encanta tocar la guitarra <end>
Predicted translation: he loves playing playing the guitar <end> 


In [37]:
translate(u'Esto es muy difícil de hacer')


Input: <start> esto es muy dificil de hacer <end>
Predicted translation: this is very difficult to do <end> 


In [38]:
translate(u'Fue tras el ladrón')

Input: <start> fue tras el ladron <end>
Predicted translation: he was after the thief <end> 


In [39]:
translate(u'el ingles es un idioma muy divertido')

Input: <start> el ingles es un idioma muy divertido <end>
Predicted translation: english is a very funny language <end> 


In [30]:
!zip -r /content/file.zip /content/training_checkpoints

  adding: content/training_checkpoints/ (stored 0%)
  adding: content/training_checkpoints/ckpt-2.index (deflated 70%)
  adding: content/training_checkpoints/ckpt-5.data-00000-of-00002 (deflated 16%)
  adding: content/training_checkpoints/ckpt-1.data-00001-of-00002 (deflated 15%)
  adding: content/training_checkpoints/ckpt-2.data-00001-of-00002 (deflated 14%)
  adding: content/training_checkpoints/ckpt-1.index (deflated 70%)
  adding: content/training_checkpoints/ckpt-4.index (deflated 70%)
  adding: content/training_checkpoints/ckpt-3.data-00001-of-00002 (deflated 14%)
  adding: content/training_checkpoints/ckpt-5.data-00001-of-00002 (deflated 14%)
  adding: content/training_checkpoints/ckpt-2.data-00000-of-00002 (deflated 16%)
  adding: content/training_checkpoints/ckpt-5.index (deflated 70%)
  adding: content/training_checkpoints/ckpt-4.data-00000-of-00002 (deflated 16%)
  adding: content/training_checkpoints/checkpoint (deflated 38%)
  adding: content/training_checkpoints/ckpt-3.da

In [0]:
from google.colab import files
files.download('/content/file.zip')