# NMT using attention
## NOTICE : Attention layer is not yet implemented in keras library , which makes implementation of attention mechanism a lot difficult than other neural network models
## We can't use keras's model class and do model.fit , instead we'll have to write optimizer, loss function, model layers and training code from scratch and combine them to create a workflow. This is equivalent to writing model.fit in normal neural network models.

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
# Download the file
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

In [3]:
print(path_to_file)

/home/saharsh/.keras/datasets/spa-eng/spa.txt


In [4]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

In [5]:
def preprocess_sentence(w):
    
#     print("1 -> ",w)
    w = unicode_to_ascii(w.lower().strip())
    
#     print("2-> ",w)

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    
#     print("3 -> ",w)
    w = re.sub(r'[" "]+', " ", w)
    
#     print("4 -> ",w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
#     print("5 -> ",w)
    # rstrip() Remove spaces to the right of the string:
    # strip() removes all leading and tailing whitespaces"
    w = w.rstrip().strip()
    
#     print("6 -> ",w)
    # adding a start and an end token to the sentence
    # so that the m"odel know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [6]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
spanish = (preprocess_sentence(sp_sentence))
print(spanish.encode('utf-8'))
print(spanish)

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'
<start> ¿ puedo tomar prestado este libro ? <end>


In [7]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    
    # io.open() Open file and return a corresponding stream.
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    
    # w = [eng_sent,spanish_sent]
    # l = [['eng \t spanish sent'] , ['eng spanish sent']]
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    return zip(*word_pairs)

In [8]:
en, sp = create_dataset(path_to_file, None)
print(en[-1])
print(sp[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [9]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [10]:
def tokenize(lang):
    # default filter value is '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # an instance of tokenizer
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
    
    #takes all sentences in lang and makes dictionary for it
    lang_tokenizer.fit_on_texts(lang)
    
    #encodes the text -> (text ->vector)
    tensor = lang_tokenizer.texts_to_sequences(lang)

    #puts appropriate number of zeros after the sent
    # if maxlen of sentences if 100 and any given sentence is of len 20, then
    # it'll pad 80 zeros at the end
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    # tensor is a vector of n*maxlen
    # and lang_tokenizer is a dictionary mapping word with key 
    return tensor, lang_tokenizer

In [11]:
def load_dataset(path, num_examples=None):
    # creating cleaned input, output pairs
    #Return word pairs in the format: [ENGLISH, SPANISH]
    targ_lang, inp_lang = create_dataset(path, num_examples)

    #encoded sentence and dictionary for the same, dictionary will be used in embedding layer
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

## Limiting data

In [12]:
# Try experimenting with the size of that dataset
num_examples = 20000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [13]:
print(max_length_inp,max_length_targ)

16 10


In [14]:
print("input tensor shape : ",input_tensor.shape)
print("target tensor shape : ",target_tensor.shape)
sent = en[1011]
print(sent,input_tensor[1011])

input tensor shape :  (20000, 16)
target tensor shape :  (20000, 10)
<start> he s broke . <end> [   1   13  229 1276    3    2    0    0    0    0    0    0    0    0
    0    0]


In [15]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

16000 16000 4000 4000


In [16]:
#This is what our model is going to learn
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
5 ----> ¿
26 ----> por
14 ----> que
137 ----> hace
555 ----> tanto
258 ----> calor
4 ----> ?
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
83 ----> why
8 ----> is
9 ----> it
81 ----> so
201 ----> hot
6 ----> ?
2 ----> <end>


In [17]:
BUFFER_SIZE = len(input_tensor_train) #training set size
BATCH_SIZE = 32
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

#Creates a Dataset whose elements are slices of the given tensors.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)

#Combines consecutive elements of this dataset into batches.
#eg dataset = tf.data.Dataset.range(8) 
# dataset = dataset.batch(3) 
# list(dataset.as_numpy_iterator()) 
# [ array([0,1,2]), array([3,4,5]) , array([5,6,7])]

dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
# so dataset loads 32 examples at a time for both input and output
# its a pointer to point to next batch when needed.
print(dataset)


<DatasetV1Adapter shapes: ((32, 16), (32, 10)), types: (tf.int32, tf.int32)>


In [19]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([Dimension(32), Dimension(16)]),
 TensorShape([Dimension(32), Dimension(10)]))

In [20]:
print(example_input_batch)

Tensor("IteratorGetNext:0", shape=(32, 16), dtype=int32)


## Making the model

![first_image](1.jpg)
![second_image](2.jpg)

## pseudo code
> score = FC(tanh(FC(EO) + FC(H)))  
> attention weights = softmax(score, axis = 1)  
> context vector = sum(attention weights * EO, axis = 1)  
> embedding output = The input to the decoder X is passed through an embedding layer.  
> merged vector = concat(embedding output, context vector)  

**return_sequences** -> give hidden state for each time step  
**return_state** -> gives (in case of lstm) [hidden state, hidden state, cell state for last time step].  
**both of them** -> (in case of lstm)[hidden state for all time step, hidden state for last time step, cell state for last time step]  
(in case of gru)[hidden state for all time steps, hidden state for final timestep]

In [21]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        #takes input and returns h and y
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [22]:
print(vocab_inp_size)
print(embedding_dim,units,BATCH_SIZE)

7177
256 1024 32


### sample output is hidden state of encoder for all timesteps that is {h1,h2,h3...h16}
### sample_hidden is output or hidden state for last time step.

In [27]:
# encoder is a class instance having 
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# not sure how this line is working..
sample_hidden = encoder.initialize_hidden_state()

sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))


Encoder output shape: (batch size, sequence length, units) (32, 16, 1024)
Encoder Hidden state shape: (batch size, units) (32, 1024)


In [28]:
print("shape of sample_hidden",sample_hidden.get_shape())

shape of sample_hidden (32, 1024)


## ATTENTION
### the attention weights for each example is different and to calculate that we use a feed forward neural network to get attention weights for each word in the sentence.
### here attention weights will be of the shape  (n,1,10)  
n is number of training examples  
10 is length of each sentence after being encoded

### hidden size = units = 1024
### value is actually {h1,h2,h3...h16}  : shape == (batch_size, max_len, hidden size)

### query is last hidden state of decoder that is h(t-1) 
### hi is hidden state , a vector of 1024 elements.


In [30]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        

        # query_with_time_axis shape == (batch_size, 1, hidden size) 
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [31]:
# this is just testing if attention is working or not
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (32, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (32, 16, 1)


In [32]:
print(sample_hidden)

Tensor("encoder_2/gru_2/while/Exit_3:0", shape=(32, 1024), dtype=float32)


## decoder class
all the class variables are as defined above

In [47]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        # enc_output is the list of all the hidden states
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU

        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

### state is hidden state of gru for that time step
### output and state are the same thing here , as this is only for one time step
### output is :  Tensor("decoder_2/gru_5/transpose_1:0", shape=(32, 1, 1024), dtype=float32)
### state is :  Tensor("decoder_2/gru_5/while/Exit_3:0", shape=(32, 1024), dtype=float32)
notice the difference in shapes

### testing if decoder class is working


In [48]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (32, 3728)


### as we cannot use the model funtion from keras here as mentioned above we'll have to write and combine loss function , optimizer and model.fit manually.

![third_image](3.png)


In [None]:
optimizer = tf.keras.optimizers.Adam()

#Computes the crossentropy loss between the labels and predictions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # tf.math.logical - Returns the truth value of NOT x element-wise.
    # tf.math.equal - Returns the truth value of (x == y) element-wise.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    # loss as defined in the image above
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)