

# NMT using attention
## NOTICE : Attention layer is not yet implemented in keras library , which makes implementation of attention mechanism a lot difficult than other neural network models
## We can't use keras's model class and do model.fit , instead we'll have to write optimizer, loss function, model layers and training code from scratch and combine them to create a workflow. This is equivalent to writing model.fit in normal neural network models.

In [1]:

import tensorflow as tf
tf.enable_eager_execution()
from __future__ import absolute_import, division, print_function, unicode_literals


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import string

## Download and process dataset

In [2]:
# download dataset
!wget 'http://www.manythings.org/anki/hin-eng.zip'


--2020-03-14 08:25:01--  http://www.manythings.org/anki/hin-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:3037::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 126500 (124K) [application/zip]
Saving to: ‘hin-eng.zip.1’


2020-03-14 08:25:01 (1.08 MB/s) - ‘hin-eng.zip.1’ saved [126500/126500]



In [3]:
!ls
!unzip 'hin-eng.zip'


hin-eng.zip  hin-eng.zip.1  sample_data
Archive:  hin-eng.zip
  inflating: hin.txt                 
  inflating: _about.txt              


In [0]:
lines = io.open('hin.txt', encoding='UTF-8').read().strip().split('\n')
path_to_file = 'hin.txt'

In [5]:
print(lines)

['Wow!\tवाह!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #6179147 (fastrizwaan)', 'Help!\tबचाओ!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #459377 (minshirui)', 'Jump.\tउछलो.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #6179121 (fastrizwaan)', 'Jump.\tकूदो.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #6179122 (fastrizwaan)', 'Jump.\tछलांग.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #6179123 (fastrizwaan)', 'Hello!\tनमस्ते।\tCC-BY 2.0 (France) Attribution: tatoeba.org #373330 (CK) & #480306 (minshirui)', 'Hello!\tनमस्कार।\tCC-BY 2.0 (France) Attribution: tatoeba.org #373330 (CK) & #480307 (minshirui)', 'Cheers!\tवाह-वाह!\tCC-BY 2.0 (France) Attribution: tatoeba.org #487006 (human600) & #6179113 (fastrizwaan)', 'Cheers!\tचियर्स!\tCC-BY 2.0 (France) Attribution: tatoeba.org #487006 (human600) & #6179114 (fastrizwaan)', 'Got it?\tसमझे कि नहीं?\tCC-BY 2.0 (France) Attribution: tatoeba.or

In [0]:
eng_sent = []
hin_sent = []
for line in lines:
  splits = line.split('\t')
  #removing extra stuffs 
  eng = splits[0]
  hin = splits[1]
  # remove punctutation and make lower case
  eng = eng.translate(str.maketrans('', '', string.punctuation))
  eng = eng.lower()
  hin = hin.translate(str.maketrans('', '', string.punctuation))
  eng = '<start> ' + eng + ' <end>'
  hin = '<start> ' + hin + ' <end>'
  eng_sent.append(eng)
  hin_sent.append(hin)

In [10]:
print(eng_sent,hin_sent)
print(len(eng_sent))

['<start> wow <end>', '<start> help <end>', '<start> jump <end>', '<start> jump <end>', '<start> jump <end>', '<start> hello <end>', '<start> hello <end>', '<start> cheers <end>', '<start> cheers <end>', '<start> got it <end>', '<start> im ok <end>', '<start> awesome <end>', '<start> come in <end>', '<start> get out <end>', '<start> go away <end>', '<start> goodbye <end>', '<start> perfect <end>', '<start> perfect <end>', '<start> welcome <end>', '<start> welcome <end>', '<start> have fun <end>', '<start> have fun <end>', '<start> have fun <end>', '<start> i forgot <end>', '<start> i forgot <end>', '<start> ill pay <end>', '<start> im fine <end>', '<start> im full <end>', '<start> lets go <end>', '<start> answer me <end>', '<start> birds fly <end>', '<start> excuse me <end>', '<start> fantastic <end>', '<start> i fainted <end>', '<start> i fear so <end>', '<start> i laughed <end>', '<start> im bored <end>', '<start> im broke <end>', '<start> im tired <end>', '<start> its cold <end>', '

# code to encode dataset and create dictionary using keras tokenizer

In [0]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [0]:
def tokenize(lang):
    # default filter value is '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # an instance of tokenizer
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
    
    #takes all sentences in lang and makes dictionary for it
    lang_tokenizer.fit_on_texts(lang)
    
    #encodes the text -> (text ->vector)
    tensor = lang_tokenizer.texts_to_sequences(lang)

    #puts appropriate number of zeros after the sent
    # if maxlen of sentences if 100 and any given sentence is of len 20, then
    # it'll pad 80 zeros at the end
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    # tensor is a vector of n*maxlen
    # and lang_tokenizer is a dictionary mapping word with key 
    return tensor, lang_tokenizer

In [0]:
def load_dataset(inp_lang,targ_lang):

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

# encoding datatset and creating dictionary

In [0]:
# Try experimenting with the size of that dataset
# num_examples = 20000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(eng_sent,hin_sent)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [16]:
print(input_tensor.shape,target_tensor.shape)
print("max length of input and output are : ", max_length_inp,max_length_targ )
print('english dictionary is : ')
print(inp_lang.word_index)
print("hindi dictionary is : ")
print(targ_lang.word_index)

(2779, 24) (2779, 27)
max length of input and output are :  24 27
english dictionary is : 
{'<start>': 1, '<end>': 2, 'the': 3, 'i': 4, 'to': 5, 'you': 6, 'is': 7, 'a': 8, 'he': 9, 'of': 10, 'in': 11, 'it': 12, 'my': 13, 'me': 14, 'this': 15, 'have': 16, 'she': 17, 'was': 18, 'for': 19, 'are': 20, 'do': 21, 'that': 22, 'his': 23, 'your': 24, 'will': 25, 'what': 26, 'on': 27, 'we': 28, 'dont': 29, 'at': 30, 'him': 31, 'her': 32, 'not': 33, 'like': 34, 'go': 35, 'with': 36, 'be': 37, 'how': 38, 'and': 39, 'can': 40, 'im': 41, 'has': 42, 'there': 43, 'time': 44, 'know': 45, 'all': 46, 'up': 47, 'they': 48, 'come': 49, 'very': 50, 'as': 51, 'please': 52, 'had': 53, 'from': 54, 'its': 55, 'by': 56, 'out': 57, 'want': 58, 'when': 59, 'am': 60, 'were': 61, 'did': 62, 'here': 63, 'no': 64, 'been': 65, 'cant': 66, 'going': 67, 'get': 68, 'take': 69, 'an': 70, 'father': 71, 'ill': 72, 'book': 73, 'about': 74, 'if': 75, 'one': 76, 'money': 77, 'india': 78, 'would': 79, 'tom': 80, 'long': 81, 'tod

In [19]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

2501 2501 278 278


In [0]:
# #This is what our model is going to learn
# def convert(lang, tensor):
#     for t in tensor:
#         if t!=0:
#             print ("%d ----> %s" % (t, lang.index_word[t]))

# print ("Input Language; index to word mapping")
# convert(inp_lang, input_tensor_train[0])
# print ()
# print ("Target Language; index to word mapping")
# convert(targ_lang, target_tensor_train[0])

In [0]:
BUFFER_SIZE = len(input_tensor_train) #training set size
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

#Creates a Dataset whose elements are slices of the given tensors.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)

#Combines consecutive elements of this dataset into batches.
#eg dataset = tf.data.Dataset.range(8) 
# dataset = dataset.batch(3) 
# list(dataset.as_numpy_iterator()) 
# [ array([0,1,2]), array([3,4,5]) , array([5,6,7])]

dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [22]:
# so dataset loads 64 examples at a time for both input and output
# its a pointer to point to next batch when needed.
print(dataset)


<DatasetV1Adapter shapes: ((64, 24), (64, 27)), types: (tf.int32, tf.int32)>


In [23]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([Dimension(64), Dimension(24)]),
 TensorShape([Dimension(64), Dimension(27)]))

In [24]:
print(example_input_batch)

tf.Tensor(
[[  1  83 107 ...   0   0   0]
 [  1  21   6 ...   0   0   0]
 [  1  48  61 ...   0   0   0]
 ...
 [  1  55  19 ...   0   0   0]
 [  1  17 958 ...   0   0   0]
 [  1  13 212 ...   0   0   0]], shape=(64, 24), dtype=int32)


# Making the model


**return_sequences** -> give hidden state for each time step  
**return_state** -> gives (in case of lstm) [hidden state, hidden state, cell state for last time step].  
**both of them** -> (in case of lstm)[hidden state for all time step, hidden state for last time step, cell state for last time step]  
(in case of gru)[hidden state for all time steps, hidden state for final timestep]

In [0]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        #takes input and returns h and y
        x = self.embedding(x)
        output, state ,cell = self.lstm(x, initial_state = hidden)
        encoder_states = [state,cell]
        return output, encoder_states

    def initialize_hidden_state(self):
        state =  tf.zeros((self.batch_sz, self.enc_units))
        return [state,state]

In [26]:
print(vocab_inp_size)
print(embedding_dim,units,BATCH_SIZE)

2373
256 1024 64


In [29]:
# encoder is a class instance having 
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# not sure how this line is working..
sample_hidden = encoder.initialize_hidden_state()

sample_output, encoder_states = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden[0].shape))


Encoder output shape: (batch size, sequence length, units) (64, 24, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [31]:

print("shape of sample_hidden",sample_hidden[0].get_shape())

shape of sample_hidden (64, 1024)


# ATTENTION MECHANISAM  CODE


In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        
        query = query[0]
        # query_with_time_axis shape == (batch_size, 1, hidden size) 
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [36]:
# this is just testing if attention is working or not
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 24, 1)


In [37]:
print(sample_hidden)

[<tf.Tensor: id=2200, shape=(64, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: id=2200, shape=(64, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>]


## DECODER

In [0]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        # enc_output is the list of all the hidden states
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU

        output, hidden, cell = self.lstm(x)
        decoder_states = [hidden,cell]
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, decoder_states, attention_weights

### state is hidden state of gru for that time step
### output and state are the same thing here , as this is only for one time step
### output is :  Tensor("decoder_2/gru_5/transpose_1:0", shape=(32, 1, 1024), dtype=float32)
### state is :  Tensor("decoder_2/gru_5/while/Exit_3:0", shape=(32, 1024), dtype=float32)
notice the difference in shapes

### testing if decoder class is working


In [39]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 2991)


# OPTIMIZER

In [0]:
optimizer = tf.keras.optimizers.Adam()

'''
In the snippet below, there is a single floating point value per example for
`y_true` and `# classes` floating pointing values per example for `y_pred`.
The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
`[batch_size, num_classes]`.
'''
#Computes the crossentropy loss between the labels and predictions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# only considering cross entropy for incorrectly classified samples.
def loss_function(real, pred):
    # tf.math.logical - Returns the truth value of NOT x element-wise.
    # tf.math.equal - Returns the truth value of (x == y) element-wise.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    # loss as defined in the image above
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

## Checkpoints (Object-based saving)

In [0]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [42]:
for (batch, (inp, targ)) in enumerate( dataset.take(steps_per_epoch) ):
    print(targ)
    print(targ.shape)
    break

tf.Tensor(
[[   1   10  711 ...    0    0    0]
 [   1   22  152 ...    0    0    0]
 [   1  408  161 ...    0    0    0]
 ...
 [   1   24   28 ...    0    0    0]
 [   1    7   36 ...    0    0    0]
 [   1   35 2132 ...    0    0    0]], shape=(64, 27), dtype=int32)
(64, 27)


## Training

1. Pass the *input* through the *encoder* which return *encoder output* and the *encoder hidden state*.
2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) is passed to the decoder.
3. The decoder returns the *predictions* and the *decoder hidden state*.
4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
5. Use *teacher forcing* to decide the next input to the decoder.
6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.
7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate.

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
    '''
    inp is encoded input sentence 
    targ is encoded output sentence(batchsize,len_of_target_word)
    enc_hidden is zero vector of shape (batchsize,units)
    '''
    loss = 0

    with tf.GradientTape() as tape:
        #recoder or tracks variable values
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden
        
        # targ_lang.word_index['<start>'] - returns dictionary value of start token
        # dec_input is batch_size
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            #enc_output is list of all hidden states of encoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            
            # this is simply the loss between t th predicted word and its prediction
            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            # expand_dims is to change dimention such that it can act as timesteps
            dec_input = tf.expand_dims(targ[:, t], 1)
    
    # explanationn of this part https://www.tensorflow.org/guide/effective_tf2
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

## operation code 

In [44]:
EPOCHS = 60

for epoch in range(EPOCHS):
    start = time.time()

    # returns a zero vector of shape(batch_size,units)
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    # dataset is tf.data.Dataset() object.
    for (batch, (inp, targ)) in enumerate( dataset.take(steps_per_epoch) ):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1 Loss 1.8750
Time taken for 1 epoch 96.70767164230347 sec

Epoch 2 Loss 1.6766
Time taken for 1 epoch 15.303346633911133 sec

Epoch 3 Loss 1.5766
Time taken for 1 epoch 14.641101360321045 sec

Epoch 4 Loss 1.5041
Time taken for 1 epoch 15.266671180725098 sec

Epoch 5 Loss 1.4542
Time taken for 1 epoch 14.642654418945312 sec

Epoch 6 Loss 1.4022
Time taken for 1 epoch 15.268167495727539 sec

Epoch 7 Loss 1.3385
Time taken for 1 epoch 14.651453733444214 sec

Epoch 8 Loss 1.2546
Time taken for 1 epoch 15.274116277694702 sec

Epoch 9 Loss 1.1677
Time taken for 1 epoch 14.643583536148071 sec

Epoch 10 Loss 1.1079
Time taken for 1 epoch 15.231248378753662 sec

Epoch 11 Loss 1.0508
Time taken for 1 epoch 14.651736974716187 sec

Epoch 12 Loss 0.9856
Time taken for 1 epoch 15.308237552642822 sec

Epoch 13 Loss 0.9273
Time taken for 1 epoch 14.68724775314331 sec

Epoch 14 Loss 0.8710
Time taken f

## Evaluator
The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.

In [0]:
def preprocess_sentence(eng):
  eng = eng.translate(str.maketrans('', '', string.punctuation))
  eng = eng.lower()
  eng = '<start> ' + eng + ' <end>'
  return eng

In [0]:
def evaluate(sentence):
  # this is for the heat map
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  #convert input to its embeddings
  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_inp,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)

  # we'll keep appending the predicted word in this
  result = ''

  # got all the hidden states and last encoder hidden state
  hidden = [tf.zeros((1, units)),tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  # for first timestep decoder hidden state = encoder hidden state
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  # keep feeding words in decoder for no of words in target sentence
  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                          dec_hidden,
                                                          enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    # max value from probability of all words in target sentence
    predicted_id = tf.argmax(predictions[0]).numpy()
    # print(predictions.shape,predictions)
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [0]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [0]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
  # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

# Restore the latest checkpoint and test

In [61]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fae7a7f47f0>

## checking

In [0]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [73]:
translate('I will get back to you')


Input: <start> i will get back to you <end>
Predicted translation: मैं तुम्हारे साथ आऊँगा। <end> 


In [74]:
translate('This cat is nice')


Input: <start> this cat is nice <end>
Predicted translation: यह लिफ़्ट छटी मंज़िल तक ही है। <end> 


In [66]:
translate(u'i will not come')


Input: <start> i will not come <end>
Predicted translation: मैं तुम्हारे साथ आऊँगा। <end> 


In [67]:
translate(u'men are playing')


Input: <start> men are playing <end>
Predicted translation: वे हमारे साथ आएगा। <end> 


In [68]:
translate(u'school is closed')


Input: <start> school is closed <end>
Predicted translation: स्कूल अप्रैल में बोलिए। <end> 


In [69]:
translate(u'why does this happen to me')


Input: <start> why does this happen to me <end>
Predicted translation: तुम क्यों नहीं हो <end> 


In [72]:
translate('this is not working')


Input: <start> this is not working <end>
Predicted translation: यह भी यकीन है। <end> 
