# Neural machine translation

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

## Download and prepare the dataset

We'll use a language dataset provided by http://www.manythings.org/anki/

In [2]:
#!wget http://www.manythings.org/anki/rus-eng.zip

In [3]:
#!mkdir rus-eng
#!unzip rus-eng.zip -d rus-eng/

In [4]:
#!ls /content/rus-eng/ -lah

In [5]:
# Download the file
#path_to_file = "./content/rus-eng/rus.txt"


path_to_file = './data/fra-eng/fra.txt'

In [6]:
def preprocess_sentence(w):
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w

In [7]:
preprocess_sentence("I can't go.")

'<start> I can t go . <end>'

In [8]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [9]:
en, ru = create_dataset(path_to_file, None)
print(en[-1])
print(ru[-1])

<start> If someone who doesn t know your background says that you sound like a native speaker , it means they probably noticed something about your speaking that made them realize you weren t a native speaker . In other words , you don t really sound like a native speaker . <end>
<start> Si quelqu un qui ne conna t pas vos ant c dents dit que vous parlez comme un locuteur natif , cela veut dire qu il a probablement remarqu quelque chose propos de votre locution qui lui a fait prendre conscience que vous n tes pas un locuteur natif . En d autres termes , vous ne parlez pas vraiment comme un locuteur natif . <end>


In [10]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    return tensor, lang_tokenizer

In [11]:
def load_dataset(path, num_examples=None):
    # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

### Limit the size of the dataset to experiment faster (optional)


In [12]:
len(en), len(ru)

(177210, 177210)

In [13]:
# Try experimenting with the size of that dataset
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [14]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [15]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [16]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
4 ----> je
12 ----> ne
118 ----> fais
8 ----> pas
110 ----> beaucoup
46 ----> tout
10 ----> a
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> i
23 ----> don
9 ----> t
17 ----> do
41 ----> all
15 ----> that
112 ----> much
3 ----> .
2 ----> <end>


### Create a tf.data dataset

In [17]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 21]), TensorShape([64, 13]))

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=False,
                                       return_state=False,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        state = self.gru(x, initial_state = hidden)
        return state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [21]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        # self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden):
        # enc_output shape == (batch_size, max_length, hidden_size)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x, initial_state=hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state

In [22]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)



In [23]:
decoder_sample_h.shape

TensorShape([64, 1024])

## Define the optimizer and the loss function

In [24]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

## Checkpoints (Object-based saving)

In [25]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [26]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden = decoder(dec_input, dec_hidden)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))

        variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [27]:
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6613
Epoch 1 Batch 100 Loss 2.3020
Epoch 1 Batch 200 Loss 2.0763
Epoch 1 Batch 300 Loss 1.7916
Epoch 1 Batch 400 Loss 1.6321
Epoch 1 Batch 500 Loss 1.6474
Epoch 1 Batch 600 Loss 1.4914
Epoch 1 Batch 700 Loss 1.5311
Epoch 1 Batch 800 Loss 1.4582
Epoch 1 Batch 900 Loss 1.3603
Epoch 1 Batch 1000 Loss 1.4814
Epoch 1 Batch 1100 Loss 1.2764
Epoch 1 Batch 1200 Loss 1.2583
Epoch 1 Loss 1.6714
Time taken for 1 epoch 84.20373487472534 sec

Epoch 2 Batch 0 Loss 1.1822
Epoch 2 Batch 100 Loss 1.2525
Epoch 2 Batch 200 Loss 1.1078
Epoch 2 Batch 300 Loss 1.0592
Epoch 2 Batch 400 Loss 0.9330
Epoch 2 Batch 500 Loss 0.9892
Epoch 2 Batch 600 Loss 0.9759
Epoch 2 Batch 700 Loss 1.0574
Epoch 2 Batch 800 Loss 0.9626
Epoch 2 Batch 900 Loss 0.8311
Epoch 2 Batch 1000 Loss 0.8489
Epoch 2 Batch 1100 Loss 0.8466
Epoch 2 Batch 1200 Loss 0.8485
Epoch 2 Loss 0.9861
Time taken for 1 epoch 78.92437314987183 sec

Epoch 3 Batch 0 Loss 0.7643
Epoch 3 Batch 100 Loss 0.7476
Epoch 3 Batch 200 Loss 0.549

Epoch 18 Batch 1100 Loss 0.0770
Epoch 18 Batch 1200 Loss 0.1260
Epoch 18 Loss 0.0662
Time taken for 1 epoch 74.06139516830444 sec

Epoch 19 Batch 0 Loss 0.0292
Epoch 19 Batch 100 Loss 0.0515
Epoch 19 Batch 200 Loss 0.0427
Epoch 19 Batch 300 Loss 0.0544
Epoch 19 Batch 400 Loss 0.0496
Epoch 19 Batch 500 Loss 0.0589
Epoch 19 Batch 600 Loss 0.0446
Epoch 19 Batch 700 Loss 0.0928
Epoch 19 Batch 800 Loss 0.0624
Epoch 19 Batch 900 Loss 0.0632
Epoch 19 Batch 1000 Loss 0.0726
Epoch 19 Batch 1100 Loss 0.0698
Epoch 19 Batch 1200 Loss 0.0561
Epoch 19 Loss 0.0644
Time taken for 1 epoch 70.82915616035461 sec

Epoch 20 Batch 0 Loss 0.0549
Epoch 20 Batch 100 Loss 0.0310
Epoch 20 Batch 200 Loss 0.0530
Epoch 20 Batch 300 Loss 0.0516
Epoch 20 Batch 400 Loss 0.0414
Epoch 20 Batch 500 Loss 0.0484
Epoch 20 Batch 600 Loss 0.0724
Epoch 20 Batch 700 Loss 0.0783
Epoch 20 Batch 800 Loss 0.1115
Epoch 20 Batch 900 Loss 0.0454
Epoch 20 Batch 1000 Loss 0.0792
Epoch 20 Batch 1100 Loss 0.0594
Epoch 20 Batch 1200 Loss 0

Epoch 36 Batch 200 Loss 0.0482
Epoch 36 Batch 300 Loss 0.0762
Epoch 36 Batch 400 Loss 0.0689
Epoch 36 Batch 500 Loss 0.0647
Epoch 36 Batch 600 Loss 0.0803
Epoch 36 Batch 700 Loss 0.0691
Epoch 36 Batch 800 Loss 0.0686
Epoch 36 Batch 900 Loss 0.0748
Epoch 36 Batch 1000 Loss 0.0984
Epoch 36 Batch 1100 Loss 0.0690
Epoch 36 Batch 1200 Loss 0.0731
Epoch 36 Loss 0.0635
Time taken for 1 epoch 74.35802960395813 sec

Epoch 37 Batch 0 Loss 0.0441
Epoch 37 Batch 100 Loss 0.0442
Epoch 37 Batch 200 Loss 0.0382
Epoch 37 Batch 300 Loss 0.0386
Epoch 37 Batch 400 Loss 0.0410
Epoch 37 Batch 500 Loss 0.0533
Epoch 37 Batch 600 Loss 0.0583
Epoch 37 Batch 700 Loss 0.0497
Epoch 37 Batch 800 Loss 0.0556
Epoch 37 Batch 900 Loss 0.0588
Epoch 37 Batch 1000 Loss 0.0452
Epoch 37 Batch 1100 Loss 0.0763
Epoch 37 Batch 1200 Loss 0.0728
Epoch 37 Loss 0.0618
Time taken for 1 epoch 70.7516736984253 sec

Epoch 38 Batch 0 Loss 0.0769
Epoch 38 Batch 100 Loss 0.0535
Epoch 38 Batch 200 Loss 0.0585
Epoch 38 Batch 300 Loss 0.06

## Translate

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [28]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)

        # storing the attention weights to plot later on
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [29]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

## Restore the latest checkpoint and test

In [30]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1ad74636e80>

In [64]:
translate('ne jamais dire jamais.')

Input: <start> ne jamais dire jamais . <end>
Predicted translation: never say never . <end> 


In [65]:
translate('où habites tu')

Input: <start> o habites tu <end>
Predicted translation: where do you live ? <end> 


In [66]:
translate('comment venir à paris ?')

Input: <start> comment venir paris ? <end>
Predicted translation: how was your trip ? <end> 


In [67]:
translate(u'essaye de le faire.')

Input: <start> essaye de le faire . <end>
Predicted translation: try to do that . <end> 


In [68]:
translate(u'comment se rendre au magasin')

Input: <start> comment se rendre au magasin <end>
Predicted translation: how i was the movie ? <end> 


In [69]:
translate(u'combien ça coûte?')

Input: <start> combien a co te ? <end>
Predicted translation: how much is it ? <end> 


In [70]:
translate(u"j'aime quand il neige.")

Input: <start> j aime quand il neige . <end>
Predicted translation: i like it when it s snowing . <end> 
