# Neural Machine Translation with Bidirectional Encoder-Decoder LSTMs:

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [2]:
# Toy parallel corpus used throughout the notebook.
# Each entry is a two-element list: [English phrase, French translation].
# The French side is written without accents, and punctuation such as "?" is
# separated by spaces so that it becomes a token of its own when phrases are
# split into words.
dataset = [
    ["I love you", "je t'aime"],
    ["What is your name ?", "Comment tu t'appele ?"],
    ["How old are you", "Quel age as tu"],
    ["why is the world so dumb", "pourquoi le monde est si idiot"],
    ["I found the dog", "j'ai trouve le chien"],
    ["He died", "Il est mort"],
    ["I kissed him", "Je l'ai embrasse"],
    ["New-York is a beautiful city", "New-York est une belle ville"],
    ["what are you doing", "Que ce que tu fait"],
    ["I lost hope", "J'ai perdu l'espoir"],
    ["Donald Trump is the worst president", "Donald Trump est le pire president"],
    ["Do you feel bad", "Tu te sens mauvais"],
    ["I found an apple at my desk", "J'ai trouve une pomme sur mon bureau"],
    ["You broke the mug", "T'as casse le mug"],
    ["Do you hate me ?", "Tu me deteste ?"]
]

# Data Preprocessing:
---
Before we jump into the model, the data has to be preprocessed.

## 1. Similar Words Tokenization:
There's no gain in having three separate word vectors for New-York, Havana and Tokyo: they are all city names. The same goes for America, Germany and Russia, which are all country names. The dataset would be huge if we included training examples for every city name, and the model will learn better if all it knows about those words is the category they belong to. This way, even if we have no training example for "I went to Tokyo", a model that has already been trained on "I went to Paris" will know how to translate the former.
## 2. Sequence Begin & End Tokens:
The sequence-begin token and the encoding vector are fed to the decoder to trigger translation; the sequence-end token tells us when the translation is over.
## 3. Tokenization and Vocab Dictionary
Two vocab dictionaries are needed to transform our training data into valid inputs and outputs: one mapping each word to its token, and another mapping each token back to its word.

In [3]:
# Category key -> placeholder token string.
# The placeholder strings match reserved entries of the vocabularies built by
# TranslationVocab.generate_empty_vocab ('<city>', '<country>', '<person>').
custom_tokens = {
    'city_token': '<city>',
    'country_token': '<country>',
    'person_token': '<person>'
}

# Lower-cased word -> category key (a key of `custom_tokens`).
# NOTE(review): nothing in the cells visible here reads these two mappings yet;
# words such as 'new-york' or 'donald' are currently inserted into the vocab as
# ordinary words. Confirm whether the substitution happens later in the notebook.
custom_words = {
    'new-york': 'city_token',
    'tokyo': 'city_token',
    'paris': 'city_token',
    'donald': 'person_token',
    'trump': 'person_token',
    'hakim': 'person_token',
    'andrew': 'person_token',
    'france': 'country_token',
    'mexico': 'country_token',
}

Next, we'll define some helper functions to de/tokenize words and create the vocabs:

In [68]:
class TranslationVocab:
    """Word-level vocabularies and tokenization helpers for a phrase pair corpus.

    Two independent ``word -> token`` dictionaries are kept: one for the
    original (source) phrases and one for the target phrases. Both start from
    the reserved tokens returned by ``generate_empty_vocab`` and grow as
    ``load_from_dataset`` encounters new words.

    Words are obtained by lower-casing a phrase and splitting it on runs of
    whitespace, so repeated spaces, tabs or an empty phrase never produce an
    empty-string "word".
    """

    def __init__(self):
        self.ov = self.generate_empty_vocab() # original_vocab
        self.tv = self.generate_empty_vocab() # target_vocab
        # Next free token index of each vocab; advanced by insert_word_to_vocab.
        self.ov_size = len(self.ov)
        self.tv_size = len(self.tv)
        self.o_max = 0 # original phrases data maximum sequence
        self.t_max = 0 # target phrases data maximum sequence

    @staticmethod
    def _split_words(phrase):
        """Lower-case ``phrase`` and split it on any run of whitespace."""
        return phrase.lower().split()

    def load_from_dataset(self, dataset):
        """Grow both vocabularies and the max sequence lengths from ``dataset``.

        Args:
            dataset: iterable of two-element samples ``[original, target]``,
                both plain strings.

        Returns:
            ``self``, so the call can be chained after the constructor.
        """
        for original, target in dataset:
            o_splitted = self._split_words(original)
            t_splitted = self._split_words(target)

            for word in o_splitted:
                self.insert_word_to_vocab(self.ov, word, self.ov_size)

            for word in t_splitted:
                self.insert_word_to_vocab(self.tv, word, self.tv_size, is_target = True)

            # update max-lengths
            self.o_max = max(len(o_splitted), self.o_max)
            self.t_max = max(len(t_splitted) + 2, self.t_max) # +2 for <start> & <end> tokens

        return self

    def get_token_of(self, word, is_target = False):
        """Return the token of ``word``, or the ``<unknown>`` token if absent.

        Args:
            word: the (already lower-cased) word to look up.
            is_target: look up in the target vocab instead of the original one.
        """
        vocab = self.tv if is_target else self.ov
        return vocab.get(word, vocab['<unknown>'])

    def tokenize_phrase(self, phrase, is_target = False, transformation = lambda x : '<start> ' + x + ' <end>'):
        """Convert ``phrase`` into a list of tokens, right-padded with ``<pad>``.

        Args:
            phrase: the raw phrase.
            is_target: use the target vocab and ``t_max`` instead of the
                original vocab and ``o_max``.
            transformation: callable applied to the lower-cased, stripped
                phrase before splitting; by default it wraps the phrase in
                ``<start>`` / ``<end>``. Pass ``None`` to skip it.

        Returns:
            A list of integer tokens. It is padded up to the relevant maximum
            length; a phrase longer than that maximum is returned unpadded and
            is NOT truncated.
        """
        phrase = phrase.lower().strip()
        if transformation is not None:
            phrase = transformation(phrase)

        # split() (no argument) ignores repeated/leading/trailing whitespace,
        # so no empty-string word is ever looked up.
        tokenized = [self.get_token_of(word, is_target) for word in phrase.split()]

        # pad to max_len, using the <pad> token of the vocab being tokenized
        vocab = self.tv if is_target else self.ov
        max_len = self.t_max if is_target else self.o_max
        missing = max_len - len(tokenized)
        if missing > 0:
            return tokenized + [vocab['<pad>']] * missing
        return tokenized

    def tokenize_dataset(self, dataset): # return numpy arrays
        """Tokenize every sample of ``dataset``.

        Original phrases are tokenized without ``<start>``/``<end>``; target
        phrases use the default transformation of ``tokenize_phrase``.

        Returns:
            A tuple ``(X, y)`` of numpy arrays built from per-sample float32
            token arrays (originals in ``X``, targets in ``y``).
        """
        X = []
        y = []
        for original, target in dataset:
            X.append(np.array(self.tokenize_phrase(original, transformation = None), dtype=np.float32))
            y.append(np.array(self.tokenize_phrase(target, is_target = True), dtype=np.float32))
        return np.array(X), np.array(y)

    def insert_word_to_vocab(self, vocab, word, index, is_target = False):
        """Register ``word`` in ``vocab`` under ``index`` if it is not there yet.

        Args:
            vocab: the dictionary to update (``self.ov`` or ``self.tv``).
            word: the word to insert.
            index: token to assign; callers in this class pass the current
                size counter of the matching vocab.
            is_target: selects which size counter is advanced after an
                insertion (``tv_size`` when true, ``ov_size`` otherwise).
        """
        if word not in vocab:
            vocab[word] = index
            if is_target:
                self.tv_size += 1
            else:
                self.ov_size += 1

    @staticmethod
    def generate_empty_vocab(): # later to be loaded from a file
        """Return a fresh vocab holding only the reserved tokens (indices 0-6)."""
        return {
            '<unknown>': 0,
            '<city>': 1,
            '<country>': 2,
            '<person>': 3,
            '<start>': 4,
            '<end>': 5,
            '<pad>': 6,
        }

# Build both vocabularies from the toy corpus.
translation_vocab = TranslationVocab().load_from_dataset(dataset)

# Sanity checks, printed one per line with a '---' rule between them:
# the two vocabularies, a target-vocab lookup, and one tokenized source phrase.
sanity_checks = (
    translation_vocab.ov,
    translation_vocab.tv,
    translation_vocab.get_token_of('?', True),
    translation_vocab.tokenize_phrase('I love you', is_target = False, transformation = None),
)
print(*sanity_checks, sep='\n---\n')

{'<unknown>': 0, '<city>': 1, '<country>': 2, '<person>': 3, '<start>': 4, '<end>': 5, '<pad>': 6, 'i': 7, 'love': 8, 'you': 9, 'what': 10, 'is': 11, 'your': 12, 'name': 13, '?': 14, 'how': 15, 'old': 16, 'are': 17, 'why': 18, 'the': 19, 'world': 20, 'so': 21, 'dumb': 22, 'found': 23, 'dog': 24, 'he': 25, 'died': 26, 'kissed': 27, 'him': 28, 'new-york': 29, 'a': 30, 'beautiful': 31, 'city': 32, 'doing': 33, 'lost': 34, 'hope': 35, 'donald': 36, 'trump': 37, 'worst': 38, 'president': 39, 'do': 40, 'feel': 41, 'bad': 42, 'an': 43, 'apple': 44, 'at': 45, 'my': 46, 'desk': 47, 'broke': 48, 'mug': 49, 'hate': 50, 'me': 51}
---
{'<unknown>': 0, '<city>': 1, '<country>': 2, '<person>': 3, '<start>': 4, '<end>': 5, '<pad>': 6, 'je': 7, "t'aime": 8, 'comment': 9, 'tu': 10, "t'appele": 11, '?': 12, 'quel': 13, 'age': 14, 'as': 15, 'pourquoi': 16, 'le': 17, 'monde': 18, 'est': 19, 'si': 20, 'idiot': 21, "j'ai": 22, 'trouve': 23, 'chien': 24, 'il': 25, 'mort': 26, "l'ai": 27, 'embrasse': 28, 'new-

In [77]:
# Tokenize the whole corpus and append a trailing axis to each array,
# to get a n_samples x Sequence-Length x 1 shape.
X, y = (
    np.expand_dims(tokens, axis=2)
    for tokens in translation_vocab.tokenize_dataset(dataset)
)
print(f'training input shape = {X.shape}')
print(f'training output shape = {y.shape}')

training input shape = (15, 7, 1)
training output shape = (15, 9, 1)


# Encoder-Decoder Model