# Neural Machine Translation with Bidirectional Encoder-Decoder LSTMs:

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [2]:
# Toy parallel corpus used throughout the notebook: each entry is a two-item
# list [english_phrase, french_phrase].
# NOTE(review): several French strings look misspelled or unaccented
# (e.g. "t'appele", "Que ce que tu fait") -- confirm whether that is intentional
# before relying on this data for training.
dataset = [
    ["I love you", "je t'aime"],
    ["What is your name ?", "Comment tu t'appele ?"],
    ["How old are you", "Quel age as tu"],
    ["why is the world so dumb", "pourquoi le monde est si idiot"],
    ["I found the dog", "j'ai trouve le chien"],
    ["He died", "Il est mort"],
    ["I kissed him", "Je l'ai embrasse"],
    ["New-York is a beautiful city", "New-York est une belle ville"],
    ["what are you doing", "Que ce que tu fait"],
    ["I lost hope", "J'ai perdu l'espoir"],
    ["Donald Trump is the worst president", "Donald Trump est le pire president"],
    ["Do you feel bad", "Tu te sens mauvais"],
    ["I found an apple at my desk", "J'ai trouve une pomme sur mon bureau"],
    ["You broke the mug", "T'as casse le mug"],
    ["Do you hate me ?", "Tu me deteste ?"]
]

# Data Preprocessing:
---
Before we jump into the model, the data has to be preprocessed.

## 1. Similar Words Tokenization:
There's no gain in having three separate word vectors for New-York, Havana and Tokyo: they are all city names. The same goes for America, Germany and Russia, which are all country names. The dataset would have to be huge to include training examples for every city name, and our model will learn better if all it knows about such words is the category they belong to. This way, even if we do not have a training example for "I went to Tokyo", a model that has already been trained on "I went to Paris" will know how to translate the former.
## 2. Sequence Begin & End Tokens:
The Sequence Begin token and the encoding vector are fed to the decoder to trigger translation; the Sequence End token tells us when the translation is over.
## 3. Tokenization and Vocab Dictionary
Two vocab dictionaries are needed to transform our training data into valid inputs and outputs: one mapping each word to its token, and another mapping each token back to its word.

In [3]:
# Placeholder tokens, keyed by category name; each one stands in for every
# word of that category.
custom_tokens = dict(
    city_token='<city>',
    country_token='<country>',
    person_token='<person>',
)

# Lower-cased word -> key into `custom_tokens`, grouped by category.
# Group order (city, person, country) keeps the resulting key order stable.
custom_words = {
    **dict.fromkeys(('new-york', 'tokyo', 'paris'), 'city_token'),
    **dict.fromkeys(('donald', 'trump', 'hakim', 'andrew'), 'person_token'),
    **dict.fromkeys(('france', 'mexico'), 'country_token'),
}

Next, we'll define some helper functions to de/tokenize words and create the vocabs:

In [28]:
class TranslationVocab:
    """Word -> token-id vocabularies for a source/target language pair.

    Attributes:
        ov: original (source-language) vocab, a dict mapping word -> int id.
        tv: target-language vocab, a dict mapping word -> int id.
        ov_size: number of entries in ``ov``; also the next free id.
        tv_size: number of entries in ``tv``; also the next free id.

    Both vocabularies start with the reserved entries returned by
    ``generate_empty_vocab`` and grow by one id per previously unseen word.
    """

    def __init__(self):
        self.ov = self.generate_empty_vocab() # original_vocab
        self.tv = self.generate_empty_vocab() # target_vocab
        self.ov_size = len(self.ov)
        self.tv_size = len(self.tv)

    def load_from_dataset(self, dataset):
        """Add every word of every phrase pair to the two vocabularies.

        Args:
            dataset: a list of lists, each inner list holding 2 strings
                ['original', 'translation'].

        Returns:
            self, so the call can be chained onto the constructor.
        """
        for sample in dataset:
            original, target = sample[0], sample[1]

            # loop over original phrase
            for word in self._split_words(original):
                self.insert_word_to_vocab(self.ov, word, self.ov_size)

            # loop over target phrase
            for word in self._split_words(target):
                self.insert_word_to_vocab(self.tv, word, self.tv_size, is_target=True)
        return self

    def get_ov_token(self, word): # get orginal-vocab token of
        """Return the source-vocab id of ``word``, or the '<unknown>' id."""
        return self._lookup(self.ov, word)

    def get_tv_token(self, word): # get target-vocab token of
        """Return the target-vocab id of ``word``, or the '<unknown>' id."""
        return self._lookup(self.tv, word)

    def insert_word_to_vocab(self, vocab, word, index, is_target = False):
        """Store ``word`` in ``vocab`` under ``index`` if it is not there yet.

        Args:
            vocab: the dict to insert into.
            word: the key to insert.
            index: the id assigned to ``word`` when it is new.
            is_target: selects which size counter is bumped on insertion
                (``tv_size`` when True, ``ov_size`` otherwise).
        """
        if word not in vocab:
            vocab[word] = index
            if is_target:
                self.tv_size += 1
            else:
                self.ov_size += 1

    @staticmethod
    def _split_words(phrase):
        """Lower-case ``phrase`` and split it on runs of whitespace.

        Argument-less ``split()`` is used so that repeated, leading or
        trailing spaces (and empty phrases) never yield an empty-string
        "word" that would be given its own token id.
        """
        return phrase.lower().split()

    @staticmethod
    def _lookup(vocab, word):
        """Return ``vocab[word]``, falling back to the '<unknown>' id.

        An exact match wins; otherwise the lower-cased form is tried,
        because ``load_from_dataset`` stores every word lower-cased.
        """
        if word in vocab:
            return vocab[word]
        if isinstance(word, str):
            lowered = word.lower()
            if lowered in vocab:
                return vocab[lowered]
        return vocab['<unknown>']

    @staticmethod
    def generate_empty_vocab(): # later to be loaded from a file
        """Return a fresh dict holding only the reserved special tokens."""
        return {
            '<unknown>': 0,
            '<city>': 1,
            '<country>': 2,
            '<person>': 3,
        }

# Build both vocabularies from the toy dataset, then show the source vocab,
# the target vocab and a sample source-side lookup, separated by '---' lines.
translation_vocab = TranslationVocab()
translation_vocab.load_from_dataset(dataset)

for shown in (
    translation_vocab.ov,
    '---',
    translation_vocab.tv,
    '---',
    translation_vocab.get_ov_token('?'),
):
    print(shown)
  

{'<unknown>': 0, '<city>': 1, '<country>': 2, '<person>': 3, 'i': 4, 'love': 5, 'you': 6, 'what': 7, 'is': 8, 'your': 9, 'name': 10, '?': 11, 'how': 12, 'old': 13, 'are': 14, 'why': 15, 'the': 16, 'world': 17, 'so': 18, 'dumb': 19, 'found': 20, 'dog': 21, 'he': 22, 'died': 23, 'kissed': 24, 'him': 25, 'new-york': 26, 'a': 27, 'beautiful': 28, 'city': 29, 'doing': 30, 'lost': 31, 'hope': 32, 'donald': 33, 'trump': 34, 'worst': 35, 'president': 36, 'do': 37, 'feel': 38, 'bad': 39, 'an': 40, 'apple': 41, 'at': 42, 'my': 43, 'desk': 44, 'broke': 45, 'mug': 46, 'hate': 47, 'me': 48}
---
{'<unknown>': 0, '<city>': 1, '<country>': 2, '<person>': 3, 'je': 4, "t'aime": 5, 'comment': 6, 'tu': 7, "t'appele": 8, '?': 9, 'quel': 10, 'age': 11, 'as': 12, 'pourquoi': 13, 'le': 14, 'monde': 15, 'est': 16, 'si': 17, 'idiot': 18, "j'ai": 19, 'trouve': 20, 'chien': 21, 'il': 22, 'mort': 23, "l'ai": 24, 'embrasse': 25, 'new-york': 26, 'une': 27, 'belle': 28, 'ville': 29, 'que': 30, 'ce': 31, 'fait': 32, '