# French to English Translator using Deep Learning
This project aims to build a French to English translator using an artificial Recurrent Neural Network (RNN) called Long Short-Term Memory (LSTM).

In [1]:
%%capture
# install Tensorflow
!pip install tensorflow

# Loading spacy's French and English languages
!pip install -U spacy
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

In [2]:
# Import required packages
import numpy as np
import pandas as pd
import fr_core_news_sm
import en_core_web_sm
import tensorflow as tf
tf.__version__

'2.15.0'

### Importing data

The data comes from a `.txt` file containing more than 160000 sentences with their translation separated by a tab (`\t`).

The data can be found on this link: https://go.aws/38ECHUB

For performance purposes, we will not take the whole dataset but a sample of 5000 sentences instead. This will allow us to faster iterate and avoid bugs related to our need for computer power.

In [3]:
# Loading data
doc = pd.read_csv("https://go.aws/38ECHUB", delimiter="\t", header=None)
doc.head()

Unnamed: 0,0,1
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Wow!,Ça alors !


In [4]:
doc.shape

(160538, 2)

In [5]:
# Let's just take a sample of 5000 sentences to avoid slowness
doc = doc.sample(5000)

In [6]:
# Loading of the entire corpus of French and English sentences
fr_corpus = " ".join(doc.iloc[:, 1].to_list())
en_corpus = " ".join(doc.iloc[:, 0].to_list())

### Preprocessing

The main purpose of the preprocessing step is to express each French entry sentence in a sequence of clues.

i.e.:

* I'm sick ---> $[123, 21, 34, 0, 0, 0, 0, 0]$

This gives a *shape* -> `(batch_size, max_len_of_a_sentence)`.

The clues correspond to a number that we will have to assign for each word token.

The zeros correspond to what are called [*padded_sequences*](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences) which allow all word sequences to have the same length (mandatory for our algorithm).

The transformation of our target sentences will not be exactly the same as that of our input sentences. In addition to all the steps we will have performed for the input sentences, we will also have to *categorize* our target sentences. In other words, an example tensor would look like :

* I am sick ---> $\begin{bmatrix} 1&0&0&...&0&0 \\ 0&0&0&...&1&0 \\ ... \\ 0&1&0&...&0&0 \end{bmatrix}$

This gives a *shape* -> `(batch_size, max_len_of_an_english_sentence, num_of_classes)`.

To do this, we are going to use :

* `Spacy` for Tokenization
* `Tensorflow` for [padded_sequence](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences) & [categorization](https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical?hl=en)

In [7]:
# Loading both corpora into spacy
nlp_fr = fr_core_news_sm.load()
nlp_fr.max_length = len(fr_corpus)

nlp_en = en_core_web_sm.load()
nlp_en.max_length = len(en_corpus)

fr_doc = nlp_fr(fr_corpus)
en_doc = nlp_en(en_corpus)

In [8]:
# Tokenization of each sentence via spacy
doc["fr_tokens"] = doc.iloc[:, 1].apply(nlp_fr.tokenizer)
doc["en_tokens"] = doc.iloc[:, 0].apply(nlp_en.tokenizer)

In [9]:
doc.tail()

Unnamed: 0,0,1,fr_tokens,en_tokens
33067,I love your daughter.,J'aime votre fille.,"(J', aime, votre, fille, .)","(I, love, your, daughter, .)"
132482,They supplied the war victims with food.,Ils dispensèrent de la nourriture aux victimes...,"(Ils, dispensèrent, de, la, nourriture, aux, v...","(They, supplied, the, war, victims, with, food..."
88694,I thought that you could swim.,Je pensais que tu pouvais nager.,"(Je, pensais, que, tu, pouvais, nager, .)","(I, thought, that, you, could, swim, .)"
68364,Do you plan to go overseas?,Prévois-tu de te rendre outre-mer ?,"(Prévois, -, tu, de, te, rendre, outre-mer, ?)","(Do, you, plan, to, go, overseas, ?)"
61764,Everybody had a good year.,Tout le monde a eu une bonne année.,"(Tout, le, monde, a, eu, une, bonne, année, .)","(Everybody, had, a, good, year, .)"


In [10]:
# Creation of a set() that will take all the unique tokens from our text corpus
en_tokens = [token.text for token in en_doc]
en_vocabulary_set= set(en_tokens)
en_vocab_size = len(en_vocabulary_set)
print(en_vocab_size)

3595


In [11]:
# Same thing for French
fr_tokens = [token.text for token in fr_doc]
fr_vocabulary_set= set(fr_tokens)
fr_vocab_size = len(fr_vocabulary_set)
print(fr_vocab_size)

5018


In [12]:
# Creation of an id for each token
all_en_tokens = {en_token: i + 1 for i, en_token in enumerate(en_vocabulary_set)}
all_fr_tokens = {fr_token: i + 1 for i, fr_token in enumerate(fr_vocabulary_set)}
# RQ: We take at i+1 to leave the value 0 for the creation of the padded_sequences

In [13]:
# Creation of functions that will create a vector of indices for each of the token sequences
def en_tokens_to_index(tokens):
    return [all_en_tokens[token.text] for token in tokens]

def fr_tokens_to_index(tokens):
    return [all_fr_tokens[token.text] for token in tokens]

In [14]:
# Transformation of tokens into indices
doc["fr_indices"] = doc["fr_tokens"].apply(fr_tokens_to_index)
doc["en_indices"] = doc["en_tokens"].apply(en_tokens_to_index)

In [15]:
doc.tail()

Unnamed: 0,0,1,fr_tokens,en_tokens,fr_indices,en_indices
33067,I love your daughter.,J'aime votre fille.,"(J', aime, votre, fille, .)","(I, love, your, daughter, .)","[1294, 4302, 2740, 3885, 1725]","[15, 41, 1339, 61, 1408]"
132482,They supplied the war victims with food.,Ils dispensèrent de la nourriture aux victimes...,"(Ils, dispensèrent, de, la, nourriture, aux, v...","(They, supplied, the, war, victims, with, food...","[2575, 4414, 2342, 315, 1466, 5013, 1106, 2342...","[3544, 2841, 1338, 1368, 3310, 2504, 2303, 1408]"
88694,I thought that you could swim.,Je pensais que tu pouvais nager.,"(Je, pensais, que, tu, pouvais, nager, .)","(I, thought, that, you, could, swim, .)","[3813, 522, 982, 3739, 4728, 2891, 1725]","[15, 2814, 138, 3535, 558, 3134, 1408]"
68364,Do you plan to go overseas?,Prévois-tu de te rendre outre-mer ?,"(Prévois, -, tu, de, te, rendre, outre-mer, ?)","(Do, you, plan, to, go, overseas, ?)","[2630, 3403, 3739, 2342, 2133, 421, 2416, 3905]","[3270, 3535, 451, 386, 2567, 235, 441]"
61764,Everybody had a good year.,Tout le monde a eu une bonne année.,"(Tout, le, monde, a, eu, une, bonne, année, .)","(Everybody, had, a, good, year, .)","[2269, 4755, 352, 979, 129, 988, 2969, 4853, 1...","[2640, 2893, 2702, 393, 871, 1408]"




TypeError: '<' not supported between instances of 'spacy.tokens.doc.Doc' and 'spacy.tokens.doc.Doc'

from matplotlib import pyplot as plt
import seaborn as sns
_df_2.groupby('fr_tokens').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

TypeError: '<' not supported between instances of 'spacy.tokens.doc.Doc' and 'spacy.tokens.doc.Doc'

from matplotlib import pyplot as plt
import seaborn as sns
_df_3.groupby('en_tokens').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

KeyError: '1'

from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
plt.subplots(figsize=(8, 8))
df_2dhist = pd.DataFrame({
    x_label: grp['fr_tokens'].value_counts()
    for x_label, grp in _df_5.groupby('1')
})
sns.heatmap(df_2dhist, cmap='viridis')
plt.xlabel('1')
_ = plt.ylabel('fr_tokens')

TypeError: '<' not supported between instances of 'spacy.tokens.doc.Doc' and 'spacy.tokens.doc.Doc'

from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
plt.subplots(figsize=(8, 8))
df_2dhist = pd.DataFrame({
    x_label: grp['en_tokens'].value_counts()
    for x_label, grp in _df_6.groupby('fr_tokens')
})
sns.heatmap(df_2dhist, cmap='viridis')
plt.xlabel('fr_tokens')
_ = plt.ylabel('en_tokens')

In [16]:
# Use of Keras to create token sequences of the same length
padded_fr_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["fr_indices"], padding="post")
padded_en_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["en_indices"], padding="post")

In [17]:
# Visualization of the shape of one of the tensors
padded_fr_indices.shape

(5000, 37)

In [18]:
padded_en_indices.shape

(5000, 28)

**RQ:** the maximum length of the english sentences (28) is different from the maximum length of the french sentences (27).

In [19]:
doc["fr_indices"].apply(len).max()

37

In [20]:
doc["en_indices"].apply(len).max()

28

In [21]:
# Application of the categorization of the target variable
binarized_en_indices = tf.keras.utils.to_categorical(padded_en_indices, num_classes=en_vocab_size+1)
binarized_en_indices.shape

(5000, 28, 3596)

In [22]:
# Creation of tf.data.Dataset for each of the French and English tensors
fr_ds = tf.data.Dataset.from_tensor_slices(padded_fr_indices)
en_ds = tf.data.Dataset.from_tensor_slices(binarized_en_indices)

In [23]:
# Create a complete tensorflow dataset
tf_ds = tf.data.Dataset.zip((fr_ds, en_ds))

In [24]:
next(iter(tf_ds))

(<tf.Tensor: shape=(37,), dtype=int32, numpy=
 array([1294, 3059,  646, 1149, 1725,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0], dtype=int32)>,
 <tf.Tensor: shape=(28, 3596), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)>)

In [25]:
# Shuffle & Batch
BATCH_SIZE = 32

tf_ds = tf_ds.shuffle(len(doc)).batch(BATCH_SIZE)

In [26]:
# Train Test Split
TAKE_SIZE = int(0.7 * len(doc) / BATCH_SIZE)

train_data = tf_ds.take(TAKE_SIZE)
test_data = tf_ds.skip(TAKE_SIZE)

### Modeling

Let's move on to modeling. To create our model, we are going to use:

* A layer of [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding?hl=en)
* 2 [LSTM](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM?hl=en) & [Bidirectional](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Bidirectional?hl=en) layers
* A [RepeatVector](https://www.tensorflow.org/api_docs/python/tf/keras/layers/RepeatVector?hl=en) layer
* A [Dense](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense?hl=en) & [TimeDistributed](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TimeDistributed?hl=en) output layer

The objective being to have in input, a tensor of dimension `(batch_size, max_len_of_french_sentences)` and in output a tensor of dimension `(batch_size, max_len_of_english_sentences, num_of_classes)` where obviously `max_len_of_english_sentences` $\neq $ `max_len_of_french_sentences`.

In [27]:
# Create the model
model = tf.keras.Sequential([
                  # Input Word Embedding layer
                  tf.keras.layers.Embedding(fr_vocab_size + 1, 64, mask_zero=True),

                  # LSTM Bidirectional layer
                  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),

                  # LSTM Bidirectionnal new layer
                  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),

                  # Repeat Vector
                  tf.keras.layers.RepeatVector(binarized_en_indices.shape[1]),

                  # LSTM new layer
                  tf.keras.layers.LSTM(32, return_sequences=True),

                  # Output layer with number of output neurons equal to class number with softmax function
                  tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(en_vocab_size+1, activation="softmax"))

])

In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          321216    
                                                                 
 bidirectional (Bidirection  (None, None, 128)         66048     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 repeat_vector (RepeatVecto  (None, 28, 128)           0         
 r)                                                              
                                                                 
 lstm_2 (LSTM)               (None, 28, 32)            20608     
                                                        

In [29]:
# "Random" prediction to test our model
input_text, output_text = next(iter(train_data))
print(input_text.numpy().shape)
print(model.predict(input_text).shape)
print(output_text.numpy().shape)

(32, 37)
(32, 28, 3596)
(32, 28, 3596)


In [30]:
# Let's create a learning rate schedule to decrease the learning rate as we train the model
initial_learning_rate = 0.001

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1090,
    decay_rate=0.96,
    staircase=True)

# Using a simple compiler with an Adam optimizer to compute our gradients
optimizer= tf.keras.optimizers.Adam(
    learning_rate = lr_schedule
)

model.compile(optimizer=optimizer,
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.keras.metrics.CategoricalAccuracy()])

In [31]:
# Application of the model on 200 epochs
history = model.fit(train_data,
                    validation_data=test_data,
                    epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [32]:
# Testing a translation
for input_text, translation in test_data.take(1):
    pred = np.argmax(model.predict(input_text), axis=-1)



In [33]:
# CAUTION: it's possible because there is only one key for each value
indice_to_fr_token = dict(zip(all_fr_tokens.values(), all_fr_tokens.keys()))
indice_to_en_token = dict(zip(all_fr_tokens.values(), all_en_tokens.keys()))

In [34]:
# French Sentence
for indice in input_text[0]:
    if indice == 0:
        break
    print(indice_to_fr_token[indice.numpy()])

Ses
idées
sur
l'
éducation
sont
très
différentes
des
miennes
.


In [35]:
# Real English Sentence
for indice in np.argmax(translation, axis=-1)[0]:
    if indice == 0:
        break
    print(indice_to_en_token[indice])

Her
ideas
on
education
are
very
different
from
mine
.


In [36]:
# Sentence translated into English by the model
for indice in pred[0]:
    if indice == 0:
        break
    print(indice_to_en_token[indice])

He
is
a
to
the
the
the
the
.
.


In [37]:
# Training on 500 more epochs
history_2 = model.fit(train_data,
                      validation_data=test_data,
                      epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [40]:
# Testing on new translations

for input_text, translation in test_data.take(1):
    pred = np.argmax(model.predict(input_text), axis=-1)

# French
print("Input Sentence:", end=" ")
for indice in input_text[4]:
    if indice == 0:
        break
    print(indice_to_fr_token[indice.numpy()], end=" ")

# True
print("\nTrue Translation:", end=" ")
for indice in np.argmax(translation, axis=-1)[2]:
    if indice == 0:
        break
    print(indice_to_en_token[indice], end=" ")

# Pred
print("\nModel Translation:", end=" ")
for indice in pred[2]:
    if indice == 0:
        break
    print(indice_to_en_token[indice], end=" ")

Input Sentence: Je pars . 
True Translation: We 're not sure . 
Model Translation: You 're not not . 

After 2 training steps, the model reached an accuracy of 0.89 on the validation set. Nevertheless, we still notice some weaknesses guessing the time used or translating the verbal group.

A solution may be first to run the model on the whole dataset to increase the amount of training data, and then adapt the number of layers in our neural network.