In [1]:
import numpy as np 
import pandas as pd 
import plotly.express as px
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from keras import layers
from colorama import Fore, Style

In [2]:
data = pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")

In [3]:
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   English words/sentences  175621 non-null  object
 1   French words/sentences   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [23]:
data["English Words in Sentence"] = (data["English words/sentences"].str.split().apply(len))
data["French Words in Sentence"] = (data["French words/sentences"].str.split().apply(len))

In [6]:
data

Unnamed: 0,English words/sentences,French words/sentences,English Words in Sentence,French Words in Sentence
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3
...,...,...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç...",34,47
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...,34,33
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...,37,47
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...,43,49


In [7]:
#plot
fig = px.histogram(
    data,
    x=["English Words in Sentence", "French Words in Sentence"],
    color_discrete_sequence=["#3f384a", "#e04c5f"],
    labels = {"variable": "Variable", "value": "Words in Sentence"},
    marginal = "box",
    barmode = "group",
    height=540,
    width=840,
    title="Words in Sentence",
)

fig.update_layout(
    font_color="#141B4D",
    title_font_size=18,
    plot_bgcolor="#F6F5F5",
    paper_bgcolor="#F6F5F5",
    bargap=0.2,
    bargroupgap=0.1,
    legend=dict(orientation="h", yanchor="bottom", xanchor="right", y=1.02, x=1),
    yaxis_title="Count",
)

fig.show()

In [24]:
from sklearn.model_selection import train_test_split

sentences_en = data["English words/sentences"].to_numpy()
sentences_fr = data["French words/sentences"].to_numpy()

sentences_en_train, sentences_en_valid, sentences_fr_train, sentences_fr_valid = train_test_split(sentences_en, sentences_fr, test_size=0.1, random_state=42)

In [25]:
def prepare_input_and_target(sentences_en, sentences_fr):
    return (sentences_en, "startofseq " + sentences_fr), sentences_fr + " endofseq"

def from_sentences_dataset(
    sentences_en,
    sentences_fr,
    batch_size=32,
    cache=True,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    dataset = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_fr))
    dataset = dataset.map(prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE)
    if cache:
        dataset = dataset.cache()
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size)

<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    margin-left: 8px;
    margin-right: 8px;
">
    <li>In the original paper, to implement Position Encoding, the sine and cosine functions were used:</br>
    \[PE_{(pos, 2i)}     = \sin\left(pos\big/10000^{2i\big/d_{model}}\right)\]
    \[PE_{(pos, 2i + 1)} = \cos\left(pos\big/10000^{2i\big/d_{model}}\right)\]
    where $pos$ is the position, $i$ is the dimension and $d_{model}$ has the same dimension as the embeddings.</li>

In [26]:
class PositionalEncoding(layers.Layer):
    def __init__(self, max_sentence_len=50, embedding_size=256, dtype=tf.float32, **kwargs):
        
        super().__init__(dtype=dtype, **kwargs)
        if not embedding_size % 2 == 0:
            raise ValueError("The `embedding_size` must be even.")

        p, i = np.meshgrid(np.arange(max_sentence_len), np.arange(embedding_size // 2))
        #shape of p and i: (embedding_size // 2, max_sentence_len)
        pos_emb = np.empty((1, max_sentence_len, embedding_size))
        #shape: (1, max_sentence_len, embedding_size)
        pos_emb[:, :, 0::2] = np.sin(p / 10_000 ** (2 * i / embedding_size)).T
        #shape of np.sin(p / 10_000 ** (2 * i / embedding_size)): (embedding_size // 2, max_sentence_len)
        #shape: (1, max_sentence_len, embedding_size // 2)
        pos_emb[:, :, 1::2] = np.cos(p / 10_000 ** (2 * i / embedding_size)).T
        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        batch_max_length = tf.shape(inputs)[1]
        return inputs + self.positional_embedding[:, :batch_max_length]

In [27]:
class Encoder(layers.Layer):
    def __init__(
        self,
        embedding_size=256,
        n_attention_heads=8,
        n_units_dense=256,
        dropout_rate=0.2,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.multi_head_attention = layers.MultiHeadAttention(
            n_attention_heads, embedding_size, dropout=dropout_rate
        )
        
        self.feed_forward = keras.Sequential([
            layers.Dense(n_units_dense, activation ="relu", kernel_initializer="he_normal"),
            layers.Dense(embedding_size, kernel_initializer="he_normal"),
            layers.Dropout(dropout_rate),
        ])
        
        self.add = layers.Add()
        self.normalization = layers.LayerNormalization()
        
    def call(self, inputs, mask=None): 
        x = inputs
        skip_x = x
        x = self.multi_head_attention(x, value = x, attention_mask = mask)
        x = self.normalization(self.add([x, skip_x]))
        
        x = skip_x
        x = self.feed_forward(x)
        return self.normalization(self.add([x, skip_x]))

In [28]:
class Decoder(layers.Layer):
    def __init__(
        self,
        embedding_size=256,
        n_attention_heads=8,
        n_units_dense=256,
        dropout_rate=0.2,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.masked_multi_head_attention = layers.MultiHeadAttention(
            n_attention_heads, embedding_size, dropout=dropout_rate
        )
        self.multi_head_attention = layers.MultiHeadAttention(
            n_attention_heads, embedding_size, dropout=dropout_rate
        )
        self.feed_forward = keras.Sequential([
            layers.Dense(n_units_dense, activation="relu", kernel_initializer="he_normal"),
            layers.Dense(embedding_size, kernel_initializer="he_normal"),
            layers.Dropout(dropout_rate),
        ])
        self.add = layers.Add()
        self.normalization = layers.LayerNormalization()
        
    def call(self, inputs, mask=None):
        decoder_mask, encoder_mask = mask 
        
        x, encoder_output = inputs
        x_skip = x
        x = self.masked_multi_head_attention(x, value=x, attention_mask=decoder_mask)
        x = self.normalization(self.add([x, x_skip]))
        
        x_skip = x
        x = self.multi_head_attention(x, value=encoder_output, attention_mask=encoder_mask)
        x = self.normalization(self.add([x, x_skip]))
        
        x_skip = x
        x = self.feed_forward(x)
        return self.normalization(self.add([x, x_skip]))

In [29]:
class Transformer(keras.Model):
    def __init__(
        self,
        vocabulary_size=5000,
        max_sentence_len=50,
        embedding_size=256,
        n_blocks=1,
        n_attention_heads=8,
        n_units_dense=256,
        dropout_rate=0.2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len
        
        self.vectorization_en = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_fr = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        
        self.encoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.positional_encoding = PositionalEncoding(max_sentence_len, embedding_size)
        
        self.encoder_blocks = [
            Encoder(embedding_size, n_attention_heads, n_units_dense, dropout_rate)
            for _ in range(n_blocks)
        ]
        self.decoder_blocks = [
            Decoder(embedding_size, n_attention_heads, n_units_dense, dropout_rate)
            for _ in range(n_blocks)
        ]
        
        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")
        
    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        #vectorize sentence
        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_fr(decoder_inputs)
        #embedding id
        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)
        #positional embedding
        encoder_pos_embeddings = self.positional_encoding(encoder_embeddings)
        decoder_pos_embeddings = self.positional_encoding(decoder_embeddings)
        #pad_mask
        encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
        
        decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
        batch_max_len_decoder = tf.shape(decoder_embeddings)[1]
        decoder_causal_mask = tf.linalg.band_part(tf.ones((batch_max_len_decoder, batch_max_len_decoder), tf.bool), -1, 0)
        decoder_mask = decoder_causal_mask & decoder_pad_mask
        
        #encoder
        x = encoder_pos_embeddings
        for encoder_block in self.encoder_blocks:
            x = encoder_block(x, mask=encoder_pad_mask)
            
        #decoder
        encoder_output = x
        x = decoder_pos_embeddings
        for decoder_block in self.decoder_blocks:
            x = decoder_block([x, encoder_output], mask=[decoder_mask, encoder_pad_mask])

        return self.output_layer(x)

In [14]:
CLR = (Style.BRIGHT + Fore.BLACK) 
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN

class ColoramaVerbose(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(
            f"{CLR}Epoch: {RED}{epoch + 1:02d}{CLR} -",
            f"{CLR}loss: {RED}{logs['loss']:.5f}{CLR} -",
            f"{CLR}accuracy: {RED}{logs['accuracy']:.5f}{CLR} -",
            f"{CLR}val_loss: {RED}{logs['val_loss']:.5f}{CLR} -",
            f"{CLR}val_accuracy: {RED}{logs['val_accuracy']:.5f}",
        )

In [30]:
def adapt_compile_and_fit(
    model,
    train_dataset,
    valid_dataset,
    n_epochs=1,
    n_patience=5,
    init_lr=0.001,
    lr_decay_rate=0.1,
    colorama_verbose=False,
):
    
    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target : sentences[0],
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    
    model.vectorization_fr.adapt(
        train_dataset.map(
            lambda sentences, target : sentences[1] + " endofseq" ,
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    
    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)
    
    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)
    
    early_stopping_cb = keras.callbacks.EarlyStopping(
        monitor = "val_accuracy",
        patience = n_patience,
        restore_best_weights=True
    )
    
    n_decay_steps = n_epochs * len(list(train_dataset_prepared))
    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=n_decay_steps,
        decay_rate=lr_decay_rate,
    )
    
    model_callbacks = [early_stopping_cb]
    verbose_level = 1
    if colorama_verbose:
        model_callbacks.append(ColoramaVerbose())
        verbose_level = 0
        
    model.compile(
        loss = "sparse_categorical_crossentropy",
        optimizer = keras.optimizers.RMSprop(learning_rate = scheduled_lr),
        metrics = ["accuracy"],
    )
    
    model.fit(
        train_dataset_prepared,
        epochs = n_epochs,
        validation_data=valid_dataset_prepared,
        callbacks= model_callbacks,
        verbose = verbose_level,
    )

In [31]:
#training
train_data = from_sentences_dataset(sentences_en_train, sentences_fr_train, shuffle=True, seed=42)
valid_data = from_sentences_dataset(sentences_en_valid, sentences_fr_valid)

K = keras.backend
K.clear_session()
tf.random.set_seed(42)

transformer = Transformer(max_sentence_len=15)
transformer_history = adapt_compile_and_fit(transformer, train_data, valid_data, colorama_verbose=True)


Layer 'sequential' (of type Sequential) was passed an input with a mask attached to it. However, this layer does not support masking and will therefore destroy the mask information. Downstream layers will not see the mask.


Gradients do not exist for variables ['kernel', 'bias', 'kernel', 'bias', 'kernel', 'bias', 'kernel', 'bias'] when minimizing the loss. If using `model.compile()`, did you forget to provide a `loss` argument?



[1m[30mEpoch: [1m[31m01[1m[30m - [1m[30mloss: [1m[31m2.06532[1m[30m - [1m[30maccuracy: [1m[31m0.66547[1m[30m - [1m[30mval_loss: [1m[31m1.60262[1m[30m - [1m[30mval_accuracy: [1m[31m0.72230


In [49]:
def translate(model, sentence_en):
    translation = ""
    for i in range(model.max_sentence_len):
        X_encoder = np.array([sentence_en])
        X_decoder = np.array(["startofseq " + translation])
        
        y_prob = model.predict((X_encoder, X_decoder), verbose = 0)[0, i]
        print(y_prob)
        predicted_word_id = np.argmax(y_prob)
        predicted_word = model.vectorization_fr.get_vocabulary()[predicted_word_id]
        print(predicted_word)
        
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:
#Test
translation1 = translate(transformer, "Take a seat")
translation2 = translate(transformer, "I wish Tom was here.")
translation3 = translate(transformer, "She ordered him to do it.")

print(CLR + "Actual Possible Translations:")
print(BLUE + "Take a seat".ljust(25), RED + "-> ", BLUE + "Prends place !")
print(
    BLUE + "I wish Tom was here.".ljust(25),
    RED + "-> ",
    BLUE + "J'aimerais que Tom soit là.",
)
print(
    BLUE + "She ordered him to do it.".ljust(25),
    RED + "-> ",
    BLUE + "Elle lui a ordonné de le faire.",
)
print()
print(CLR + "Model Translations:")
print(BLUE + "Take a seat".ljust(25), RED + "-> ", BLUE + translation1)
print(BLUE + "I wish Tom was here.".ljust(25), RED + "-> ", BLUE + translation2)
print(BLUE + "She ordered him to do it.".ljust(25), RED + "-> ", BLUE + translation3)