<a href="https://colab.research.google.com/github/VardanDavtyan/ML-DL/blob/main/EnglishFrenchTranslator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
import string
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Read the sentence pairs and give the two columns short language-code names.
df = pd.read_csv("eng_-french.csv").set_axis(["en", "fr"], axis=1)
df

Unnamed: 0,en,fr
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [None]:
# Show rows 10-19 by position: a quick look at short sentence pairs.
df.iloc[10:20]

Unnamed: 0,en,fr
10,Stop!,Arrête-toi !
11,Wait!,Attends !
12,Wait!,Attendez !
13,Go on.,Poursuis.
14,Go on.,Continuez.
15,Go on.,Poursuivez.
16,Hello!,Bonjour !
17,Hello!,Salut !
18,I see.,Je comprends.
19,I try.,J'essaye.


In [None]:
# ASCII punctuation that gets stripped. Hyphens and apostrophes are removed from
# the set so they stay in the text.
custom_punct = string.punctuation.replace("-", "").replace("'", "")

# The set must be escaped before it is placed inside [...]: unescaped, the "\]"
# pair is read as an escaped "]" (so a backslash is never removed) and the bare
# "[" triggers a "possible nested set" FutureWarning.
_PUNCT_RE = re.compile("[" + re.escape(custom_punct) + "]")


def clean(text):
    """Lower-case ``text`` and delete every character listed in ``custom_punct``.

    Only ASCII punctuation from ``string.punctuation`` is removed; hyphens,
    apostrophes and non-ASCII marks are left in place.

    Args:
        text: The sentence to normalise (a ``str``).

    Returns:
        The lower-cased string with the punctuation characters removed.
    """
    return _PUNCT_RE.sub("", text.lower())

In [None]:
# Add a normalised copy of each language column next to the raw text.
df["clean_en"] = df["en"].map(clean)
df["clean_fr"] = df["fr"].map(clean)

df

Unnamed: 0,en,fr,clean_en,clean_fr
0,Hi.,Salut!,hi,salut
1,Run!,Cours !,run,cours
2,Run!,Courez !,run,courez
3,Who?,Qui ?,who,qui
4,Wow!,Ça alors !,wow,ça alors
...,...,...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç...",top-down economics never works said obama the ...,« l'économie en partant du haut vers le bas ça...
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...,a carbon footprint is the amount of carbon dio...,une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...,death is something that we're often discourage...,la mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...,since there are usually multiple websites on a...,puisqu'il y a de multiples sites web sur chaqu...


In [None]:
# Fixed seed so the train/test split -- and therefore both vocabularies, which are
# fitted on the training part only -- is identical on every run.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_en"], df["clean_fr"], test_size=0.2, random_state=SEED
)

# One tokenizer per language, fitted on training sentences only.
# NOTE(review): Tokenizer's default `filters` include "-", so hyphens that
# clean() keeps are still split here -- confirm this is intended.
en_tokenizer = Tokenizer()
fr_tokenizer = Tokenizer()

en_tokenizer.fit_on_texts(X_train)
fr_tokenizer.fit_on_texts(y_train)

# +1 because word indices start at 1; index 0 is left free for padding.
input_vocab_size = len(en_tokenizer.index_word) + 1
output_vocab_size = len(fr_tokenizer.index_word) + 1

# Convert every sentence to a list of word indices.
X_train_sequences = en_tokenizer.texts_to_sequences(X_train)
X_test_sequences = en_tokenizer.texts_to_sequences(X_test)

y_train_sequences = fr_tokenizer.texts_to_sequences(y_train)
y_test_sequences = fr_tokenizer.texts_to_sequences(y_test)

In [None]:
maxlen = 55 # max length of all sentences (EN: 44, FR: 55)

# Every split is padded and truncated at the end to the same fixed length.
pad_kwargs = dict(maxlen=maxlen, padding="post", truncating="post")

X_train_pad = pad_sequences(X_train_sequences, **pad_kwargs)
X_test_pad = pad_sequences(X_test_sequences, **pad_kwargs)

# Targets get a trailing axis of size 1, i.e. shape (samples, maxlen, 1).
y_train_pad = pad_sequences(y_train_sequences, **pad_kwargs)[..., np.newaxis]
y_test_pad = pad_sequences(y_test_sequences, **pad_kwargs)[..., np.newaxis]

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input; also the output size of
            the feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer in the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        # Kept as attributes so get_config() can serialise them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # NOTE(review): key_dim is the per-head size, so every head is embed_dim
        # wide here. Left unchanged so previously saved weights still load.
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

    def call(self, inputs, training=None):
        """Apply attention and feed-forward sub-layers to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to both dropout layers. Defaults to ``None`` so
                the layer can be called without passing it explicitly.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        # NOTE(review): no attention mask is passed, so padded positions take
        # part in attention -- confirm whether masking is wanted.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Stored so get_config() can serialise the constructor arguments.
        self.maxlen, self.vocab_size, self.embed_dim = maxlen, vocab_size, embed_dim
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            'maxlen': self.maxlen,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
        }

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions run 0 .. (length of the last axis of x) - 1.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a dict produced by ``get_config``."""
        return cls(**config)

In [None]:
# --- Hyper-parameters ---
num_heads = 3        # number of attention heads
ff_dim = 32          # hidden layer size of the feed-forward network inside the transformer
embedding_dim = 200  # width of the token / position embeddings
adam = Adam(learning_rate=0.003)

# --- Architecture: embeddings -> transformer block -> per-position classifier ---
inputs = tf.keras.layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=maxlen, vocab_size=input_vocab_size, embed_dim=embedding_dim
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embed_dim=embedding_dim, num_heads=num_heads, ff_dim=ff_dim
)
x = transformer_block(x)
x = TimeDistributed(Dense(units=256, activation="relu"))(x)
# A softmax over the French vocabulary at every position.
outputs = TimeDistributed(Dense(units=output_vocab_size, activation="softmax"))(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=adam,
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 55)]              0         
                                                                 
 token_and_position_embeddi  (None, 55, 200)           2723400   
 ng_3 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformer_block_3 (Trans  (None, 55, 200)           495832    
 formerBlock)                                                    
                                                                 
 time_distributed_6 (TimeDi  (None, 55, 256)           51456     
 stributed)                                                      
                                                                 
 time_distributed_7 (TimeDi  (None, 55, 28188)         7244

In [58]:
# Train for one epoch; the held-out split is used as validation data.
history = model.fit(
    x=X_train_pad,
    y=y_train_pad,
    batch_size=128,
    epochs=1,
    verbose=1,
    validation_data=(X_test_pad, y_test_pad),
)



Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7cbd24f1a830>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7cbd24f1a830>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


In [59]:
from tensorflow.keras.models import save_model

# Write the trained model to a .h5 file in the working directory.
save_model(model, filepath="en_to_fr_model.h5")

  save_model(model, "en_to_fr_model.h5")


In [62]:
# Offer the saved model file as a browser download.
# NOTE(review): google.colab is presumably only available inside Colab -- this cell will not run elsewhere.
from google.colab import files
files.download("en_to_fr_model.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [60]:
from tensorflow.keras.models import load_model

# The two custom layer classes are passed by name so the saved model can be rebuilt.
model = load_model(
    "en_to_fr_model.h5",
    custom_objects={
        'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
        'TransformerBlock': TransformerBlock,
    },
)

In [61]:
samples = [
    "turn on the light",
    "i love you",
    "good morning",
    "In the morning i went to the school and then to the work"
]

# Normalise the inputs with the same clean() used for the training data, so that
# inference sees text in the form the tokenizer was fitted on.
sample_sequences = en_tokenizer.texts_to_sequences([clean(sample) for sample in samples])
sample_pad = pad_sequences(sample_sequences, maxlen=maxlen, padding='post', truncating='post')

# One batched predict call for all sentences instead of one call per sentence.
# argmax over the last axis picks the most probable French word id per position.
pred_ids = model.predict(sample_pad).argmax(axis=-1)
output_texts = fr_tokenizer.sequences_to_texts(pred_ids.tolist())

for sample, output_text in zip(samples, output_texts):
    print("EN: " + sample)
    print("FR: " + output_text)
    print()

Cause: Unable to locate the source code of <function Model.make_predict_function.<locals>.predict_function at 0x7cbd249fdcf0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_predict_function.<locals>.predict_function at 0x7cbd249fdcf0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
EN: turn on the light
FR: allume le la

EN: i love you
FR: j'adore t'aime

EN: good morning
FR: bonne

EN: In the morning i went to the school and then to the work
FR: au le matin matin à à à l'école à à

