In [2]:
import numpy as np
import pandas as pd
import re
import string
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [4]:
# Load the English/French sentence pairs from the Kaggle dataset.
DATA_PATH = "/kaggle/input/language-translation-englishfrench/eng_-french.csv"
df = pd.read_csv(DATA_PATH)
# Replace the CSV's own header names with short language codes.
df.columns = ["en", "fr"]
# Discard any pair where either side is missing.
df = df.dropna()

In [20]:
# Preview the first 20 rows of the frame.
df.head(20)

Unnamed: 0,en,fr,clean_en,clean_fr
0,Hi.,Salut!,hi,salut
1,Run!,Cours !,run,cours
2,Run!,Courez !,run,courez
3,Who?,Qui ?,who,qui
4,Wow!,Ça alors !,wow,ça alors
5,Fire!,Au feu !,fire,au feu
6,Help!,À l'aide !,help,à l'aide
7,Jump.,Saute.,jump,saute
8,Stop!,Ça suffit !,stop,ça suffit
9,Stop!,Stop !,stop,stop


In [15]:
# Characters to strip: every ASCII punctuation mark except "-" and "'",
# which are kept because they occur inside words (e.g. "l'aide").
custom_punct = string.punctuation.replace("-", "").replace("'", "")

# The characters must be escaped before being placed in a character class.
# Unescaped, the "\]" pair inside string.punctuation is parsed as an escaped
# "]", so the backslash itself is never part of the set and survives cleaning.
_PUNCT_RE = re.compile("[" + re.escape(custom_punct) + "]")


def clean(text):
    """Lower-case ``text`` and remove ASCII punctuation except ``-`` and ``'``.

    Args:
        text: The sentence to normalise (a ``str``).

    Returns:
        The lower-cased sentence with every character of ``custom_punct``
        removed. Whitespace is left untouched, so a space that preceded a
        removed mark (as in "Cours !") remains in the result.
    """
    return _PUNCT_RE.sub("", text.lower())

In [8]:
# Add a cleaned copy of each language column next to the raw text.
for raw_col, clean_col in (("en", "clean_en"), ("fr", "clean_fr")):
    df[clean_col] = df[raw_col].apply(clean)

In [9]:
# Hold out 20% of the sentence pairs for validation. The split is seeded so
# that the fitted vocabularies, the padded arrays and the training results are
# reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_en"], df["clean_fr"], test_size=0.2, random_state=SPLIT_SEED
)

# One word-level tokenizer per language, fitted on the training split only.
# NOTE(review): the Tokenizer is used with its default settings; its default
# `filters` appear to include "-", so hyphens preserved by clean() are
# presumably split here, and words unseen during fitting are dropped by
# texts_to_sequences because no oov_token is set -- confirm this is intended.
en_tokenizer = Tokenizer()
fr_tokenizer = Tokenizer()

en_tokenizer.fit_on_texts(X_train)
fr_tokenizer.fit_on_texts(y_train)

# +1 because word indices start at 1; index 0 is the padding value written by
# pad_sequences below.
input_vocab_size = len(en_tokenizer.index_word) + 1
output_vocab_size = len(fr_tokenizer.index_word) + 1

# Convert sentences to lists of integer word ids.
X_train_sequences = en_tokenizer.texts_to_sequences(X_train)
X_test_sequences = en_tokenizer.texts_to_sequences(X_test)

y_train_sequences = fr_tokenizer.texts_to_sequences(y_train)
y_test_sequences = fr_tokenizer.texts_to_sequences(y_test)

maxlen = 55 # max length of all sentences (EN: 48, FR: 55)

# Source and target are padded/truncated (at the end) to the same length
# because the model emits one output token per input position.
X_train_pad = pad_sequences(X_train_sequences, maxlen=maxlen, truncating='post', padding="post")
X_test_pad = pad_sequences(X_test_sequences, maxlen=maxlen, truncating='post', padding="post")

y_train_pad = pad_sequences(y_train_sequences, maxlen=maxlen, truncating='post', padding="post")
y_test_pad = pad_sequences(y_test_sequences, maxlen=maxlen, truncating='post', padding="post")

# Add a trailing axis of size 1 to the targets: (samples, maxlen) -> (samples, maxlen, 1).
y_train_pad = y_train_pad.reshape(*y_train_pad.shape, 1)
y_test_pad = y_test_pad.reshape(*y_test_pad.shape, 1)

In [10]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a position-wise feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input; also used as the
            attention ``key_dim`` and as the feed-forward output width so the
            residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...);
            accepted so the layer can be rebuilt from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        The config must hold plain, serialisable values: returning the
        sub-layer objects themselves cannot be written to JSON and cannot be
        passed back to ``__init__`` when the model is loaded.
        """
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the current execution context.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence attends to itself (query == value).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position table holds; input sequences
            must not be longer than this.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...);
            accepted so the layer can be rebuilt from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only plain values are stored; the embedding sub-layers are rebuilt by
        ``__init__`` and are not themselves serialisable config entries.
        """
        config = super().get_config()
        config.update({
            'maxlen': self.maxlen,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
        })
        return config

    def call(self, x):
        """Embed the integer token ids in ``x`` and add position embeddings.

        Args:
            x: Tensor of token ids; its last axis is the sequence axis.

        Returns:
            ``x`` with a trailing ``embed_dim`` axis appended.
        """
        # Use the actual length of the incoming sequence axis.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast across the leading axes.
        return x + positions

In [11]:
# --- Hyper-parameters ---
num_heads = 3        # attention heads in the transformer block
ff_dim = 32          # hidden width of the block's feed-forward network
embedding_dim = 200  # size of each token / position embedding vector
adam = Adam(learning_rate=0.003)

# --- Architecture: embeddings -> transformer block -> per-position classifier ---
inputs = tf.keras.layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=maxlen, vocab_size=input_vocab_size, embed_dim=embedding_dim
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embed_dim=embedding_dim, num_heads=num_heads, ff_dim=ff_dim
)
x = transformer_block(x)
x = TimeDistributed(Dense(256, activation="relu"))(x)
# One softmax over the French vocabulary for every input position.
outputs = TimeDistributed(Dense(output_vocab_size, activation="softmax"))(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=adam,
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 55)]              0         
_________________________________________________________________
token_and_position_embedding (None, 55, 200)           2718600   
_________________________________________________________________
transformer_block (Transform (None, 55, 200)           495832    
_________________________________________________________________
time_distributed (TimeDistri (None, 55, 256)           51456     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 55, 28130)         7229410   
Total params: 10,495,298
Trainable params: 10,495,298
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train for three epochs, validating on the held-out split after each one.
validation_pair = (X_test_pad, y_test_pad)
history = model.fit(
    x=X_train_pad,
    y=y_train_pad,
    batch_size=128,
    epochs=3,
    verbose=1,
    validation_data=validation_pair,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
# A few hand-written English sentences to sanity-check the trained model.
samples = [
    "I am doing a project given by TechIntern",
    "How are you",
    "This is a task.",
    "Prepared a English to French translation website"
]
for sample in samples:
    # English text -> word ids -> fixed-length (post-padded) array.
    token_ids = en_tokenizer.texts_to_sequences([sample])
    padded = pad_sequences(token_ids, maxlen=maxlen, padding='post', truncating='post')
    # Take the first (only) item of the batch and pick the most probable
    # French word id at every position.
    probabilities = model.predict([padded])[0]
    pred = probabilities.argmax(axis=1)
    # Map the ids back to words.
    output_text = fr_tokenizer.sequences_to_texts([pred])[0]
    print(f"EN: {sample}")
    print(f"FR: {output_text}")
    print()

EN: I am doing a project given by TechIntern
FR: je fais un un projet à

EN: How are you
FR: comment es tu

EN: This is a task.
FR: c'est une d'une tâche

EN: Prepared a English to French translation website
FR: préparé un anglais à améliorer français le



In [19]:
# Write the trained model to disk under the file name below.
MODEL_PATH = "MachineTrans.h5"
model.save(MODEL_PATH)

In [18]:
#Code for creating website
from flask import Flask, render_template, request

app = Flask(__name__)


def translate_text(text, model):
    """Translate one English string to French with ``model``.

    The view below called this function but nothing in the notebook defined
    it, so every request would have failed with ``NameError``. The steps
    mirror the training pipeline: clean the text, convert it to word ids with
    ``en_tokenizer``, post-pad/truncate to ``maxlen``, take the most probable
    French word id at each position and map the ids back to words with
    ``fr_tokenizer``.

    Args:
        text: English text submitted by the user.
        model: Trained Keras model whose ``predict`` output has one row of
            vocabulary scores per input position.

    Returns:
        The predicted French sentence, or an empty string when ``text``
        contains nothing but whitespace/punctuation.
    """
    cleaned = clean(text)
    if not cleaned.strip():
        # Nothing to translate; skip running the model on pure padding.
        return ""
    token_ids = en_tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=maxlen, padding='post', truncating='post')
    predicted_ids = model.predict(padded)[0].argmax(axis=1)
    return fr_tokenizer.sequences_to_texts([predicted_ids])[0]


@app.route('/')
def index():
    """Serve the page containing the translation form."""
    return render_template('index.html')


@app.route('/translate', methods=['POST'])
def translate():
    """Translate the submitted ``text`` form field and render the result page."""
    text = request.form['text']
    translation = translate_text(text, model)
    return render_template('translation.html', translation=translation)


if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


In [None]:
#Content of index.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Translation Website</title>
</head>
<body>
    <h1>English to French Translation</h1>
    <!-- Posts to the /translate route, which reads the "text" form field. -->
    <form action="/translate" method="post">
        <label for="text">English text to translate:</label><br>
        <textarea id="text" name="text" rows="4" cols="50"></textarea><br>
        <input type="submit" value="Translate">
    </form>
</body>
</html>


In [None]:
#Content of translation.html:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Translation Result</title>
</head>
<body>
    <h1>Translation Result</h1>
    <!-- "translation" is the template variable passed by the /translate view. -->
    <p lang="fr">{{ translation }}</p>
    <p><a href="/">Translate another sentence</a></p>
</body>
</html>
