## Language Translator

Dataset was taken from Kaggle and can be found [**here**](https://www.kaggle.com/jannesklaas/frenchenglish-bilingual-pairs?select=fra.txt)

In [1]:
import nltk
import pandas as pd
import numpy as np
import tensorflow
from tensorflow import keras 
import io
import re
import string
from unicodedata import normalize
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers
from sklearn.model_selection import train_test_split
from keras import regularizers, optimizers
from keras.callbacks import ModelCheckpoint   
import tensorflow as tf

In [2]:
data = pd.read_csv("fra.csv")
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [122]:
data.rename(columns={'English words/sentences': 'eng',
                   'French words/sentences': 'fre'},
          inplace=True, errors='raise')
data.head()

Unnamed: 0,eng,fre
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [123]:
print('Number of nulls in EN: {}'.format(data['eng'].isnull().sum()))
print('Number of nulls in FR: {}'.format(data['fre'].isnull().sum()))

Number of nulls in EN: 0
Number of nulls in FR: 0


In [124]:
data.shape

(175621, 2)

In [125]:
df = data.iloc[:50000,:]
df.shape

(50000, 2)

In [126]:
display(df.tail())

Unnamed: 0,eng,fre
49995,Tell us all the gossip.,Racontez-nous tous les commérages !
49996,Ten years have gone by.,Dix ans se sont écoulés.
49997,Tennis is loads of fun.,Le tennis est extrêmement divertissant.
49998,Thank you all for that.,Merci à tous pour cela.
49999,Thank you ever so much.,Grand merci.


In [127]:
df.to_csv('fra_small_v3.csv')

### Load updated dataframe

In [30]:
df = pd.read_csv("fra_small_v3.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,eng,fre
0,0,Hi.,Salut!
1,1,Run!,Cours !
2,2,Run!,Courez !
3,3,Who?,Qui ?
4,4,Wow!,Ça alors !


In [31]:
import string

def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [32]:
df['EN'] = df['eng'].apply(lambda x: remove_punct(x))
df['FR'] = df['fre'].apply(lambda x: remove_punct(x))

In [33]:
df = df.drop(columns=['Unnamed: 0','eng','fre'])
df.head()

Unnamed: 0,EN,FR
0,Hi,Salut
1,Run,Cours
2,Run,Courez
3,Who,Qui
4,Wow,Ça alors


In [34]:
import sentencepiece as sp
 
def write_trainer_file(col, filename):
    texts = list(col.values)
    with open(filename, 'w',encoding='utf-8') as f:
        for text in texts:
            f.write(text + "\n")
            
#use sentences from df as our training data
en_sp_trainer = "en_spm.txt"
fr_sp_trainer = "fr_spm.txt"
write_trainer_file(df["EN"], en_sp_trainer)
write_trainer_file(df["FR"], fr_sp_trainer)
 
#create our English SentencePiece model
sp_en_train_param = f"--input={en_sp_trainer} --model_prefix=en_sp --vocab_size=5147"
sp.SentencePieceTrainer.Train(sp_en_train_param)
en_sp = sp.SentencePieceProcessor()
en_sp.Load("en_sp.model")

#create our French SentencePiece model
sp_fr_train_param = f"--input={fr_sp_trainer} --model_prefix=nl_sp --vocab_size=8713"
sp.SentencePieceTrainer.Train(sp_fr_train_param)
fr_sp = sp.SentencePieceProcessor()
fr_sp.Load("nl_sp.model")

True

In [35]:
print(en_sp.EncodeAsPieces("This is a test"))
print(en_sp.EncodeAsIds("This is a test"))
print(en_sp.DecodeIds(en_sp.EncodeAsIds("This is a test")))

['▁This', '▁is', '▁a', '▁test']
[49, 7, 6, 1042]
This is a test


In [36]:
print(en_sp.EncodeAsPieces("I will take the subway"))
print(en_sp.EncodeAsIds("I will take the subway"))
print(en_sp.DecodeIds(en_sp.EncodeAsIds("I will take the subway")))

['▁I', '▁will', '▁take', '▁the', '▁sub', 'way']
[3, 105, 171, 14, 3253, 2543]
I will take the subway


In [37]:
def encode_sentence(df, lang, spm):
    lang_pieces = []
    lang_lens = []
    for index, row in df.iterrows():
        lang_piece = spm.EncodeAsIds(row[lang])
        lang_pieces.append(lang_piece)
        lang_lens.append(len(lang_piece)) 
    df[f"{lang}_pieces"] = lang_pieces
    df[f"{lang}_len"] = lang_lens
 
encode_sentence(df, "EN", en_sp)
encode_sentence(df, "FR", fr_sp)

In [38]:
display(df.head())
display(df.tail())

Unnamed: 0,EN,FR,EN_pieces,EN_len,FR_pieces,FR_len
0,Hi,Salut,[1946],1,[1884],1
1,Run,Cours,[1533],1,[6715],1
2,Run,Courez,[1533],1,[6782],1
3,Who,Qui,[99],1,[74],1
4,Wow,Ça alors,[2767],1,"[5, 57, 37, 1227]",4


Unnamed: 0,EN,FR,EN_pieces,EN_len,FR_pieces,FR_len
49995,Tell us all the gossip,Raconteznous tous les commérages,"[71, 284, 83, 42, 14, 1840]",6,"[2170, 4718, 102, 33, 7042, 76, 7214]",7
49996,Ten years have gone by,Dix ans se sont écoulés,"[71, 542, 813, 5, 27, 293, 270]",7,"[5264, 688, 60, 42, 2360, 3]",6
49997,Tennis is loads of fun,Le tennis est extrêmement divertissant,"[71, 4704, 7, 3357, 5, 46, 190]",7,"[65, 984, 10, 1569, 182, 5298]",6
49998,Thank you all for that,Merci à tous pour cela,"[1078, 4, 42, 61, 23]",5,"[439, 5, 17, 102, 58, 94]",6
49999,Thank you ever so much,Grand merci,"[1078, 4, 1141, 73, 159]",5,"[4144, 1384]",2


In [39]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

def plotLangLen(lang1, lang2):
    trace1 = go.Histogram(
        x=df[f"{lang1}_len"].values,
        opacity=0.75,
        name = f"Length of {lang1} sentences",
        marker=dict(color='rgba(171, 50, 96, 0.6)'))
    trace2 = go.Histogram(
        x=df[f"{lang2}_len"].values,
        opacity=0.75,
        name = f"Length of {lang2} sentences",
        marker=dict(color='rgba(12, 50, 196, 0.6)'))
 
    data = [trace1, trace2]
    layout = go.Layout(barmode='overlay',
                       title=f"Lengths of {lang1} and {lang2} sentences",
                       xaxis=dict(title='Length'),
                       yaxis=dict( title='Count'),
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, config={'showLink': True})
 
plotLangLen("EN", "FR")

In [40]:
print(en_sp.piece_to_id('__MUST_BE_UNKNOWN__'))
print(en_sp.id_to_piece(0))

0
<unk>


In [41]:
from keras.preprocessing.sequence import pad_sequences
en_vocab_size = en_sp.get_piece_size()
fr_vocab_size = fr_sp.get_piece_size()
en_max_length = df["EN_len"].max()
fr_max_length = df["FR_len"].max()
#we use 16 as length here, to shorten processing time
en_max_length=12
fr_max_length=en_max_length
#use post padding to fill up short sentence with 0
en_padded_seq = pad_sequences(df["EN_pieces"].tolist(), maxlen=en_max_length, padding='post')
fr_padded_seq = pad_sequences(df["FR_pieces"].tolist(), maxlen=fr_max_length, padding='post')
train_seq_df = pd.DataFrame( {'en_seq':en_padded_seq.tolist(), 'fr_seq':fr_padded_seq.tolist()})

In [42]:
en_padded_seq[4]

array([2767,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [43]:
display(train_seq_df.tail())
display(train_seq_df.shape)

Unnamed: 0,en_seq,fr_seq
49995,"[71, 284, 83, 42, 14, 1840, 0, 0, 0, 0, 0, 0]","[2170, 4718, 102, 33, 7042, 76, 7214, 0, 0, 0, 0, 0]"
49996,"[71, 542, 813, 5, 27, 293, 270, 0, 0, 0, 0, 0]","[5264, 688, 60, 42, 2360, 3, 0, 0, 0, 0, 0, 0]"
49997,"[71, 4704, 7, 3357, 5, 46, 190, 0, 0, 0, 0, 0]","[65, 984, 10, 1569, 182, 5298, 0, 0, 0, 0, 0, 0]"
49998,"[1078, 4, 42, 61, 23, 0, 0, 0, 0, 0, 0, 0]","[439, 5, 17, 102, 58, 94, 0, 0, 0, 0, 0, 0]"
49999,"[1078, 4, 1141, 73, 159, 0, 0, 0, 0, 0, 0, 0]","[4144, 1384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


(50000, 2)

In [44]:
def define_model(input_vocab,output_vocab, input_length,output_length,output_dim):
      model = Sequential()
      #mark_zero , set 0 as special character reserved for unknown words  
      model.add(Embedding(input_vocab, output_dim, input_length=input_length, mask_zero=True))
      model.add(LSTM(output_dim))
      #repeat the input (n) times
      model.add(RepeatVector(output_length))
    #return the full sequences
      model.add(LSTM(output_dim, return_sequences=True))     
      model.add(Dense(output_vocab, activation='softmax'))
      return model

In [45]:
train, test = train_test_split(train_seq_df, test_size=0.1, random_state = 3)

In [46]:
trainX = np.asarray(train["fr_seq"].tolist())
trainY = np.asarray(train["en_seq"].tolist())

testX = np.asarray(test["fr_seq"].tolist())
testY = np.asarray(test["en_seq"].tolist())

In [47]:
model = define_model(fr_vocab_size, en_vocab_size, fr_max_length, en_max_length, 1024)

In [49]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001), loss='sparse_categorical_crossentropy')
filename = 'nmt_model_v2'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# train model
history = model.fit(trainX, trainY,
                    epochs=15, batch_size=64, validation_split = 0.1,callbacks=[checkpoint], 
                    verbose=1)

Epoch 1/15

Epoch 00001: val_loss improved from inf to 1.63238, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 2/15

Epoch 00002: val_loss improved from 1.63238 to 1.37175, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 3/15

Epoch 00003: val_loss improved from 1.37175 to 1.19899, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 4/15

Epoch 00004: val_loss improved from 1.19899 to 1.08463, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 5/15

Epoch 00005: val_loss improved from 1.08463 to 1.01252, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 6/15

Epoch 00006: val_loss improved from 1.01252 to 0.96192, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 7/15

Epoch 00007: val_loss improved from 0.96192 to 0.94140, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 8/15

Epoch 00008: val_loss improved from 0.94140 to 0.93229, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 9/15

Epoch 00009: val_loss improved from 0.93229 to 0.92799, saving model to nmt_model_v2




INFO:tensorflow:Assets written to: nmt_model_v2\assets


INFO:tensorflow:Assets written to: nmt_model_v2\assets


Epoch 10/15

Epoch 00010: val_loss did not improve from 0.92799
Epoch 11/15

Epoch 00011: val_loss did not improve from 0.92799
Epoch 12/15

Epoch 00012: val_loss did not improve from 0.92799
Epoch 13/15

Epoch 00013: val_loss did not improve from 0.92799
Epoch 14/15

Epoch 00014: val_loss did not improve from 0.92799
Epoch 15/15

Epoch 00015: val_loss did not improve from 0.92799


In [50]:
trace1 = go.Scatter(
    y=history.history['loss'],
    name = "Training Loss",
    marker=dict(color='rgba(171, 50, 96, 0.6)'))
trace2 = go.Scatter(
    y=history.history['val_loss'],
    name = "Validation Loss",
    marker=dict(color='rgba(12, 50, 196, 0.6)'))

data = [trace1, trace2]
layout = go.Layout(title='Loss and Val_Loss in 15 Epochs',
                   xaxis=dict(title='Epoch'),
                   yaxis=dict(title='Loss'),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, config={'showLink': True})

In [51]:
model = load_model('nmt_model_v2')

In [52]:
predict_x=model.predict(testX)
classes_x=predict_x.argmax(axis=1)

In [53]:
def get_word(ids, tokenizer):
    return tokenizer.DecodeIds(list(filter(lambda a: a != 0, ids.tolist())))

test_ids = []
test_frs = []
test_ens = []
test_mts = []
for y_index in range(len(testY)): 
  test_ids.append(y_index)
  test_frs.append(get_word(testX[y_index], fr_sp))
  test_ens.append(get_word(testY[y_index], en_sp))
  test_mts.append(get_word(classes_x[y_index], en_sp))
 
predict_df = pd.DataFrame( {'id':test_ids, 'FR':test_frs, 'EN':test_ens, 'MT':test_mts})

In [54]:
pd.set_option('display.max_colwidth', 80)

In [55]:
predict_df.sample(10)

Unnamed: 0,id,FR,EN,MT
2522,2522,En avezvous terminé,Are you finished,m you I I I I you I you I I you you I I I I I you I I you I you I you I I I ...
316,316,Nous sommes des nouveauxvenus,Were newcomers,is you you I I I Is I you I you I I you you I you you I you you I I I I I yo...
3442,3442,Je partage ton avis,I share your opinion,is you you you you you you I I you you you I you I I I I I I I I I I I I I I...
2811,2811,Tom est plus vieux que moi,Tom is older than me,mssssssss I Iss I I Is Iss Is Iss I I Is I I I I Is I I I I I you I I I I I ...
1385,1385,Pourquoi devraisje y aller,Why should I go,you you you I I I I I you I you I I you I you I you I I I you I I I you I I ...
3738,3738,Posez le fusil,Put down the rifle,to you you yous yous yous you I you I you yous you you I you you you I you y...
1335,1335,Nessaye pas ça à la maison,Dont try this at home,to I Is I Iss I you I I I I you I I I I I I I I I I I I I you I I I I I I I ...
1427,1427,Le dîner est prêt,Dinners ready,Tom I I you you you I you you you you you you
4778,4778,Tu es grossière,Youre rude,m I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I I ...
4985,4985,Par ici,Walk this way,to I
