## Language Translator

Dataset was taken from Kaggle and can be found [**here**](https://www.kaggle.com/jannesklaas/frenchenglish-bilingual-pairs?select=fra.txt)

In [2]:
import nltk
import pandas as pd
import numpy as np
import tensorflow
from tensorflow import keras 
import io
import re
import string
from unicodedata import normalize
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv("fra.csv")
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [4]:
data.rename(columns={'English words/sentences': 'EN',
                   'French words/sentences': 'FR'},
          inplace=True, errors='raise')
data.head()

Unnamed: 0,EN,FR
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [5]:
print('Number of nulls in EN: {}'.format(data['EN'].isnull().sum()))
print('Number of nulls in FR: {}'.format(data['FR'].isnull().sum()))

Number of nulls in EN: 0
Number of nulls in FR: 0


In [5]:
data.shape

(175621, 2)

In [6]:
df = data.iloc[:20000,:]
df.shape

(20000, 2)

In [7]:
display(df.tail())

Unnamed: 0,EN,FR
19995,It's a pipe dream.,C'est un projet chimérique.
19996,It's a rented car.,C'est une voiture de location.
19997,It's a small town.,C'est une petite ville.
19998,It's a true story.,C'est une histoire vraie.
19999,It's all I can do.,C'est tout ce que je peux faire.


### Text Preprocessing

In [8]:
import string

punctuations = string.punctuation

print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [None]:
df['En_clean'] = df['EN'].apply(lambda x: remove_punct(x))
df['Fr_clean'] = df['FR'].apply(lambda x: remove_punct(x))

In [None]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df['En_tokenized'] = df['En_clean'].apply(lambda x: tokenize(x.lower()))
df['Fr_tokenized'] = df['Fr_clean'].apply(lambda x: tokenize(x.lower()))

df.head()

In [12]:
import sentencepiece as sp
 
def write_trainer_file(col, filename):
    texts = list(col.values)
    with open(filename, 'w',encoding='utf-8') as f:
        for text in texts:
            f.write(text + "\n")
            
#use sentences from train_df as our training data
en_sp_trainer = "en_spm.txt"
fr_sp_trainer = "fr_spm.txt"
write_trainer_file(df["EN"], en_sp_trainer)
write_trainer_file(df["FR"], fr_sp_trainer)
 
#create our English SentencePiece model
sp_en_train_param = f"--input={en_sp_trainer} --model_prefix=en_sp --vocab_size=3207"
sp.SentencePieceTrainer.Train(sp_en_train_param)
en_sp = sp.SentencePieceProcessor()
en_sp.Load("en_sp.model")

#create our French SentencePiece model
sp_fr_train_param = f"--input={fr_sp_trainer} --model_prefix=nl_sp --vocab_size=5454"
sp.SentencePieceTrainer.Train(sp_fr_train_param)
fr_sp = sp.SentencePieceProcessor()
fr_sp.Load("nl_sp.model")

True

In [13]:
print(en_sp.EncodeAsPieces("This is a test."))
print(en_sp.EncodeAsIds("This is a test."))
print(en_sp.DecodeIds(en_sp.EncodeAsIds("This is a test.")))

['▁Thi', 's', '▁is', '▁a', '▁test', '.']
[72, 6, 18, 8, 1510, 3]
This is a test.


In [14]:
def encode_sentence(df, lang, spm):
    lang_pieces = []
    lang_lens = []
    for index, row in df.iterrows():
        lang_piece = spm.EncodeAsIds(row[lang])
        lang_pieces.append(lang_piece)
        lang_lens.append(len(lang_piece)) 
    df[f"{lang}_pieces"] = lang_pieces
    df[f"{lang}_len"] = lang_lens
 
encode_sentence(df, "EN", en_sp)
encode_sentence(df, "FR", fr_sp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{lang}_pieces"] = lang_pieces
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{lang}_len"] = lang_lens


In [None]:
df.tail()

In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

def plotLangLen(lang1, lang2):
    trace1 = go.Histogram(
        x=df[f"{lang1}_len"].values,
        opacity=0.75,
        name = f"Length of {lang1} sentences",
        marker=dict(color='rgba(171, 50, 96, 0.6)'))
    trace2 = go.Histogram(
        x=df[f"{lang2}_len"].values,
        opacity=0.75,
        name = f"Length of {lang2} sentences",
        marker=dict(color='rgba(12, 50, 196, 0.6)'))
 
    data = [trace1, trace2]
    layout = go.Layout(barmode='overlay',
                       title=f"Lengths of {lang1} and {lang2} sentences",
                       xaxis=dict(title='Length'),
                       yaxis=dict( title='Count'),
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, config={'showLink': True})
 
plotLangLen("EN", "FR")

In [None]:
en_vocab_size = en_sp.get_piece_size()
fr_vocab_size = fr_sp.get_piece_size()
print(f"EN vocab size: {en_vocab_size}")
print(f"FR vocab size: {fr_vocab_size}")

In [None]:
from keras.preprocessing.sequence import pad_sequences

en_vocab_size = en_sp.get_piece_size()
fr_vocab_size = fr_sp.get_piece_size()

en_max_length = df["EN_len"].max()
fr_max_length = df["FR_len"].max()

#we use 30 as length here, to shorten processing time
en_max_length=30
fr_max_length=en_max_length

#use post padding to fill up short sentence with 0
en_padded_seq = pad_sequences(df["EN_pieces"].tolist(), maxlen=en_max_length, padding='post')
fr_padded_seq = pad_sequences(df["FR_pieces"].tolist(), maxlen=fr_max_length, padding='post')
train_seq_df = pd.DataFrame( {'en_seq':en_padded_seq.tolist(), 'fr_seq':fr_padded_seq.tolist()})

In [None]:
display(train_seq_df.head())
display(train_seq_df.tail())

In [None]:
def define_model(input_vocab,output_vocab, input_length,output_length,output_dim):
      model = Sequential()
      #mark_zero , set 0 as special character reserved for unknown words  
      model.add(Embedding(input_vocab, output_dim, input_length=input_length, mask_zero=True))
      model.add(LSTM(output_dim))
      #repeat the input (n) times
      model.add(RepeatVector(output_length))
    #return the full sequences
      model.add(LSTM(output_dim, return_sequences=True))
      #model.add(TimeDistributed(Dense(output_vocab, activation='softmax')))
      
      model.add(Dense(output_vocab, activation='softmax'))
      return model

In [None]:
train_seq_df.shape

In [None]:
train, test = train_test_split(train_seq_df, test_size=0.1, random_state = 3)

In [None]:
trainX = np.asarray(train["fr_seq"].tolist())
trainY = np.asarray(train["en_seq"].tolist())

testX = np.asarray(test["fr_seq"].tolist())
testY = np.asarray(test["en_seq"].tolist())

In [None]:
model = define_model(fr_vocab_size, en_vocab_size, fr_max_length, en_max_length, 1024)

In [None]:
from keras import regularizers, optimizers
from keras.callbacks import ModelCheckpoint   
import tensorflow as tf

# ---------------------------------------------------------------------------------------------------------------

In [None]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=0.001), loss='sparse_categorical_crossentropy')

In [None]:
def encode_output(sequences, vocab_size):
   ylist = list()
   for sequence in sequences:
    encoded = to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [None]:
filename = 'nmt_model'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# train model
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=15, batch_size=64, validation_split = 0.1,callbacks=[checkpoint], 
                    verbose=1)

In [None]:
train, test = train_test_split(train_seq_df, test_size=0.1, random_state = 3)
trainX = np.asarray(train["fr_seq"].tolist())
trainY = np.asarray(train["en_seq"].tolist())
testX = np.asarray(test["fr_seq"].tolist())
testY = np.asarray(test["en_seq"].tolist())
#sparse_categorical_crossentropy for densed target output as integers
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001), loss='sparse_categorical_crossentropy')
filename = 'nmt_model'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# train model
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=15, batch_size=64, validation_split = 0.1,callbacks=[checkpoint], 
                    verbose=1)