In [12]:
# Translation from eng to french
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
#from keras.utils.sequence import pad_sequences

from keras.layers import LSTM, Embedding, Input, Dense
from keras.models import Sequential, Model

import re
import nltk
nltk.download("stopwords")
nltk.download('punkt')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
df = pd.read_csv('/content/eng_-french.csv')
df.columns = ['English', 'French']
df.head()

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [10]:
df.tail(20)

Unnamed: 0,English,French
175601,A good theory is characterized by the fact tha...,Une bonne théorie se caractérise par le fait d...
175602,"An Earth-like planet, which is believed to hav...","Une planète semblable à la Terre, qui aurait d..."
175603,The more time you spend speaking a foreign lan...,Plus l'on passe de temps à parler une langue é...
175604,"The enquiry concluded that, despite his denial...",L'enquête conclut qu'en dépit de ses dénégatio...
175605,Roger Miller's father died when he was only on...,Le père de Roger Miller est décédé lorsqu'il a...
175606,You may not learn to speak as well as a native...,Peut-être n'apprendrez-vous pas à parler comme...
175607,And the good news is that today the economy is...,Et la bonne nouvelle est qu'aujourd'hui l'écon...
175608,E-cigarettes are being promoted as a healthy a...,La cigarette électronique est mise en avant co...
175609,It's still too hard to find a job. And even if...,C'est encore trop difficile de trouver un empl...
175610,"Even at the end of the nineteenth century, sai...","Même à la fin du dix-neuvième siècle, les mari..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   English  175621 non-null  object
 1   French   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [9]:
data = df[:]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   English  175621 non-null  object
 1   French   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


# Clean the dataset 

In [13]:
def clean_eng(text):
  text = text.lower()
  # remove all value except a-z and ?!
  text = re.sub(u"[^a-z!?,]"," ",text)
  # tokenise - word wise
  text = nltk.word_tokenize(text)
  # join text
  text = " ".join([i.strip() for i in text])
  return text

clean_eng(data.iloc[0,0])

'hi'

In [14]:
data.iloc[1,0], clean_eng(data.iloc[1,0])

('Run!', 'run !')

In [16]:
def clean_french(text):
  text = text.lower()
  # remove all value except a-z and ?!
  text = re.sub(u"[^a-zéâàçêêëôîû!?,]"," ",text)

  return text

clean_french(data.iloc[0,1])

'salut!'

In [17]:
data.iloc[2,1], clean_french(data.iloc[2,1])

('Courez\u202f!', 'courez !')

In [18]:
data.head(2)

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !


In [19]:
data['English'] = data['English'].apply(lambda txt:clean_eng(txt))
data['French'] = data['French'].apply(lambda txt:clean_french(txt))

In [20]:
data.head()

Unnamed: 0,English,French
0,hi,salut!
1,run !,cours !
2,run !,courez !
3,who ?,qui ?
4,wow !,ça alors !


In [24]:
# add <start> <end> token to decoder sentence 
data['French'] = data['French'].apply(lambda txt: f"<start> {txt} <end>")

In [25]:
data

Unnamed: 0,English,French
0,hi,<start> salut! <end>
1,run !,<start> cours ! <end>
2,run !,<start> courez ! <end>
3,who ?,<start> qui ? <end>
4,wow !,<start> ça alors ! <end>
...,...,...
175616,"top down economics never works , said obama th...",<start> l économie en partant du haut vers l...
175617,a carbon footprint is the amount of carbon dio...,<start> une empreinte carbone est la somme de ...
175618,death is something that we re often discourage...,<start> la mort est une chose qu on nous décou...
175619,since there are usually multiple websites on a...,<start> puisqu il y a de multiples sites web s...


In [None]:
# Tokenization and Build Vocabulary

In [28]:
# english tokenization
english_tokenize = Tokenizer(filters="#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n")
english_tokenize.fit_on_texts(data['English'])

In [29]:
english_tokenize

<keras.preprocessing.text.Tokenizer at 0x7fbed7816700>

In [30]:
num_encoder_tokens = len(english_tokenize.word_index)
num_encoder_tokens

13905

In [32]:
encoder = english_tokenize.texts_to_sequences(data['English'])
encoder[:5]

[[2752], [417, 124], [417, 124], [76, 5], [3489, 124]]

In [33]:
max_encoder_sequence_len = np.max([len(enc) for enc in encoder])
max_encoder_sequence_len

47

In [34]:
# french tokenization
french_tokenize = Tokenizer(filters="#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n")
french_tokenize.fit_on_texts(data['French'])

In [35]:
num_decoders_tokens = len(french_tokenize.word_index)
num_decoders_tokens

24129

In [36]:
decoder = french_tokenize.texts_to_sequences(data['French'])
decoder[:5]

[[2, 15399, 1],
 [2, 551, 40, 1],
 [2, 4807, 40, 1],
 [2, 46, 6, 1],
 [2, 38, 381, 40, 1]]

In [37]:
max_decoder_sequence_len = np.max([len(dec) for dec in decoder])
max_decoder_sequence_len

61

In [41]:
idx_2_txt_decoder = {k:i for i , k in french_tokenize.word_index.items()}
idx_2_txt_decoder[1]

'end'

In [43]:
idx_2_txt_encoder = {k:i for i , k in english_tokenize.word_index.items()}
idx_2_txt_encoder[2]

'you'

In [44]:
idx_2_txt_decoder[0] = "<pad>"
idx_2_txt_encoder[0] = "<pad>"

In [48]:
# pad sequence is required in both encoder and decoder

from keras.preprocessing import sequence
from keras.utils import pad_sequences
encoder_seq = pad_sequences(encoder, maxlen=max_encoder_sequence_len, padding="post")
encoder_seq

array([[2752,    0,    0, ...,    0,    0,    0],
       [ 417,  124,    0, ...,    0,    0,    0],
       [ 417,  124,    0, ...,    0,    0,    0],
       ...,
       [ 607,    8,   99, ...,    0,    0,    0],
       [ 361,   47,   27, ...,    0,    0,    0],
       [  65,  276,   76, ...,    6, 1100, 1448]], dtype=int32)

In [49]:
encoder_seq.shape

(175621, 47)

In [50]:
decoder_inp = pad_sequences(decoder, maxlen=max_decoder_sequence_len, padding="post")
decoder_inp.shape

(175621, 61)

# Build LSTM Model

In [59]:
from keras.layers import Activation #SpatialDropuout1D
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
# Encoder model
encoder_input = Input(shape=(None,), name="encoder_input_layer")
encoder_embedding = Embedding(num_encoder_tokens, 300, input_length=max_encoder_sequence_len,
                              name="encoder_embedding_layer")(encoder_input)
encoder_lstm = LSTM(256,activation='tanh', return_sequences=True, return_state=True,
                    name="encoder_lstm1_layer")(encoder_embedding)
encoder_lstm2 = LSTM(256,activation='tanh', return_sequences=True, return_state=True,
                    name="encoder_lstm2_layer")(encoder_lstm)

_, state_h, state_c = encoder_lstm2
encoder_states = [state_h, state_c]

In [60]:
# Decoder model
decoder_input = Input(shape=(None,), name="decoder_input_layer")
decoder_embedding = Embedding(num_decoders_tokens, 300, input_length=max_decoder_sequence_len,
                              name="decoder_embedding_layer")(decoder_input)
decoder_lstm = LSTM(256,activation='tanh', return_sequences=True, return_state=True,
                    name="decoder_lstm1_layer")
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state= encoder_states)
decoder_dense = Dense(num_decoders_tokens +1, activation='softmax',name='decoder_final_layer')
outputs = decoder_dense(decoder_output)

In [61]:
model =Model([encoder_input , decoder_input], outputs)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_layer (InputLaye  [(None, None)]      0           []                               
 r)                                                                                               
                                                                                                  
 encoder_embedding_layer (Embed  (None, None, 300)   4171500     ['encoder_input_layer[0][0]']    
 ding)                                                                                            
                                                                                                  
 decoder_input_layer (InputLaye  [(None, None)]      0           []                               
 r)                                                                                         

In [65]:
model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
history = model.fit([encoder_seq, decoder_inp],decoder_output, epochs=1, batch_size=64)



TypeError: ignored