In [1]:
# importing all the necessary libraries
import numpy as np
import pandas as pd
import string

import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM, Dense,RepeatVector,Flatten

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu


## Text Processing

In [33]:
# reading text from file and converting it to lowercase
text = open('/kaggle/input/fra-eng-nmt/fra.txt', 'r', encoding='utf-8').read()
text = text.lower()

In [34]:
# printing some text given in text 
print(text[:1000])

go.	va !	cc-by 2.0 (france) attribution: tatoeba.org #2877272 (cm) & #1158250 (wittydev)
go.	marche.	cc-by 2.0 (france) attribution: tatoeba.org #2877272 (cm) & #8090732 (micsmithel)
go.	en route !	cc-by 2.0 (france) attribution: tatoeba.org #2877272 (cm) & #8267435 (felix63)
go.	bouge !	cc-by 2.0 (france) attribution: tatoeba.org #2877272 (cm) & #9022935 (micsmithel)
hi.	salut !	cc-by 2.0 (france) attribution: tatoeba.org #538123 (cm) & #509819 (aiji)
hi.	salut.	cc-by 2.0 (france) attribution: tatoeba.org #538123 (cm) & #4320462 (gillux)
run!	cours !	cc-by 2.0 (france) attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)
run!	courez !	cc-by 2.0 (france) attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)
run!	prenez vos jambes à vos cous !	cc-by 2.0 (france) attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)
run!	file !	cc-by 2.0 (france) attribution: tatoeba.org #906328 (papabear) & #2077454 (sacredceltic)
run!	filez !	cc-by 2.0 (fr

In [35]:
# splitiing text line by line and printing 1st line
sentences = text.split('\n')
print(sentences[0])

go.	va !	cc-by 2.0 (france) attribution: tatoeba.org #2877272 (cm) & #1158250 (wittydev)


In [36]:
# splitiing text
sentences = [i.split('\t')[:-1] for i in sentences]
print(sentences[0])

['go.', 'va !']


In [37]:
# punctuations used for removing from text
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [38]:
# removing punctuations
for i,l in enumerate(sentences):
    for j,s in enumerate(l):
        for p in punctuations:
            s = s.replace(p,'')
        sentences[i][j] = s
    

In [39]:
# sentences looks like after removing punctuations
sentences[350:355]

[['im hit', 'je suis touchée '],
 ['im ill', 'je suis malade'],
 ['im sad', 'je suis triste'],
 ['im sad', 'jai un coup de cafard'],
 ['im sad', 'je suis malheureux']]

In [40]:
# adding keys to decoder text so that model recognizes where to stop predicting
for i in range(len(sentences)):
    if len(sentences[i]) != 0:
        sentences[i][1] = '<start> '+sentences[i][1]+' <end>'
sentences[0:5]

[['go', '<start> va  <end>'],
 ['go', '<start> marche <end>'],
 ['go', '<start> en route  <end>'],
 ['go', '<start> bouge  <end>'],
 ['hi', '<start> salut  <end>']]

## splitting the data

In [41]:
# We use only 10% of data for training due to less resources.
train,test = train_test_split(sentences,test_size=0.90,random_state=123)

In [42]:
len(train),len(test)

(20890, 188017)

In [43]:
# splitting eng and fra train data
train_eng,train_fra = [],[]
for i in train:
    if len(i) != 0:
        train_eng.append(i[0])
        train_fra.append(i[1])

train_eng[0],train_fra[0]

('i checked the list', '<start> jai vérifié la liste <end>')

In [44]:
# splitting eng and fra test data
test_eng,test_fra = [],[]
for j in test:
    if len(j) != 0:
        test_eng.append(j[0])
        test_fra.append(j[1])
        
test_eng[0],test_fra[0]

('you had better make sure that he is at home before you call on him',
 '<start> vous devriez vous assurer quil est chez lui avant de lui rendre visite <end>')

## Data Preparation for Training

In [45]:
# creating tokenizer for converting text to numbers
tokenizer_eng = Tokenizer(oov_token='<UNKE>',filters = '')
tokenizer_fra = Tokenizer(oov_token='<UNKF>',filters = '')

In [46]:
# fitting tokenizer according to words in train data set
tokenizer_eng.fit_on_texts(train_eng)
tokenizer_fra.fit_on_texts(train_fra)

In [47]:
# total vocab size of both languages(train)
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_fra = len(tokenizer_fra.word_index) + 1
vocab_size_eng,vocab_size_fra

(6900, 12497)

In [48]:
# converting text to sequences according to tokenizer
sequences_eng = tokenizer_eng.texts_to_sequences(train_eng)
sequences_fra = tokenizer_fra.texts_to_sequences(train_fra)

# output for training a model lag by one timestep
sequences_fra_out = [i[1:] for i in sequences_fra]

In [49]:
sequences_eng[0],sequences_fra[0],sequences_fra_out[0]

([2, 1343, 5, 784], [2, 24, 2006, 10, 890, 3], [24, 2006, 10, 890, 3])

In [51]:
# getting max seq length of both languages(train)
max_seq_len_eng = max([len(seq) for seq in sequences_eng])
max_seq_len_fra = max([len(seq) for seq in sequences_fra])
max_seq_len_eng,max_seq_len_fra

(34, 49)

In [52]:
# padding to get the data of same length
padded_sequences_eng = pad_sequences(sequences_eng, maxlen = max_seq_len_eng,padding = 'post')
padded_sequences_fra = pad_sequences(sequences_fra, maxlen = max_seq_len_fra,padding = 'post')
padded_sequences_fra_out = pad_sequences(sequences_fra_out, maxlen = max_seq_len_fra,padding = 'post')

In [53]:
# Final Shape of data
print(padded_sequences_eng.shape)
print(padded_sequences_fra.shape)
print(padded_sequences_fra_out.shape)

(20890, 34)
(20890, 49)
(20890, 49)


In [54]:
# creating generator to load data to model batch by batch t
def train_generator(eng,fra,fra_oh,batch=128):
    l = len(eng)
    count=0
    while count <= l:
        eng_data = eng[count:count+batch]
        fra_data = fra[count:count+batch]
        fra_out_data = fra_oh[count:count+batch]
        # converting output data into one hot encoding
        fra_out_data = keras.utils.to_categorical(fra_out_data,num_classes = vocab_size_fra)
        count += batch
        yield [eng_data,fra_data],fra_out_data

## Model Creation and Training

In [55]:
# Model Pipiline

len_embed = 256
## encoder
# english imput data
enc_inp = Input(shape=(None,))   
# creating an embedding layer to convert data into different dimensions
enc_emb = Embedding(vocab_size_eng,len_embed,mask_zero = True)(enc_inp)  
# pass embedded text to lstm which return states
enc_lstm = LSTM(len_embed,return_state = True)

# out states of lstm
enc_out,enc_st_h,enc_st_c = enc_lstm(enc_emb)
enc_state = [enc_st_h,enc_st_c] 

In [46]:
## decoder
# framce imput data
dec_inp = Input(shape=(None,))
# creating an embedding layer to convert data into different dimensions
dec_emb_layer = Embedding(vocab_size_fra,len_embed,mask_zero=True)
dec_emb = dec_emb_layer(dec_inp)
# pass embedded text to lstm which return output
dec_lstm = LSTM(len_embed,return_sequences=True,return_state=True)
dec_out,_,_ = dec_lstm(dec_emb,initial_state = enc_state)
# passing out of decoder through dense layer
dec_dense = Dense(vocab_size_fra,activation='softmax')
dec_dense_out = dec_dense(dec_out)

# Model Completed
model = Model([enc_inp,dec_inp],dec_dense_out)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    1766400     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    3199232     ['input_2[0][0]']                
                                                                                              

In [47]:
# Compiling Model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])

In [48]:
# Training Model for 50 epochs
batch=128
for i in range(50):
    model.fit(train_generator(padded_sequences_eng,padded_sequences_fra,padded_sequences_fra_oh,batch),steps_per_epoch=len(padded_sequences_eng)//batch)



## Testing

In [52]:
# model for prediction
# for english data prediction
enc_model = Model(enc_inp,enc_state)

# taking an empty input tensor for decoder states
dec_st_h = Input(shape=(len_embed,))
dec_st_c = Input(shape=(len_embed,))
dec_st = [dec_st_h,dec_st_c]

dec_tst_emb = dec_emb_layer(dec_inp)

# passing decoder text to decoder lstm layer and saving out and states for passing it to next time for prediction
dec_tst_out,dec_tst_h,dec_tst_c = dec_lstm(dec_tst_emb,initial_state = dec_st)
dec_tst_st = [dec_tst_h,dec_tst_c]
dec_tst_den_out = dec_dense(dec_tst_out)

# decoder Model Completed
dec_model = Model([dec_inp]+dec_st, [dec_tst_den_out]+dec_tst_st)

In [65]:
# for saving model
model.save('enc_dec.h5')
dec_model.save('dec_test.h5')

In [60]:
# prediction and bleu score calculation
for i in range(10):
    print(i+1)
    predicted = '<start>'
    inp1 = [test_eng[i]]
    inp1 = tokenizer_eng.texts_to_sequences(inp1)
    inp1 = pad_sequences(inp1, maxlen = max_seq_len_eng,padding = 'post')
    inp_st = enc_model.predict(inp1,verbose=0)
    inp2 = ['<start>']
    inp2 = np.array(tokenizer_fra.texts_to_sequences(inp2))

    while True:
        out,h,c = dec_model.predict([inp2]+inp_st,verbose=0)
        inp_st = [h,c]
        out = np.argmax(out[0][-1]) 
        inp2[0][0] = out
        
        for k,v in tokenizer_fra.word_index.items():
            if out == v:
                out = k
                break
        predicted += ' ' + out

        if out == '<end>' or len(predicted.split()) > 47:
            break

    
    print('Eng :- ',test_eng[i])
    print('Fra actual :- ',test_fra[i])
    print('Fra predicted :- ',predicted)
    print('Bleu Score :- ',corpus_bleu([test_fra[i][8:-6]],[predicted[:-6]]))
                   

1
Eng :-  you had better make sure that he is at home before you call on him
Fra actual :-  <start> vous devriez vous assurer quil est chez lui avant de lui rendre visite <end>
Fra predicted :-  <start> tu ferais mieux de vous dire combien de la police était juste à la maison <end>
Bleu Score :-  0.6447788067558897
2
Eng :-  im not accustomed to eating this kind of food
Fra actual :-  <start> je ne suis pas habitué à manger ce genre de nourriture <end>
Fra predicted :-  <start> je ne suis pas qualifié mais je ne le serai pas en retard <end>
Bleu Score :-  0.6812455364200614
3
Eng :-  youre generous
Fra actual :-  <start> tu es généreux <end>
Fra predicted :-  <start> tu es fort <end>
Bleu Score :-  0.7598356856515925
4
Eng :-  i remember reading about it
Fra actual :-  <start> je me rappelle avoir lu à ce propos <end>
Fra predicted :-  <start> je me souviens avoir lu à ce sujet <end>
Bleu Score :-  0.7598356856515925
5
Eng :-  its about time
Fra actual :-  <start> il est temps  <end>
F