In [1]:
import string
import re
import numpy as np
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
#% matplotlib inline
pd.set_option('display.max_colwidth', 200)

ImportError: cannot import name 'pad_sequences' from 'keras.preprocessing.sequence' (c:\users\rutik\appdata\local\programs\python\python39\lib\site-packages\keras\preprocessing\sequence.py)

Defining the functions which will enable us to read the text file

In [3]:
def read_text(filename):
    # open the file
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    file.close()
    return text

In [5]:
def to_lines(text):
    sents = text.strip().split('\n')
    sents = [i.split('\t') for i in sents]
    return sents

In [6]:
data = read_text("fra.txt")
fra_eng = to_lines(data)
fra_eng = array(fra_eng)

In [None]:
fra_eng

### We're limiting our sentence pairs to 75000 for ease of training

In [None]:
fra_eng = fra_eng[:75000,:]

In [None]:
fra_eng

In [None]:
#Deleting all the punctuations in both the english and french phrases by using maketrans
fra_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,0]]
fra_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,1]]

In [None]:
fra_eng

In [None]:
#Converting the text to lower case
for i in range(len(fra_eng)):
    fra_eng[i,0] = fra_eng[i,0].lower()
    
    fra_eng[i,1] = fra_eng[i,1].lower()

In [None]:
eng_l = []
fra_l = []

# populate the language lists with sentence lengths
for i in fra_eng[:,0]:
    eng_l.append(len(i.split()))

for i in fra_eng[:,1]:
    fra_l.append(len(i.split()))

In [None]:
length_df = pd.DataFrame({'eng':eng_l, 'fra':fra_l})

In [None]:
length_df.hist(bins = 30)
plt.show()

#The plot shows frequency of occurence v/s length of phrase for both languages

In [None]:
length_df['eng'].value_counts()
#We can see that the maximum length sequence in english is 8

In [None]:
length_df['fra'].value_counts()
#We can see that the maximum length sequence in French is 14

In [None]:
#Tokenization is the process of converting each word in the vocabulary into an integer based on frequency of occurence
def tokenization(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

eng_tokenizer = tokenization(fra_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = 8
print('English Vocabulary Size: %d' % eng_vocab_size)

fra_tokenizer = tokenization(fra_eng[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1

fra_length = 14
print('French Vocabulary Size: %d' % fra_vocab_size)


In [None]:
# encode and pad sequences
#encoding means replacing each word with its corresponding number
#Padding essentially means adding zeros to make the length of every sequence equal
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq 

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(fra_eng, test_size=0.2, random_state = 12)

In [None]:
trainX = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

testX = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

## Now we'll build the Sequential model.
### The first layer is the embedding layer which projects each token in an N dimensional vector space
### LSTM is the artificial recurrent neural net architecture.
### It can not only proces past data but take feedback from future data as well.

### In the second LSTM layer, we have set return sequences as True becuase we need outputs of all hidden units and not just the last one.

In [None]:
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model

model = build_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 512)
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
model.summary()

In [None]:
filename = 'model.h1.25_sep_20'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1), 
          epochs=30, batch_size=512, 
          validation_split = 0.2,
          callbacks=[checkpoint], verbose=1)

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.show()

In [None]:
model = load_model('model.h1.25_sep_20')
preds = model.predict_classes(testX.reshape((testX.shape[0],testX.shape[1])))

In [None]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [None]:
preds_text = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0: #If it is not the first word
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):  #if the next word is same as the previous
                temp.append('')
            else:
                temp.append(t)
             
        else: #if it's not the first word
            if(t == None): #if we didn't get a valid code from dictionary 
                temp.append('')
            else:
                temp.append(t)            
        
    preds_text.append(' '.join(temp))

In [None]:
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})
pd.set_option('display.max_colwidth', 200)

In [None]:
pred_df.tail(25)

In [None]:

from nltk.translate.bleu_score import sentence_bleu
sumn = 0
for i in range(len(pred_df)):
    reference = pred_df['actual'][0]
    candidate = pred_df['predicted'][1]
    score = sentence_bleu([pred_df['actual'][i].split()],pred_df['predicted'][i].split())
    sumn+=score
    
print("The average BLEU score for the translation is {:.2f} %".format(sumn*100/len(pred_df)))

#Here we have calculated bleu score for every translation and taken an average