Your Task is to:
1. Build a Neural Machine Translation model.
2. Evaluate your model using BLEU score.

Dataset: http://www.manythings.org/anki/fra-eng.zip

Notes
1. Refer to the notes for hints.

## Downloading the ZIP file

In [1]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2022-03-19 22:17:09--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 2606:4700:8d7e:7d56:abdd:10:b19e:a55c, 172.67.186.54, 104.21.92.44
Connecting to www.manythings.org (www.manythings.org)|2606:4700:8d7e:7d56:abdd:10:b19e:a55c|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6532197 (6.2M) [application/zip]
Saving to: 'fra-eng.zip'

     0K .......... .......... .......... .......... ..........  0% 3.93M 2s
    50K .......... .......... .......... .......... ..........  1%  709K 5s
   100K .......... .......... .......... .......... ..........  2%  762K 6s
   150K .......... .......... .......... .......... ..........  3% 1.32M 6s
   200K .......... .......... .......... .......... ..........  3% 7.12M 5s
   250K .......... .......... .......... .......... ..........  4% 1.54M 5s
   300K .......... .......... .......... .......... ..........  5% 4.62M 4s
   350K .......... .......... .......... .......... ....

## Extracting the Zip file

In [2]:
import zipfile
# Extract every member of the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile('fra-eng.zip') as zip_:
    zip_.extractall()

## Import required Dependencies

In [3]:
import re
import string
from pickle import dump, load
from unicodedata import normalize

import tensorflow as tf
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.models import Sequential, load_model
from keras.utils.vis_utils import plot_model
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from numpy import argmax, array
from numpy.random import rand, seed, shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
smoothie = SmoothingFunction().method4

## Loading the file and reading the content of the file

In [4]:
# load file into memory

def load_file(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The decoded text of the file.
    """
    # `with` guarantees the handle is closed even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

## Splitting the sentence into pairs

In [5]:
# split a loaded document into sentences

def splitting_sentence(doc):
    """Turn the raw corpus text into a list of tab-separated field lists.

    Leading/trailing whitespace of the whole document is stripped, the text is
    cut on newlines, and every line is cut on tab characters.

    Args:
        doc: full text of the corpus file.

    Returns:
        A list with one entry per line; each entry is the list of that line's fields.
    """
    pairs = []
    for line in doc.strip().split('\n'):
        pairs.append(line.split('\t'))
    return pairs

## Cleaning the pairs

In [6]:
# cleaning a list of sentences and creating pairs

def clean_pairs(sentences):
    """Normalise every sentence of every row and return the result as a numpy array.

    Each sentence is reduced to ASCII (accents dropped), lower-cased, stripped of
    punctuation and non-printable characters, and tokens that are not purely
    alphabetic (e.g. numbers, or tokens left empty) are discarded.

    Args:
        sentences: iterable of rows, each row being an iterable of strings.
            Every string in a row is cleaned, whatever its position.

    Returns:
        ``numpy`` array built from the list of cleaned rows.
    """
    # matches any character outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes all ASCII punctuation
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(text):
        # decompose accented characters, then drop everything that is not ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punctuation)
            token = non_printable.sub('', token)
            # isalpha() is False for empty tokens and for tokens containing digits
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean(text) for text in row] for row in sentences])

## Saving the Cleaned data

In [7]:
def saving_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: object to serialise (the notebook passes numpy arrays of cleaned pairs).
        filename: destination path; an existing file is overwritten.
    """
    # the context manager flushes and closes the file before the confirmation is printed
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print(filename,': Saved')

## Saving data in .pkl format

In [8]:
# load dataset

filename = 'fra.txt'
doc = load_file(filename)

# split into english-french pairs
pairs = splitting_sentence(doc)

# clean sentences
# The result gets its own name: assigning it back to `clean_pairs` would replace the
# function with an array and make this cell fail the second time it is run.
cleaned_pairs = clean_pairs(pairs)

# save clean pairs to file
saving_clean_data(cleaned_pairs, 'english-french.pkl')

print('English','-->',"French")
# spot check (slicing keeps this safe when fewer than 25 rows are available)
for row in cleaned_pairs[:25]:
    print(row[0],'-->',row[1])

english-french.pkl : Saved
English --> French
go --> va
go --> marche
go --> bouge
hi --> salut
hi --> salut
run --> cours
run --> courez
run --> prenez vos jambes a vos cous
run --> file
run --> filez
run --> cours
run --> fuyez
run --> fuyons
run --> cours
run --> courez
run --> prenez vos jambes a vos cous
run --> file
run --> filez
run --> cours
run --> fuyez
run --> fuyons
who --> qui
wow --> ca alors
duck --> a terre
duck --> baissetoi


## Loading the cleaned data

In [9]:
# load a clean dataset
def loading_cleaned_data(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
    with open(filename, 'rb') as handle:
        return load(handle)

In [10]:
# load the full cleaned corpus back from disk and report its (rows, columns) shape
data = loading_cleaned_data('english-french.pkl')
print(data.shape) 

(192341, 3)


### Reducing the dataset size (sub-sampling)

### Size

 1.Dataset - 20000

 2.Training - 18000

 3.Testing - 2000

In [11]:
# reducing dataset size (sub-sampling)

new_data_size = 20000
# copy the slice: shuffling a view would silently reorder the first rows of `data` too
dataset = data[:new_data_size, :].copy()

# randomly shuffling the dataset to get proper training and testing data;
# the fixed seed makes the train/test split identical on every Restart & Run All
seed(42)
shuffle(dataset)

# splitting into training and testing (90%-10%)
train_size = len(dataset) * 9 // 10  # 18000 when the dataset holds 20000 rows
train, test = dataset[:train_size], dataset[train_size:]

# saving the cleaned data,train data and test data 
saving_clean_data(dataset, 'english-french-both.pkl')
saving_clean_data(train, 'english-french-train.pkl')
saving_clean_data(test, 'english-french-test.pkl')

english-french-both.pkl : Saved
english-french-train.pkl : Saved
english-french-test.pkl : Saved


In [12]:
# reload the three pickled splits (full subset, training part, testing part) from disk
dataset, train, test = map(
    loading_cleaned_data,
    ('english-french-both.pkl', 'english-french-train.pkl', 'english-french-test.pkl'),
)

## Creating a tokenizer for the lines and finding the maximum length phrase

In [13]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` whose vocabulary has been fitted on ``lines``.

    Args:
        lines: iterable of sentences (strings) to build the vocabulary from.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest number of whitespace-separated words found in any line.

    Args:
        lines: iterable of sentences (strings).

    Returns:
        Word count of the longest sentence, or 0 when ``lines`` is empty.
    """
    # default=0 avoids the ValueError max() raises on an empty iterable
    return max((len(line.split()) for line in lines), default=0)

## Size of English & French vocabulary and their max phrase length

In [14]:
# preparing the english tokenizer (column 0 of the dataset holds the English sentences)

eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 leaves room for id 0: Keras word ids start at 1 and 0 is the value used for padding
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# longest English sentence, in words; used as the padded target length
eng_length = max_length(dataset[:, 0])

print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# preparing the french tokenizer (column 1 of the dataset holds the French sentences)

fra_tokenizer = create_tokenizer(dataset[:, 1])
# same +1 convention as for the English vocabulary
fra_vocab_size = len(fra_tokenizer.word_index) + 1
# longest French sentence, in words; used as the padded source length
fra_length = max_length(dataset[:, 1])
print('French Vocabulary Size: %d' % fra_vocab_size)
print('French Max Length: %d' % (fra_length))

English Vocabulary Size: 3418
English Max Length: 5
French Vocabulary Size: 6977
French Max Length: 11


## Encoding to integers and padding to the maximum phrase length

In [15]:
# Input and Output sequence must be encoded to integers and padded to the maximum phrase length
def encode_sequences(tokenizer, length, lines):
    """Convert sentences to integer id sequences, zero-padded at the end to ``length``.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: number of time steps every returned sequence must have.
        lines: iterable of sentences (strings) to encode.

    Returns:
        The padded sequences as produced by ``pad_sequences`` (one row per sentence).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

# One hot encoding to max phrase length
def one_hot_encoding(sequences, vocab_size):
    """One-hot encode every word id of every padded sequence.

    Args:
        sequences: 2-D array of integer word ids, one padded sequence per row.
        vocab_size: number of classes (width of each one-hot vector).

    Returns:
        Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    per_sequence = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return array(per_sequence).reshape(n_rows, n_steps, vocab_size)

## Training and Testing Data

In [16]:
# training split: French id sequences as input, one-hot English sequences as target
trainX = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = one_hot_encoding(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

# testing split, encoded exactly like the training split
testX = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = one_hot_encoding(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [17]:
# sanity check: X is (samples, fra_length) word ids, Y is (samples, eng_length, eng_vocab_size) one-hot
print('training size:',trainX.shape,trainY.shape)
print('testing size:',testX.shape,testY.shape)

training size: (18000, 11) (18000, 5, 3418)
testing size: (2000, 11) (2000, 5, 3418)


## Building the model

In [18]:
def model_building(source_vocab, target_vocab, source_len, target_len, units):
    """Assemble the (uncompiled) encoder-decoder translation model.

    Args:
        source_vocab: size of the source vocabulary (input dimension of the embedding).
        target_vocab: size of the target vocabulary (width of the softmax output).
        source_len: length of the padded source sequences.
        target_len: length of the padded target sequences.
        units: embedding size, also used as the width of both LSTM layers.

    Returns:
        A ``Sequential`` model: Embedding -> LSTM -> RepeatVector -> LSTM -> TimeDistributed(Dense).
    """
    layers = [
        # id 0 is treated as padding and masked
        Embedding(source_vocab, units, input_length=source_len, mask_zero=True),
        # encoder: returns only its final output
        LSTM(units),
        # repeat the encoder output once per target time step
        RepeatVector(target_len),
        # decoder: emits one vector per target time step
        LSTM(units, return_sequences=True),
        # softmax over the target vocabulary at every time step
        TimeDistributed(Dense(target_vocab, activation='softmax')),
    ]
    return Sequential(layers)

## Defining and Compiling the model

In [19]:
# French is the source language, English the target; 512 is passed as `units`
model = model_building(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 512)
# categorical cross-entropy matches the one-hot encoded targets; the metric is registered under the name 'acc'
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['acc'])

## Model Summary

In [20]:
# summary() prints the table itself and returns None, so wrapping it in print() only adds a stray "None"
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 11, 512)           3572224   
                                                                 
 lstm (LSTM)                 (None, 512)               2099200   
                                                                 
 repeat_vector (RepeatVector  (None, 5, 512)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 512)            2099200   
                                                                 
 time_distributed (TimeDistr  (None, 5, 3418)          1753434   
 ibuted)                                                         
                                                                 
Total params: 9,524,058
Trainable params: 9,524,058
Non-

In [21]:
# Stop training early once the validation accuracy stops improving:
#   min_delta=0.01 : a gain in val_acc smaller than 0.01 does not count as an improvement
#   patience=5     : training stops after 5 consecutive epochs without such an improvement
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_acc',patience= 5,min_delta=0.01)

## Fitting the model

1. Epochs = 50 (an upper bound; early stopping may end training sooner)

2. Batch size = 25

In [22]:
# fit model
# NOTE(review): the test split is also passed as validation data, so early stopping is steered by the
# same sentences that are later used for the BLEU evaluation; consider a separate validation split.
model.fit(trainX, trainY, epochs= 50, batch_size=25, validation_data=(testX, testY), verbose=2,callbacks=[es])

Epoch 1/50
720/720 - 194s - loss: 3.6283 - acc: 0.4596 - val_loss: 3.1087 - val_acc: 0.5107 - 194s/epoch - 269ms/step
Epoch 2/50
720/720 - 190s - loss: 2.7595 - acc: 0.5493 - val_loss: 2.5955 - val_acc: 0.5834 - 190s/epoch - 264ms/step
Epoch 3/50
720/720 - 190s - loss: 2.2167 - acc: 0.6061 - val_loss: 2.2646 - val_acc: 0.6197 - 190s/epoch - 264ms/step
Epoch 4/50
720/720 - 190s - loss: 1.7816 - acc: 0.6512 - val_loss: 2.0586 - val_acc: 0.6447 - 190s/epoch - 264ms/step
Epoch 5/50
720/720 - 190s - loss: 1.4156 - acc: 0.6977 - val_loss: 1.8979 - val_acc: 0.6661 - 190s/epoch - 264ms/step
Epoch 6/50
720/720 - 196s - loss: 1.0952 - acc: 0.7466 - val_loss: 1.7770 - val_acc: 0.6898 - 196s/epoch - 272ms/step
Epoch 7/50
720/720 - 202s - loss: 0.8333 - acc: 0.7957 - val_loss: 1.7125 - val_acc: 0.7027 - 202s/epoch - 280ms/step
Epoch 8/50
720/720 - 222s - loss: 0.6288 - acc: 0.8377 - val_loss: 1.6763 - val_acc: 0.7135 - 222s/epoch - 308ms/step
Epoch 9/50
720/720 - 214s - loss: 0.4815 - acc: 0.8712 -

<keras.callbacks.History at 0x20ac64bd160>

## Evaluating model and calculating BLEU Score

Evaluation involves two steps:

1. Generating a translated output sequence for a single input, and

2. Repeating this process for many input examples and summarizing the skill of the model across all of them.

In [23]:
# mapping integer to a word
def word_for_id(integer, tokenizer):
    """Return the word whose vocabulary id is ``integer``, or None if there is none.

    Args:
        integer: word id to look up (plain or numpy integer).
        tokenizer: object exposing ``word_index`` (word -> id) and, optionally,
            ``index_word`` (id -> word).

    Returns:
        The matching word, or None when the id is not in the vocabulary
        (e.g. the padding id 0).
    """
    # Fast path: use the reverse mapping when the tokenizer provides one, so each
    # lookup is a dictionary access instead of a scan over the whole vocabulary.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)
    # Fallback: linear search through the word -> id mapping
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [24]:
# generating target given source sequence
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sentence into a target-language string.

    Args:
        model: trained model; ``model.predict(source)`` must yield one probability
            vector per target time step for the first (only) sample.
        tokenizer: target-language tokenizer used to map ids back to words.
        source: encoded source sentence with a leading batch dimension.

    Returns:
        The predicted words joined by single spaces; decoding stops at the first
        id that has no word (such as the padding id).
    """
    step_probabilities = model.predict(source, verbose=0)[0]
    words = []
    for probabilities in step_probabilities:
        # most likely word id at this time step
        word = word_for_id(argmax(probabilities), tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

In [25]:
# evaluating the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source sentence and print corpus-level BLEU scores.

    Args:
        model: trained model handed to ``predict_sequence``.
        tokenizer: target-language tokenizer used to decode the predictions.
        sources: 2-D array of encoded source sentences, one per row.
        raw_dataset: rows aligned with ``sources``; column 0 holds the reference
            (target) sentence and column 1 the source sentence.

    Prints the first 10 (source, target, predicted) triples followed by the
    BLEU-1 to BLEU-4 scores. Returns None.
    """
    # reference translations (targets) and the model's predicted translations
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # add the batch dimension expected by model.predict
        source = source.reshape((1, source.shape[0]))

        # decode with the tokenizer supplied by the caller rather than a global one
        translation = predict_sequence(model, tokenizer, source)

        raw_src, raw_target = raw_dataset[i][1], raw_dataset[i][0]

        # First 10 Predictions
        if i < 10:
            print('source = ',raw_src,'<--->', ' target = ',raw_target,'<--->','  predicted = ',translation)

        # corpus_bleu expects a list of reference token lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())

    # calculating BLEU score
    print('-------------------------------------------')
    print('BLEU Score :')
    # cumulative n-gram weights; each tuple sums to 1 (BLEU-3 uses exact thirds)
    bleu_weights = (
        (1.0, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1 / 3, 1 / 3, 1 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    for order, weights in enumerate(bleu_weights, start=1):
        score = corpus_bleu(actual, predicted, weights=weights,
                            smoothing_function=smoothie, auto_reweigh=False)
        print('BLEU score-%d: %f' % (order, score))


## Evaluating Model on training data

In [26]:
# BLEU on the training split, i.e. on sentences the model was fitted on
evaluate_model(model,eng_tokenizer,trainX,train)

source =  on se voit demain <--->  target =  see you tomorrow <--->   predicted =  see you tomorrow
source =  je vais aller a heures <--->  target =  i will go at <--->   predicted =  i will go
source =  recommencez <--->  target =  start over <--->   predicted =  start over
source =  prenez de la pizza <--->  target =  have some pizza <--->   predicted =  have some pizza
source =  cetait net <--->  target =  it was clean <--->   predicted =  it was clean
source =  tom vient juste de venir <--->  target =  tom just came <--->   predicted =  tom just came
source =  cest un tenancier de bar <--->  target =  hes a bartender <--->   predicted =  hes a bartender
source =  tom a bien combattu <--->  target =  tom fought well <--->   predicted =  tom fought well
source =  emmene la cle <--->  target =  bring the key <--->   predicted =  bring the key
source =  nous sommes populaires <--->  target =  were popular <--->   predicted =  were popular
source =  cest mon destin <--->  target =  this

## Evaluating Model on testing data

In [27]:
# BLEU on the test split; note these sentences were also used as validation data during fit
evaluate_model(model, eng_tokenizer, testX, test)

source =  puisje avoir un calin <--->  target =  can i have a hug <--->   predicted =  can i go this
source =  descends <--->  target =  come on down <--->   predicted =  get down
source =  jai demande a tom <--->  target =  i asked tom <--->   predicted =  i called tom
source =  je vous ferai un proces <--->  target =  ill sue you <--->   predicted =  i will you you
source =  cest faisable <--->  target =  thats doable <--->   predicted =  this is doable
source =  je vous ai desobei <--->  target =  i disobeyed you <--->   predicted =  i disobeyed you
source =  estce un elan <--->  target =  is it an elk <--->   predicted =  is that a a
source =  je vous en prie <--->  target =  you go first <--->   predicted =  i beg you
source =  ne le niez pas <--->  target =  dont deny it <--->   predicted =  dont deny it
source =  les hommes sont des idiots <--->  target =  men are idiots <--->   predicted =  men are pigs
source =  je leur souhaite bonne chance <--->  target =  i wish them luck <