### This notebook trains and evaluates an LSTM model on 250,000 sentences for 20 epochs

In [1]:
import re

import numpy as np

from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model, load_model
from keras.losses import SparseCategoricalCrossentropy
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, TerminateOnNaN

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

from datasets import load_dataset

import matplotlib.pyplot as plt

# Import the Util file
import util as util

  from .autonotebook import tqdm as notebook_tqdm


### This loads and preprocesses the data, making sure that only unique sentences are stored to avoid any repetition

In [2]:
# Load the training split of the "cfilt/iitb-english-hindi" parallel corpus.
# Each entry of the "translation" column is indexed below by 'en' and 'hi'.
raw_datasets = load_dataset("cfilt/iitb-english-hindi")
data = raw_datasets["train"]["translation"]

# Run the shared util.preprocess step on every sentence (its exact
# normalisation lives in util.py and is not visible from this notebook).
eng_sen = [util.preprocess(pair['en']) for pair in data]

# The Hindi side additionally has every ASCII letter removed after
# preprocessing, so stray Latin-script characters do not reach the vocabulary.
hin_sen = [re.sub('[a-zA-Z]', '', util.preprocess(pair['hi'])) for pair in data]

# Remove duplicate sentences
eng_sen, hin_sen = util.unique_sentences(eng_sen, hin_sen)



In [3]:
# Experiment configuration, read by the cells below.
total_sentences = 250_000  # maximum number of sentence pairs to keep
max_len = 10               # word-count threshold and padded sequence length
epochs = 20                # number of training epochs passed to model.fit
val_split = 0.05           # fraction of the training data held out by model.fit for validation

In [4]:
# Collect sentence pairs until `total_sentences` pairs have been kept.
en_data, hi_data = [], []

for en, hi in zip(eng_sen, hin_sen):
    # A pair is kept when the *shorter* of its two sides has at most
    # `max_len` whitespace-separated words.
    shorter_side = min(len(en.split()), len(hi.split()))
    if shorter_side <= max_len:
        en_data.append(en)
        hi_data.append(hi)
    # Both lists grow in lock-step, so one length check covers both.
    if len(en_data) == total_sentences:
        break

# Wrap every Hindi sentence in the decoder's start/end markers.
hi_data = ['<START> ' + sentence + ' <END>' for sentence in hi_data]

### This creates the tokenizer for both languages using Keras Tokenizer

In [5]:
def _fit_tokenizer(sentences):
    """Fit a Keras Tokenizer on `sentences` and encode them.

    The tokenizer is created with no character filtering, no lower-casing
    and an empty-string out-of-vocabulary token.

    Returns:
        A `(tokenizer, sequences)` tuple, where `sequences` is the result of
        `texts_to_sequences` on the same sentences the tokenizer was fitted on.
    """
    tokenizer = Tokenizer(filters='', oov_token='', lower=False)
    tokenizer.fit_on_texts(sentences)
    return tokenizer, tokenizer.texts_to_sequences(sentences)


en_tokenizer, en_sequences = _fit_tokenizer(en_data)
hi_tokenizer, hi_sequences = _fit_tokenizer(hi_data)

# One more than the number of known words; the extra slot presumably accounts
# for index 0, which the padding/masking cells below treat as "no token".
en_vocab_size = len(en_tokenizer.word_index) + 1
hi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size:", en_vocab_size)
print("Hindi Vocab Size:", hi_vocab_size)

English Vocab Size: 78052
Hindi Vocab Size: 81041


### This sets up the data for the encoders and decoders using Keras pad_sequences 

In [6]:
# Prepare Encoder
encoder_inputs = pad_sequences(en_sequences, maxlen=max_len, padding='post')

# Prepare Decoder
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
    decoder_inputs.append(hi[:-1])
    decoder_outputs.append(hi[1:])

decoder_inputs = pad_sequences(decoder_inputs, maxlen=max_len, padding='post')
decoder_outputs = pad_sequences(decoder_outputs, maxlen=max_len, padding='post')

In [7]:
# Training and Testing split: 95%, 5%
split = int(0.95 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(237500, 10) (237500, 10) (237500, 10)


### This creates the LSTM model of 256 units with the Encoders and Decoders

In [8]:
# Define LSTM model
num_units = 256

# Encoder
inputs = Input(shape=(None,))
input_embed = Embedding(en_vocab_size, num_units, mask_zero=True)(inputs)
lstm = LSTM(num_units, activation='relu', return_sequences=True, return_state=True)
seq_output, memory_state, carry_state = lstm(input_embed)

# Decoder
targets = Input(shape=(None,))
input_embed = Embedding(hi_vocab_size, num_units, mask_zero=True)(targets)
decoder_lstm = LSTM(num_units, activation='relu', return_sequences=True, return_state=True)
seq_output, memory_state, carry_state = decoder_lstm(input_embed, initial_state=[memory_state, carry_state])

dense = Dense(hi_vocab_size, activation='softmax')
input_embed = dense(seq_output)

model = Model(inputs=[inputs, targets], outputs=input_embed)
model.summary()

loss = SparseCategoricalCrossentropy()
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            1998131   ['input_1[0][0]']             
                                                          2                                       
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            2074649   ['input_2[0][0]']         

### Fits the model and saves it after each epoch

In [None]:
# Save model after each epoch
save_model_callback = ModelCheckpoint(
    filepath='en-hi-50k',
    monitor='val_accuracy',
    mode='max'
)

model.fit(X_train, y_train, epochs=epochs, validation_split=val_split, callbacks=[save_model_callback, TerminateOnNaN()])

### Creates graphs for loss and accuracy (and for validation)

In [None]:
# Per-epoch training metrics recorded by the last model.fit() call.
history = model.history.history
loss = history['loss']
acc = history['accuracy']
epoch_count = range(1, len(loss) + 1)  # 1-based epoch numbers for the x-axis

# Visualize graph: loss as a dashed red line, accuracy as a solid blue line.
for series, line_format in ((loss, 'r--'), (acc, 'b-')):
    plt.plot(epoch_count, series, line_format)
plt.legend(['Training Loss', 'Accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Training Data')
plt.show()

In [None]:
# Per-epoch validation metrics recorded by the last model.fit() call.
val_acc = model.history.history['val_accuracy']
val_loss = model.history.history['val_loss']

# Build the x-axis here instead of reusing `epoch_count` from the training
# plot cell, so this cell works when run on its own.
val_epoch_count = range(1, len(val_loss) + 1)

# Visualize graph
plt.plot(val_epoch_count, val_acc, 'g-')
plt.plot(val_epoch_count, val_loss)
plt.legend(['Validation Accuracy', 'Validation Loss'])
plt.xlabel('Epoch')
# This figure shows validation metrics, so the axis is labelled accordingly
# (it was previously labelled 'Training Data').
plt.ylabel('Validation Data')
plt.show()

### Load the saved model

In [10]:
# Retrieve previously saved stuff
saved_model = load_model('en-hi-250k')

saved_model.summary()
inputs = saved_model.get_layer('input_1').output
_, memory_state, carry_state = saved_model.get_layer('lstm').output
targets = saved_model.get_layer('input_2').output
embedding_layer = saved_model.get_layer('embedding_1')
decoder_lstm = saved_model.get_layer('lstm_1')
dense = saved_model.get_layer('dense')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            1998131   ['input_1[0][0]']             
                                                          2                                       
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            2074649   ['input_2[0][0]']         

### This sets up the inference model that will be used when predicting the sentences

In [11]:
# Inference Model
num_units = 256
max_len = 10

# Encoder
encoder = Model(inputs, [memory_state, carry_state])

# Decoder
decoder_input_memory = Input(shape=(num_units,))
decoder_input_carry = Input(shape=(num_units,))
input_embed = embedding_layer(targets)
input_embed, decoder_output_memory, decoder_output_carry = decoder_lstm(input_embed, initial_state=[decoder_input_memory, decoder_input_carry])
input_embed = dense(input_embed)
decoder = Model([targets] + [decoder_input_memory, decoder_input_carry], 
                                [input_embed] + [decoder_output_memory, decoder_output_carry])

### This method predicts the sentence from the model

As can be seen, the decoded sentence is delimited by the `<START>` and `<END>` tokens.

In [12]:
def predict_sentence(text):
    """Greedily decode a Hindi translation for one English sentence.

    The sentence is encoded with `en_tokenizer` and passed through `encoder`
    to obtain the initial decoder states. Starting from the '<START>' token,
    `decoder` is then called one token at a time, each time taking the
    highest-scoring word and feeding it back in, until '<END>' is produced or
    `max_len + 1` words have been generated.

    Args:
        text: Source sentence as a single string.

    Returns:
        The generated words joined into one string, each preceded by a space
        (so the result starts with a space). '<END>' is included as the last
        word when the decoder produced it.
    """
    source_ids = en_tokenizer.texts_to_sequences([text])
    state_memory, state_carry = encoder.predict(source_ids)

    # (1, 1) array holding the single token fed to the decoder at each step.
    curr_token = np.zeros((1, 1))
    curr_token[0, 0] = hi_tokenizer.word_index['<START>']

    words = []
    for _ in range(max_len + 1):
        scores, state_memory, state_carry = decoder.predict(
            [curr_token, state_memory, state_carry]
        )
        token_id = np.argmax(scores[0, 0, :])
        words.append(hi_tokenizer.index_word[token_id])
        curr_token[0, 0] = token_id
        if words[-1] == '<END>':
            break

    return ''.join(' ' + word for word in words)

In [13]:
# Testing and Analysis
candidates = []
references = []

for i in range(20):
	cur_len = len(X_test[i].split())
	if cur_len <= max_len:
		print("i =", i)
		pred_sentence = predict_sentence(X_test[i])
		
		candidates.append(pred_sentence.split()[:-1])
		print("Input:", X_test[i])
		print("Prediction:", ' '.join(pred_sentence.split()[:-1]))
		print("Dataset Reference:", ' '.join(y_test[i].split()[1:-1]), "\n")

		references.append([(y_test[i].split()[1:-1])])

i = 0
Input: give your application an accessibility workout
Prediction: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
Dataset Reference: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें 

i = 1
Input: accerciser accessibility explorer
Prediction: एक्सेर्साइसर पहुंचनीयता अन्वेषक
Dataset Reference: एक्सेर्साइसर पहुंचनीयता अन्वेषक 

i = 2
Input: the default plugin layout for the bottom panel
Prediction: निचले के लिए डिफोल्ट प्लगइन
Dataset Reference: निचले पटल के लिए डिफोल्ट प्लगइन खाका 

i = 3
Input: the default plugin layout for the top panel
Prediction: ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Dataset Reference: ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका 

i = 4
Input: a list of plugins that are disabled by default
Prediction: प्लगइन के प्रकार की सूची निष्क्रिय है
Dataset Reference: उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है 

i = 5
Input: highlight duration
Prediction: अवधि को हाइलाइट रकें
Dataset Reference: अवधि को हाइलाइट रकें 

i = 6
Input: the duration of the highlight b

### Finally, computing the BLEU score

In [14]:
# Corpus-level BLEU over all collected predictions, using NLTK's smoothing
# method 7.
smoothing = SmoothingFunction().method7
bleu_score = corpus_bleu(references, candidates, smoothing_function=smoothing)
print("Final BLEU score =", bleu_score)

Final BLEU score = 0.5204079142754721
