In [1]:
#Import the required libraries
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense


Read the file that contains the English-Spanish translations 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Path to the data txt file on disk.
data_path = "/content/drive/MyDrive/Colab Notebooks/TUKL summer internship/week1/spa.txt"
# open the file eng-spa.txt and read
lines= pd.read_table(data_path,  names =['source', 'target', 'comments'])
#printing sample data from lines
lines.sample(6)

Unnamed: 0,source,target,comments
38128,It's just the opposite.,Es justo lo opuesto.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
48055,Please don't tell anyone.,"No se lo digas a nadie, por favor.",CC-BY 2.0 (France) Attribution: tatoeba.org #1...
18454,Are you a criminal?,¿Eres un criminal?,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
39985,We're having breakfast.,Estamos desayunando.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
66210,I was tired so I went to bed.,"Como estaba cansado, me fui a la cama.",CC-BY 2.0 (France) Attribution: tatoeba.org #1...
59577,A terrible fate awaited him.,Le esperaba un terrible destino.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


We apply the following text cleaning
Convert text to lower case
Remove quotes
Remove all special characters like “@, !, *, $, #, ?, %, etc.”
Clean digits from the source and target sentences. If the source or the target language use different symbols for the numbers, then remove those symbols
Remove spaces

In [4]:
# convert source and target text to Lowercase 
lines.source=lines.source.apply(lambda x: x.lower())
lines.target=lines.target.apply(lambda x: x.lower())
# Remove quotes from source and target text
lines.source=lines.source.apply(lambda x: re.sub("'", '', x))
lines.target=lines.target.apply(lambda x: re.sub("'", '', x))
# create a set of all special characters
special_characters= set(string.punctuation)
# Remove all the special characters
lines.source = lines.source.apply(lambda x: ''.join(char1 for char1 in x if char1 not in special_characters))
lines.target = lines.target.apply(lambda x: ''.join(char1 for char1 in x if char1 not in special_characters))
# Remove digits from source and target sentences
num_digits= str.maketrans('','', digits)
lines.source=lines.source.apply(lambda x: x.translate(num_digits))
lines.target= lines.target.apply(lambda x: x.translate(num_digits))
# Remove extra spaces
lines.source=lines.source.apply(lambda x: x.strip())
lines.target=lines.target.apply(lambda x: x.strip())
lines.source=lines.source.apply(lambda x: re.sub(" +", " ", x))
lines.target=lines.target.apply(lambda x: re.sub(" +", " ", x))

Add the START_ and the _END tags to the target sentences.
Adding the START_ and the _END token to the target sentences is very useful for training and during inference. These tags help to know when to start the translation and when to end the translation.

In [5]:
# Add start and end tokens to target sequences
lines.target = lines.target.apply(lambda x : 'START_ '+ x + ' _END')
lines.sample(6)

Unnamed: 0,source,target,comments
69210,describe your ideal breakfast,START_ describí tu desayuno ideal _END,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
38727,that bridge isnt long,START_ ese puente no es largo _END,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
16253,ive called twice,START_ llamé dos veces _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
63133,tom finally gave up smoking,START_ tom finalmente dejó de fumar _END,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
66179,i want you to open your eyes,START_ quiero que abras los ojos _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
40002,were very discouraged,START_ estamos muy desanimados _END,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


Create a set of unique words both for source and target language from the dataset and sort them alphabetically

In [6]:
# Find all the source and target words and sort them
# Vocabulary of Source language
all_source_words=set()
for source in lines.source:
    for word in source.split():
        if word not in all_source_words:
            all_source_words.add(word)
# Vocabulary of Target 
all_target_words=set()
for target in lines.target:
    for word in target.split():
        if word not in all_target_words:
            all_target_words.add(word)
# sort all unique source and target words
source_words= sorted(list(all_source_words))
target_words=sorted(list(all_target_words))

Find the maximum length of the source and target sentences in the dataset

In [7]:
#Find maximum sentence length in  the source and target data
source_length_list=[]
for l in lines.source:
    source_length_list.append(len(l.split(' ')))
max_source_length= max(source_length_list)
print(" Max length of the source sentence",max_source_length)
target_length_list=[]
for l in lines.target:
    target_length_list.append(len(l.split(' ')))
max_target_length= max(target_length_list)
print(" Max length of the target sentence",max_target_length)

 Max length of the source sentence 70
 Max length of the target sentence 70


Create a word to index dictionary and an index to word dictionary for all unique source and target words in the dataset.
Size of the word to vector will be based on the length of the source and target vocabulary

In [8]:
# creating a word to index(word2idx) for source and target
source_word2idx= dict([(word, i+1) for i,word in enumerate(source_words)])
target_word2idx=dict([(word, i+1) for i, word in enumerate(target_words)])

In [9]:
#creating a dictionary for index to word for source and target vocabulary
source_idx2word= dict([(i, word) for word, i in  source_word2idx.items()])
print(source_idx2word)
target_idx2word =dict([(i, word) for word, i in target_word2idx.items()])



Shuffle the data
Shuffling helps with
Reducing variance
Ensures models remain generic and overfit less
Batches between epochs do not look alike
Makes model more robust

In [10]:
#Shuffle the data
lines = shuffle(lines)

In [11]:
# Train - Test Split
X, y = lines.source, lines.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((121262,), (13474,))

Create data for training the encoder-decoder model.
We will use fit_generator() instead of the fit() method as our data is too large to fit into the memory. fit_generator() needs an underlying function to generate the data.

We create the underlying function generate_batch() for generating data in batches
The fit_generator() will accept a batch of data from the underlying function, generate_batch()

To train a sequence to sequence model, we need to create one-hot encoded data for
encoder inputs: The 2D array will be of shape (batch_size, max source sentence length). For a batch_size of 128 and a max source sentence length of 47, the shape of encoder_input will be (128,47)
decoder inputs: The 2D array will be of shape (batch_size, max target sentence length). For a batch_size of 128 and a max target sentence length of 55, the shape of decoder inputs will be (128,55)
decoder outputs: The 3D array will be of shape (batch_size, max target sentence length, number of unique words in target sentences). For a batch_size of 128 and a max target sentence length of 55, the shape of decoder output will be (128,55, 27200).
Number of unique words in the target_sentence is 27199 which we zero pad, and hence the third parameter in decoder output is 27200

In [12]:
# Input tokens for encoder
num_encoder_tokens=len(source_words)
# Input tokens for decoder zero padded
num_decoder_tokens=len(target_words) +1

In [13]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
      for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_source_length),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_target_length),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_target_length, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
              for t, word in enumerate(input_text.split()):
                encoder_input_data[i, t] = source_word2idx[word]
                for t, word in enumerate(target_text.split()):
                  if t<len(target_text.split())-1:
                    decoder_input_data[i, t] =target_word2idx[word] # decoder input seq
                    if t>0:
                       decoder_target_data[i, t - 1,target_word2idx[word]] = 1.
                       yield([encoder_input_data, decoder_input_data],decoder_target_data)

We will use Teacher Forcing to train the sequence to sequence model for faster and efficient training of the decoder.
Teacher forcing algorithm trains decoder by supplying the actual output of the previous timestamp instead of the predicted output from the last time step as inputs during training

Decoder learns to generate a word at t+1 timestep, taking into account the actual output at time step t and the encoder’s internal state; hence we offset the decoder output by one timestep

Build the sequence to sequence model
Setup basic parameters
We set the necessary parameters like
number of training samples
number of validation samples
batch_size used for creating the training data
Epochs to train on
The latent dimension of the encoding space

In [14]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50
latent_dim=256

Building the Model
Build the encoder and decoder using LSTM. The encoder will encode the input sentences of the source language. Hidden state and the cell state of the encoder will be passed as input to the decoder along with actual target sequences.

The encoder will encode the input sequence. We pass the input through the input layer. The first hidden layer will be the embedding layer. Embeddings translate large sparse vectors into a dense lower-dimensional space preserving the semantic relationships.
Pass three parameters to Embedding(); the first parameter is the size of the vocabulary; the second parameter is the dimension of the dense Embedding. We set mask_zero as True as this implies that the input value of 0 is a special “padding” value that should be masked out.
Create the LSTM layer and only set return_state to True as we want to retain the hidden state and cell state of the encoder. We discard the encoder_output and preserve the hidden state and cell state of the LSTM to be passed to the decoder


In [15]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

Build the Decoder
We create an input layer for the decoder_inputs; Embedding is again the first hidden layer in the decoder.
The LSTM layer will return output sequences as well as the internal states. The internal states will be used only during the inference phase and will not be used during the training phase.
LSTM in the decoder takes input from the embedding layer and the encoder states. We apply a softmax activation to the Dense layer and then finally generate the decoder outputs

In [16]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [17]:
# Define the model that takes encoder and decoder input 
# to output decoder_outputs
from keras.models import Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [18]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [19]:
train_samples = len(X_train) # Total Training samples
val_samples = len(X_test)    # Total validation or test samples
batch_size = 128
epochs = 5

In [20]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9de9cd5750>

In [20]:
#inference
# Encode the input sequence to get the "Context vectors"
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [24]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of 
    #target sequence with the start character.
    target_seq[0, 0] = target_word2idx['START_']
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
      # Sample a token
      sampled_token_index = np.argmax(output_tokens[0, -1, :])
      sampled_word =target_idx2word[sampled_token_index]
      decoded_sentence += ' '+ sampled_word
      if (sampled_word == '_END' or len(decoded_sentence) > 50):
        stop_condition = True
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
        return decoded_sentence

In [25]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [29]:
# Encode the input sequence to get the "Context vectors"
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [30]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', X_train[k:k+1].values[0])
print('Actual Target Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])

Input Source sentence: i was born yesterday and will die tomorrow
Actual Target Translation:  nací ayer moriré mañana 
Predicted Target Translation:  de de de de de de de de de de de de de de de d


In [31]:
test_gen = generate_batch(X_test, y_test, batch_size = 1)
k=10
k+=1
(input_seq, actual_output), _ = next(test_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', X_test[k:k+1].values[0])
print('Actual Target Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])


Input Source sentence: this dog barks a lot
Actual Target Translation:  este perro ladra mucho 
Predicted Target Translation:  no no no no no no no no no no no no no no no n


Validation not good at all and attention mechanism not implemented towards data science link