In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spa.txt


In [2]:
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

# Reading the text file of Spanish-English pairs:-

In [3]:
lines=pd.read_table("/kaggle/input/spa.txt",names=['input','target','comments'])
lines.head(10)

Unnamed: 0,input,target,comments
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
5,Run!,¡Corre!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
6,Run!,¡Corran!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
7,Run!,¡Corra!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
8,Run!,¡Corred!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
9,Run.,Corred.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [4]:
lines=lines[['input','target']]
lines

Unnamed: 0,input,target
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
124320,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
124321,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
124322,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
124323,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


# We have separated the English and Spanish Sentences..

In [5]:
lines.sample(5)

Unnamed: 0,input,target
98091,The seventh day of the week is Saturday.,El séptimo día de la semana es el sábado.
42601,I told you I'm not ready.,Te dije que no estoy listo.
11095,I don't know you.,No te conozco.
14120,I hope she's safe.,Espero que esté a salvo.
111671,May I talk with you in private about the matter?,¿Puedo hablarte a solas acerca del asunto?


# Now preprocessing the statements:-

In [6]:
def preprocess_sentence(sentence):
    
    num_digits= str.maketrans('','', digits)
    
    sentence= sentence.lower()
    sentence= re.sub(" +", " ", sentence)
    sentence= re.sub("'", '', sentence)
    sentence= sentence.translate(num_digits)
    sentence= re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = sentence.rstrip().strip()
    sentence=  'START_ ' + sentence + ' _END'
    
    return sentence

In [7]:
lines['input']=lines['input'].apply(preprocess_sentence)

In [8]:
lines['input']

0                                          START_ go . _END
1                                          START_ go . _END
2                                          START_ go . _END
3                                          START_ go . _END
4                                          START_ hi . _END
                                ...                        
124320    START_ there are four main causes of alcohol-r...
124321    START_ there are mothers and fathers who will ...
124322    START_ a carbon footprint is the amount of car...
124323    START_ since there are usually multiple websit...
124324    START_ if you want to sound like a native spea...
Name: input, Length: 124325, dtype: object

In [9]:
lines['target']=lines['target'].apply(preprocess_sentence)

In [10]:
lines['target']

0                                          START_ ve . _END
1                                        START_ vete . _END
2                                        START_ vaya . _END
3                                      START_ váyase . _END
4                                        START_ hola . _END
                                ...                        
124320    START_ hay cuatro causas principales de muerte...
124321    START_ hay madres y padres que se quedan despi...
124322    START_ una huella de carbono es la cantidad de...
124323    START_ como suele haber varias páginas web sob...
124324    START_ si quieres sonar como un hablante nativ...
Name: target, Length: 124325, dtype: object

In [11]:
len(lines)

124325

In [12]:
rows = lines.to_numpy().tolist()

In [13]:
rows = np.array(rows)
rows

array([['START_ go . _END', 'START_ ve . _END'],
       ['START_ go . _END', 'START_ vete . _END'],
       ['START_ go . _END', 'START_ vaya . _END'],
       ...,
       ['START_ a carbon footprint is the amount of carbon dioxide pollution that we produce as a result of our activities .  some people try to reduce their carbon footprint because they are concerned about climate change . _END',
        'START_ una huella de carbono es la cantidad de contaminación de dióxido de carbono que producimos como producto de nuestras actividades .  algunas personas intentan reducir su huella de carbono porque están preocupados acerca del cambio climático . _END'],
       ['START_ since there are usually multiple websites on any given topic ,  i usually just click the back button when i arrive on any webpage that has pop-up advertising .  i just go to the next page found by google and hope for something less irritating . _END',
        'START_ como suele haber varias páginas web sobre cualquier tem

# Made English Spanish Pairs..

In [14]:
english=[]
for i in lines['input']:
    english.append(i)

In [15]:
len(english)

124325

In [16]:
spanish=[]
for i in lines['target']:
    spanish.append(i)

In [17]:
len(spanish)

124325

# Creating the input and target tokens:-

In [18]:
input_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
input_sentence_tokenizer.fit_on_texts(english)
input_array = input_sentence_tokenizer.texts_to_sequences(english)
input_array= tf.keras.preprocessing.sequence.pad_sequences(input_array,padding='post' )

In [19]:
input_array

array([[    1,    45,     3, ...,     0,     0,     0],
       [    1,    45,     3, ...,     0,     0,     0],
       [    1,    45,     3, ...,     0,     0,     0],
       ...,
       [    1,     9,  4168, ...,     0,     0,     0],
       [    1,   382,    49, ...,     0,     0,     0],
       [    1,    68,     7, ..., 13628,     3,     2]], dtype=int32)

In [20]:
target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(spanish)
target_array = target_sentence_tokenizer.texts_to_sequences(english)
target_array= tf.keras.preprocessing.sequence.pad_sequences(target_array,padding='post',maxlen=30)

In [21]:
target_array

array([[    1, 20921,     3, ...,     0,     0,     0],
       [    1, 20921,     3, ...,     0,     0,     0],
       [    1, 20921,     3, ...,     0,     0,     0],
       ...,
       [    1,     7,  4211, ...,     0,     0,     0],
       [    1,    18, 21985, ...,     0,     0,     0],
       [    1, 26229,     7, ...,     0,     0,     0]], dtype=int32)

In [22]:
print(len(target_array[0]))

30


In [23]:
max_target_length= max(len(t) for t in  target_array)
print(max_target_length)
max_source_length= max(len(t) for t in  input_array)
print(max_source_length)

30
51


# Padding the sentences to a certain length::-

In [24]:
input_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
input_sentence_tokenizer.fit_on_texts(english)
input_array = input_sentence_tokenizer.texts_to_sequences(english)
input_array= tf.keras.preprocessing.sequence.pad_sequences(input_array,padding='post',maxlen=20)

In [25]:
target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_sentence_tokenizer.fit_on_texts(spanish)
target_array = target_sentence_tokenizer.texts_to_sequences(english)
target_array= tf.keras.preprocessing.sequence.pad_sequences(target_array,padding='post',maxlen=30)

# Train Test Split:-

In [26]:
input_train, input_val, target_train, target_val = train_test_split(input_array, target_array, test_size=0.2)

print(len(input_train), len(target_train), len(input_val), len(target_val))

99460 99460 24865 24865


# 80-20 Split

In [27]:
print(len(input_sentence_tokenizer.word_index)+1)

13629


In [28]:
print(len(target_sentence_tokenizer.word_index)+1)

26832


When the dataset is big, we want to create the dataset in memory to be efficient. We will use tf.data.Dataset.from_tensor_slices() method to get slices of the array in the form of an object.

In [29]:
buffer_size = len(input_train)
batch_size = 64
steps_per_epoch = len(input_train)//batch_size
embedding_dim = 256
units = 1024
vocab_input_size = len(input_sentence_tokenizer.word_index)+1
vocab_target_size = len(target_sentence_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(buffer_size)
dataset = dataset.batch(batch_size, drop_remainder=True)
# print(type(dataset))

# Encoder-Decoder Architecture:-

In [30]:
class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_size,units,batchsize):
        super(Encoder, self).__init__()
        self.batchsize=batchsize
        self.units=units
        self.embedding=tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru=tf.keras.layers.GRU(self.units,return_sequences=True,return_state=True)# entire sequence of outputs will be returned from all the units.
        #To return the internal state of GRU, we set the return_state to True
    
    def call(self,y,hidden):
        y=self.embedding(y)
        output,state=self.gru(y,initial_state=hidden)
        return output,state

    def initialize_hidden_state(self):
        return tf.zeros((self.batchsize, self.units))

In [31]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 20]), TensorShape([64, 30]))

In [32]:
encoder = Encoder(vocab_input_size, embedding_dim, units,batch_size )

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 20, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


# Creating a Bahdanau Attention Layer:-

Attention layer consists:-

Alignment Score

Attention weights

Context vector

In [33]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(Attention,self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    
    # The encoder hiiden states are taken as input to the attention layer which are of shape (batch size, units)
    # and the encoder output of each timestep is of shape (batch size, sequence length, units).
    # so for adding we have to expand dimensions
    def call(self,encoder_out,encoder_hid):
        
       hidden1=tf.expand_dims(encoder_hid,1)
        
        
       # score shape == (batch_size, max_length, 1)
       # we get 1 at the last axis because we are applying score to self.V
       # the shape of the array before applying self.V is (batch_size, max_length, units)
       score = self.V(tf.nn.tanh(
          self.W1(encoder_out) + self.W2(hidden1)))

       # attention_weights shape == (batch_size, max_length, 1)
       attention_weights = tf.nn.softmax(score, axis=1) ## the alignment scores for each encoder hidden state
        #are combined and represented in a single vector and subsequently softmaxed

       # context_vector shape after sum == (batch_size, hidden_size)
       context_vector = attention_weights * encoder_out ## attention weights multiplied with the encoder output states are used to calculate the context vactor
        
       context_vector = tf.reduce_sum(context_vector, axis=1)

       return context_vector, attention_weights ## returning the context vector and the attention_weights

In [34]:
attention_layer = Attention(5)# 5 for units of attention
attention_result, attention_weights = attention_layer(sample_output,sample_hidden)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 20, 1)


# The context vector should be of the shape of (batch size, units) as it be combined with the decoder previous embeddings..

# Decoder Class:-

In [56]:
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_size,units,batchsize):
        super(Decoder, self).__init__()
        self.batchsize=batchsize
        self.units=units
        self.embedding=tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru=tf.keras.layers.GRU(self.units,return_sequences=True,return_state=True)# entire sequence of outputs will be returned from all the units.
        #To return the internal state of GRU, we set the return_state to True
        
        #fully connected layer for the decoder outputs
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # attention layer
        self.attention=Attention(self.units)
        
    def call(self, x, enc_output,hidden):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(enc_output,hidden)
#         print(context_vector.shape)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
#         print(x.shape)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)## context vaector is added with the previous decoder hidden state.
       
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights       

In [57]:
decoder = Decoder(vocab_target_size, embedding_dim, units, batch_size)

sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)),
                                      sample_output, sample_hidden)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

(64, 1024)
(64, 1, 256)
Decoder output shape: (batch_size, vocab size) (64, 26832)


# Defining the Optimiser and Loss Function:-

In [59]:
optimizer=tf.keras.optimizers.RMSprop()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)