In [5]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2021-08-18 13:37:14--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 172.67.186.54, 104.21.92.44, 2606:4700:3030::6815:5c2c, ...
Connecting to www.manythings.org (www.manythings.org)|172.67.186.54|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6451478 (6.2M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2021-08-18 13:37:14 (8.56 MB/s) - ‘fra-eng.zip.1’ saved [6451478/6451478]



In [6]:
!unzip fra-eng.zip

Archive:  fra-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: _about.txt              
replace fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: fra.txt                 


In [7]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [8]:
path_to_file="fra.txt"

In [9]:
#to convert unicode to ascii
def unicode_to_ascii(s):
    text = s.encode('utf-8').decode('utf-8')
    return text
#preprcessing with turning specific characters
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w    

In [10]:
en_sentence = u"Here's my account number."
hn_sentence = u"Voici mon numéro de compte."
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(hn_sentence))

<start> here's my account number . <end>
<start> voici mon numéro de compte . <end>


In [11]:
"""
1.Remove the accents
2.clean the sentences
3.Return the sentense in this seq[English,French]
"""
def create_dataset(path,num_examples):
    lines=io.open(path,encoding='UTF-8').read().strip().split('\n')
    
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    
    return zip(*word_pairs)

In [12]:
en,fr,un = create_dataset(path_to_file,None)
print(en[-1])
print(fr[-1])

<start> it may be impossible to get a completely error-free corpus due to the nature of this kind of collaborative effort . however , if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning , we might be able to minimize errors . <end>
<start> il est peut-être impossible d'obtenir un corpus complètement dénué de fautes , étant donnée la nature de ce type d'entreprise collaborative . cependant , si nous encourageons les membres à produire des phrases dans leurs propres langues plutôt que d'expérimenter dans les langues qu'ils apprennent , nous pourrions être en mesure de réduire les erreurs . <end>


In [13]:
def tokenize(lang):
    lang_tokenizer=tf.keras.preprocessing.text.Tokenizer(
        filters=''
    )
    lang_tokenizer.fit_on_texts(lang)
    tensors = lang_tokenizer.texts_to_sequences(lang)
    tensors = tf.keras.preprocessing.sequence.pad_sequences(tensors,padding='post')
    
    return tensors,lang_tokenizer

In [14]:
#defning the load datase function that will carry all the above three functions:)
def load_dataset(path,num_examples=None):
    inp_lang,targ_lang,unwanted = create_dataset(path,num_examples)
    
    input_tensor,input_language_tokenizer = tokenize(inp_lang)
    target_tensor,targ_lang_tokenizer = tokenize(targ_lang)
    
    return input_tensor,target_tensor,input_language_tokenizer,targ_lang_tokenizer

In [41]:
"""
limiting the examples so that training can be faster
there are >100000 sentences in data set to compile we are selecting
50000 but compromising the the quality 
TODO change the num example to None in releasing patterns
"""
#taking 50000 samples for fast traning
num_examples=50000
input_tensor,target_tensor,inp_lang,targ_lang = load_dataset(path_to_file,num_examples)

#max length of target tensors
max_length_targ,max_length_inp = target_tensor.shape[1],input_tensor.shape[1]
print(max_length_targ)
print(max_length_inp)
print(inp_lang.word_index)
print(targ_lang)

18
11
<keras_preprocessing.text.Tokenizer object at 0x7f247342ce50>


In [42]:
#creating bacthes for training and validation 
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

40000 40000 10000 10000


In [43]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [18]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
117 ----> what's
11 ----> the
726 ----> message
5 ----> ?
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
136 ----> quel
12 ----> est
14 ----> le
3934 ----> message 
5 ----> ?
2 ----> <end>


In [19]:
BUFFER_SIZE=len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedin_dim = 256
units=1024
vocab_inp_size=len(inp_lang.word_index)+1
vocab_tar_size=len(targ_lang.word_index)+1
print(vocab_inp_size)
print(vocab_tar_size)
print(BUFFER_SIZE)


dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train,target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE,drop_remainder=True)

6109
14075
40000


In [20]:
example_input_batch,example_target_batch = next(iter(dataset))
example_input_batch.shape,example_target_batch.shape

(TensorShape([64, 11]), TensorShape([64, 18]))

In [21]:
class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,enc_units,batch_sz):
        super(Encoder,self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedin_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        
    def call(self,x,hidden):
        #x is our input
        x=self.embedding(x)
        output,state = self.gru(x,initial_state=hidden)
        return output,state
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz,self.enc_units))

In [22]:
encoder = Encoder(vocab_inp_size,embedin_dim,units,BATCH_SIZE)

In [23]:
#sample to check the layers are working 
sample_hidden = encoder.initialize_hidden_state()
sample_output,sample_hidden = encoder(example_input_batch,sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 11, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


# I am Using the Bahdanau Attention for encoding the parameters are:-
* FC = Fully connected (dense) layer
* EO = Encoder output
* H = hidden state
* X = input to the decoder
* The pseudo code for each is:-
* 1)score = FC(tanh(FC(EO) + FC(H)))
* 2)attention weights = softmax(score, axis = 1)
* 3)context vector = sum(attention weights * EO, axis = 1)
* 4)embedding output = It is got from the input passed to the Decoder Embedding Layer.
* 5)merged vector = concat(embedding output, context vector)
* note:-
* This merged vector is then passed to gru layer as hidden state
* note:-
* axis=1 is there because we have to make change accross the max_len field so we took axis 1

In [24]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self,units):
        super(BahdanauAttention,self).__init__()
        self.w1=tf.keras.layers.Dense(units)
        self.w2=tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self,query,values):
        #we will be doing it to get addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query,1)
        
        #shape of the score will be(batchsize,maxlength,1)
        #1 is because it is passing through the final dense layer having units ==1
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.w1(query_with_time_axis)+self.w2(values)))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score,axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector=attention_weights * values
        context_vector=tf.reduce_sum(context_vector,axis=1)
        
        return context_vector,attention_weights

In [25]:
attention_layer = BahdanauAttention(10)
attention_result,attention_weights=attention_layer(sample_hidden,sample_output)
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 11, 1)


In [26]:
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,dec_units,batch_sz):
        super(Decoder,self).__init__()
        self.batch_sz=batch_sz
        self.dec_units=dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru=tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.fc=tf.keras.layers.Dense(
            vocab_size
        )
        
        #used for attention
        self.attention = BahdanauAttention(self.dec_units)
    def call(self,x,hidden,enc_output):
        context_vector,attention_weights=self.attention(hidden,enc_output)
        
        x=self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output,state = self.gru(x)
        output=tf.reshape(output,(-1,output.shape[2]))
        
        x = self.fc(output)
        
        return x,state,attention_weights

In [27]:
decoder = Decoder(vocab_tar_size, embedin_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 14075)


In [28]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,reduction='none'
)
def loss_function(real,pred):
    mask = tf.math.logical_not(tf.math.equal(real,0))
    loss_ = loss_object(real,pred)
    
    mask = tf.cast(mask,dtype=loss_.dtype)
    loss_ *=mask
    
    return loss_

In [29]:
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [30]:
@tf.function
def train_step(inp,targ,enc_hidden):
    loss=0
    #gradient tape is typically diffrention for complex tensors
    with tf.GradientTape() as tape:
        enc_output,enc_hidden=encoder(inp,enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        
        #using teacher forcing method to feed the target as next input
        for t in range(1,targ.shape[1]):
            #passing the enc_output to the decoder
            predictions,dec_hidden,_=decoder(dec_input,dec_hidden,enc_output)
            
            loss +=loss_function(targ[:,t],predictions)
            
            dec_input = tf.expand_dims(targ[:,t],1)
        batch_loss = (loss/int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss,variables)
        
        optimizer.apply_gradients(zip(gradients,variables))
        return batch_loss

In [31]:
EPOCHS=10
for epoch in range(EPOCHS):
    start =time.time()
    
    enc_hidden = encoder.initialize_hidden_state()
    total_loss=0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp,targ,enc_hidden)
        total_loss+=batch_loss
        if batch %100==0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy()}')
    if(epoch+1)%2==0:
        checkpoint.save(file_prefix=checkpoint_prefix)
        
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')    

Epoch 1 Batch 0 Loss [5.837497  2.6531856 4.2450824 4.245537  2.122713  3.1840444 3.1837907
 3.7147496 3.7148361 1.5922009 3.7153466 3.1842995 3.1844156 3.7149591
 2.1229002 3.1841407 3.7154355 2.1230056 3.7153313 2.6535425 3.714708
 3.1840699 2.6534212 4.2453117 5.307643  3.7152667 3.7149036 2.6535983
 4.7762322 3.1847517 3.714926  2.6536958 3.7145674 2.653354  2.1225865
 4.2459936 2.6530106 3.7150438 1.5923212 3.1841862 2.6538885 3.1838129
 3.1839268 2.6533082 3.7149022 3.183662  3.1840181 2.6535897 4.2454233
 2.6532261 3.184066  2.653236  3.1838372 1.5922056 3.7148726 3.1840603
 3.7149134 2.6534066 2.6534772 3.1839325 3.7145076 2.653475  2.1227167
 3.184739 ]
Epoch 1 Batch 100 Loss [1.4213924  2.116158   1.4574189  1.0247284  1.4611018  1.103369
 1.5620304  0.94940263 1.518529   1.4542809  2.183061   1.1603969
 1.6230484  1.5375245  1.0882508  2.3778205  1.5599217  1.2781733
 1.296487   1.5415001  1.4695408  1.6050098  1.2004621  1.4313941
 1.852975   1.7475029  1.7176414  1.9606192

In [44]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ,max_length_inp))
    
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(" ")]
    inputs=tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
        #storing the attention weights
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    
    return result, sentence, attention_plot
    

In [45]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [46]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [47]:
#restore to the latest checkpoint
print(tf.train.latest_checkpoint(checkpoint_dir))
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

training_checkpoints/ckpt-5


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f24cb6b3ed0>

In [48]:
translate(u'i love you.')

Input: <start> i love you . <end>
Predicted translation: je t'aime ! <end> 


In [49]:
translate(u"I'm not going anywhere.")

Input: <start> i'm not going anywhere . <end>
Predicted translation: je ne vais pas manger . <end> 


In [50]:
translate(u'You are a good person.')

Input: <start> you are a good person . <end>
Predicted translation: tu es une bonne . <end> 


In [51]:
translate(u'Did you miss me?')

Input: <start> did you miss me ? <end>
Predicted translation: tu me manques loupé ? <end> 


In [52]:
translate(u'He came running.')

Input: <start> he came running . <end>
Predicted translation: il vint en courant . <end> 
