### Loading weights to confirm translation quality

In [2]:
import copy
import json
import os
import random
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Load the English/Spanish sentence pairs; the first CSV column is used as the index.
# A forward slash is used in the path: the previous 'data\eng_spa.csv' contained the
# invalid escape sequence '\e' and only resolved to a sub-directory on Windows.
df = pd.read_csv('data/eng_spa.csv',names= ['English','Spanish'],index_col =0)

In [4]:
df.head()

Unnamed: 0,English,Spanish
0,Go.,[start] Ve. [end]
1,Go.,[start] Vete. [end]
2,Go.,[start] Vaya. [end]
3,Go.,[start] Váyase. [end]
4,Hi.,[start] Hola. [end]


In [5]:
len(df)

118964

In [6]:
# Remove the literal "[start]" / "[end]" markers that wrap every Spanish sentence.
# str.lstrip/str.rstrip take a *set of characters*, not a prefix/suffix, so
# lstrip('[start]') would also eat leading letters such as 's', 't', 'a', 'r' of the
# sentence itself whenever no space follows the marker. Anchored regexes remove
# exactly the marker and nothing else (surrounding spaces are kept, as before).
df['Spanish'] = (
    df['Spanish']
    .str.replace(r'^\[start\]', '', regex=True)
    .str.replace(r'\[end\]$', '', regex=True)
)
df['Spanish']

0                                                      Ve. 
1                                                    Vete. 
2                                                    Vaya. 
3                                                  Váyase. 
4                                                    Hola. 
                                ...                        
118959     Hay cuatro causas principales de muertes rela...
118960     Hay madres y padres que se quedan despiertos ...
118961     Una huella de carbono es la cantidad de conta...
118962     Como suele haber varias páginas web sobre cua...
118963     Si quieres sonar como un hablante nativo, deb...
Name: Spanish, Length: 118964, dtype: object

In [7]:
# Pair every English sentence with its Spanish translation as (english, spanish) tuples.
data = list(zip(df['English'],df['Spanish']))
# Seed the global RNG right before shuffling so the order (and therefore the
# train/validation split taken from it below) is the same on every run.
random.seed(42)
random.shuffle(data)
# Show the first five shuffled pairs.
data[:5]

[('How long have you been studying Hungarian?',
  ' ¿Cuánto tiempo has estado estudiando húngaro? '),
 ('Do you really want to be here?', ' ¿Realmente querés estar acá? '),
 ('She is as beautiful as Snow White.', ' Ella es bella como Blancanieves. '),
 ("There are few men who don't know that.",
  ' Hay pocos hombres que no lo saben. '),
 ('Tom changes channels during commercials.',
  ' Tom cambia de canal durante los comerciales. ')]

In [8]:
# Hold out the last 20% of the shuffled pairs for validation; the rest is training data.
n_pairs = len(data)
len_val = int(0.2 * n_pairs)
len_train = n_pairs - len_val
train_pairs, val_pairs = data[:len_train], data[len_train:]

print(f"{len(data)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

118964 total pairs
95172 training pairs
23792 validation pairs


### Vectorizing the data

In [9]:
# Maximum vocabulary size (max_tokens) for both TextVectorization layers; also passed
# as the vocabulary size when the Encoder/Decoder are instantiated further down.
vocab_size = 12000
# NOTE(review): not referenced by any cell visible here (the vectorizers are ragged
# and their output_sequence_length arguments are commented out) — confirm before removing.
sequence_length = 20
# Number of sentence pairs per batch; read as a global by generate_dataset().
batch_size = 64

In [10]:
def tf_lower_and_split_punct(text):
    """Standardize raw sentences for the TextVectorization layers.

    Steps: lowercase, drop every character that is not a space, a-z or one of
    ``. ? ! , ¿``, surround each of those punctuation marks with spaces, trim
    the ends, and wrap the result in ``[SOS] ... [EOS]``.

    NOTE(review): the whitelist is ASCII-only, so accented letters and 'ñ' are
    deleted (e.g. "allí" becomes "all"). Changing the pattern changes the
    adapted vocabulary, which presumably has to match the one the saved model
    weights were trained with — confirm before touching it.

    Args:
        text: string tensor of sentences.

    Returns:
        String tensor with the standardized sentences.
    """
    lowered = tf.strings.lower(text)
    # Keep only the whitelisted characters.
    filtered = tf.strings.regex_replace(lowered, "[^ a-z.?!,¿]", "")
    # Pad punctuation with spaces so each mark becomes its own token.
    spaced = tf.strings.regex_replace(filtered, "[.?!,¿]", r" \0 ")
    trimmed = tf.strings.strip(spaced)
    return tf.strings.join(["[SOS]", trimmed, "[EOS]"], separator=" ")

# Both languages use the same vectorizer settings: custom standardization,
# integer token ids, ragged (unpadded) output and a vocabulary capped at vocab_size.
_vectorizer_config = dict(
    standardize=tf_lower_and_split_punct,
    output_mode='int',
    ragged=True,
    max_tokens=vocab_size,
)

english_vectorization = tf.keras.layers.TextVectorization(**_vectorizer_config)
spanish_vectorization = tf.keras.layers.TextVectorization(**_vectorizer_config)

# Build each vocabulary from the training split only.
english_data = [pair[0] for pair in train_pairs]
spanish_data = [pair[1] for pair in train_pairs]
english_vectorization.adapt(english_data)
spanish_vectorization.adapt(spanish_data)

    

In [11]:
# Save the vocabularies learned by the vectorization layers as JSON lists.
english_vocab = english_vectorization.get_vocabulary()
spanish_vocab = spanish_vectorization.get_vocabulary()

for vocab_path, vocab in (
    ('text_vectorization_files/english_vocab.json', english_vocab),
    ('text_vectorization_files/spanish_vocab.json', spanish_vocab),
):
    # open(..., 'w') does not create missing directories, so make sure the
    # target folder exists (this matters on a fresh checkout).
    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f)

In [12]:
# Forward (word -> id) and inverse (id -> word) lookups over the Spanish vocabulary.
# Both share the same vocabulary, mask token and out-of-vocabulary token.
_spanish_lookup_config = dict(
    vocabulary=spanish_vocab,
    mask_token="",
    oov_token='[UNK]',
)

word_to_id = tf.keras.layers.StringLookup(**_spanish_lookup_config)
id_to_word = tf.keras.layers.StringLookup(invert=True, **_spanish_lookup_config)

In [13]:
def tokens_to_text(tokens, id_to_word):
    """Turn token ids back into text.

    Args:
        tokens: token ids to convert.
        id_to_word: callable mapping ids to word strings (e.g. an inverted StringLookup).

    Returns:
        String tensor with the words joined by single spaces along the last axis.
    """
    return tf.strings.reduce_join(id_to_word(tokens), axis=-1, separator=" ")

In [14]:
print(english_vocab[:10])
print(spanish_vocab[:10])

['', '[UNK]', '[SOS]', '[EOS]', '.', 'the', 'i', 'to', 'you', 'tom']
['', '[UNK]', '[SOS]', '[EOS]', '.', 'de', 'que', 'a', 'no', 'tom']


In [15]:
# Ids of the start/end markers in the Spanish vocabulary; translate() seeds each
# hypothesis with sos_id and treats eos_id as the end of a hypothesis.
sos_id = word_to_id('[SOS]')
eos_id = word_to_id('[EOS]')

In [16]:
def generate_dataset(data, english_vectorization,spanish_vectorization):
    """Build a batched tf.data.Dataset of raw (english, spanish) string pairs.

    Args:
        data: sequence of (english, spanish) pairs.
        english_vectorization: unused here; kept so existing callers keep working.
        spanish_vectorization: unused here; kept so existing callers keep working.

    Returns:
        Dataset yielding (english_batch, spanish_batch), batched with the
        module-level ``batch_size``.
    """
    sources = [pair[0] for pair in data]
    targets = [pair[1] for pair in data]

    pairs_ds = tf.data.Dataset.from_tensor_slices((sources, targets))
    return pairs_ds.batch(batch_size = batch_size)

In [17]:
# Batched datasets of raw string pairs. The vectorizers are passed along but
# generate_dataset() does not use them; tokenization happens later in preprocess_text().
train_dataset = generate_dataset(train_pairs,english_vectorization,spanish_vectorization)
val_dataset = generate_dataset(val_pairs,english_vectorization,spanish_vectorization)

In [18]:
def preprocess_text(context,target):
    """Tokenize one batch and build teacher-forcing inputs and labels.

    Args:
        context: batch of English sentences (strings).
        target: batch of Spanish sentences (strings).

    Returns:
        ``((context_ids, decoder_input), decoder_labels)`` as dense tensors,
        where decoder_input is the target without its last token and
        decoder_labels is the target without its first token.
    """
    context_ids = english_vectorization(context).to_tensor()
    target_ids = spanish_vectorization(target)
    # Shift by one: the decoder sees tokens [0..n-1] and predicts tokens [1..n].
    decoder_input = target_ids[:,:-1].to_tensor()
    decoder_labels = target_ids[:,1:].to_tensor()
    return (context_ids,decoder_input),decoder_labels

In [19]:
# Tokenize every batch, with the parallelism chosen by tf.data.AUTOTUNE.
# NOTE: this rebinds train_dataset/val_dataset to the mapped datasets, so the cell
# is not idempotent — running it a second time would apply preprocess_text() to
# already-tokenized batches. Re-run the dataset-creation cell first.
train_dataset = train_dataset.map(preprocess_text,tf.data.AUTOTUNE)
val_dataset = val_dataset.map(preprocess_text,tf.data.AUTOTUNE)

In [20]:
# Print the first two rows of two batches: x = English ids, y = decoder input, z = labels.
# NOTE: x, y and z stay bound to the second batch after the loop; later cells
# (encoder(x), decoder(output_1, y), translator((x, y))) depend on these leftovers.
for (x,y),z in train_dataset.take(2):
    print(x[:2])
    print(y[:2])
    print(z[:2])

tf.Tensor(
[[   2   55  148   21    8   86  488 5127   11    3    0    0    0    0
     0    0    0    0    0]
 [   2   20    8  121   37    7   35   63   11    3    0    0    0    0
     0    0    0    0    0]], shape=(2, 19), dtype=int64)
tf.Tensor(
[[    2    12   205    61   124   157   729 11280    11     0     0     0
      0     0     0     0     0]
 [    2    12   204   482    96   602    11     0     0     0     0     0
      0     0     0     0     0]], shape=(2, 17), dtype=int64)
tf.Tensor(
[[   12   205    61   124   157   729 11280    11     3     0     0     0
      0     0     0     0     0]
 [   12   204   482    96   602    11     3     0     0     0     0     0
      0     0     0     0     0]], shape=(2, 17), dtype=int64)
tf.Tensor(
[[   2    9   40   32   28  334   61   29 1288   20   34 3682  111    4
     3    0    0    0    0    0    0]
 [   2   17   18   59   25  191    4    3    0    0    0    0    0    0
     0    0    0    0    0    0    0]], shape=(2, 21), d

### Encoder-Decoder Model

In [21]:
# Default vocabulary size for the model layers. Same value as vocab_size above;
# NOTE(review): both names are used when instantiating layers, so they presumably
# have to stay equal — confirm.
vocab_size_1 = 12000
# Width of the embeddings, LSTMs and attention; also the size of the initial
# decoder state created in translate().
units_1 = 128

#### Encoder

In [22]:
class Encoder(tf.keras.layers.Layer):
    """Embeds source token ids and encodes them with a bidirectional LSTM.

    Args:
        vocab_size: size of the source vocabulary (embedding input_dim).
        units: embedding width and LSTM size.
        **kwargs: standard Keras layer arguments (name, trainable, dtype).
            Accepting them lets ``Encoder.from_config(layer.get_config())``
            work, because ``get_config()`` includes those base keys.
    """
    def __init__(self,vocab_size = vocab_size_1,units = units_1,**kwargs):
        super(Encoder,self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.units =units

        # mask_zero=True makes id 0 (padding) produce a mask for downstream layers.
        # No input_shape is passed: the layer accepts any sequence length as-is.
        self.embedding = tf.keras.layers.Embedding(input_dim = vocab_size,output_dim = units,mask_zero=True)
        # Forward and backward outputs are summed, so the feature size stays `units`.
        self.lstm = tf.keras.layers.Bidirectional(merge_mode='sum',layer = tf.keras.layers.LSTM(units,return_sequences= True))

    def call(self,encoder_inputs):
        """Return one `units`-wide vector per input token."""
        embedded_output = self.embedding(encoder_inputs)
        output = self.lstm(embedded_output)
        return output

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "units": self.units
        })
        return config

In [23]:
# Smoke test: run the encoder on x, the English batch left over from the
# batch-inspection loop above.
encoder = Encoder(vocab_size,units_1)

output_1 = encoder(x)

  super().__init__(**kwargs)


In [24]:
output_1.shape

TensorShape([64, 21, 128])

### CrossAttention

In [25]:
class CrossAttention(tf.keras.layers.Layer):
    """Single-head attention from the target sequence over the encoder output.

    The attention result is added back to the target (residual connection)
    and layer-normalized.

    Args:
        units: key dimension of the attention head.
        **kwargs: standard Keras layer arguments (name, trainable, dtype).
            Accepting them lets ``CrossAttention.from_config(layer.get_config())``
            work, because ``get_config()`` includes those base keys.
    """
    def __init__(self,units=units_1,**kwargs):
        super().__init__(**kwargs)

        self.units =units

        self.mha = tf.keras.layers.MultiHeadAttention(key_dim= units,num_heads=1)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self,context,target):
        """Attend from `target` (queries) to `context` (values) and normalize.

        Returns a tensor with the same shape as `target`.
        """
        attn_output = self.mha(query = target,value = context)
        x = self.add([target,attn_output])
        x = self.layernorm(x)
        return x

    def get_config(self):
        """Return the base layer config extended with `units`."""
        config = super().get_config()
        config.update({
            "units": self.units
        })
        return config

In [26]:
# Smoke test for CrossAttention on the leftover batch.
attention =CrossAttention(units_1)
# Embed the decoder-input ids y with a throwaway Embedding layer so the queries
# have the same feature width (units_1) as the encoder output.
target = tf.keras.layers.Embedding(vocab_size_1,units_1,input_shape = (None,None),mask_zero=True)(y)
# Queries are the embedded target tokens; values are the encoder output.
output_2 = attention(output_1,target)



In [27]:
target.shape

TensorShape([64, 20, 128])

In [28]:
output_2.shape

TensorShape([64, 20, 128])

#### Decoder

In [29]:
class Decoder(tf.keras.layers.Layer):
    """Attention decoder: embedding -> LSTM -> cross-attention -> LSTM -> log-softmax.

    Args:
        vocab_size: size of the target vocabulary (embedding input_dim and
            width of the output layer).
        units: embedding width and LSTM/attention size.
        **kwargs: standard Keras layer arguments (name, trainable, dtype).
            Accepting them lets ``Decoder.from_config(layer.get_config())``
            work, because ``get_config()`` includes those base keys.
    """
    def __init__(self,vocab_size = vocab_size_1,units = units_1,**kwargs):
        super(Decoder,self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.units = units

        self.embedding = tf.keras.layers.Embedding(input_dim = vocab_size,output_dim = units,mask_zero=True)
        self.pre_attention_rnn = tf.keras.layers.LSTM(units,return_sequences = True,return_state = True)
        self.attention = CrossAttention(units)
        self.post_attention_rnn = tf.keras.layers.LSTM(units = units,return_sequences=True)
        # log_softmax activation: the layer outputs log-probabilities over the vocabulary.
        self.dense = tf.keras.layers.Dense(vocab_size,activation = tf.nn.log_softmax)

    def call(self,context,target,state = None,return_state = False):
        """Predict next-token log-probabilities for every target position.

        Args:
            context: encoder output to attend over.
            target: target token ids fed to the decoder.
            state: optional ``[h, c]`` initial state for the pre-attention LSTM.
            return_state: when True, also return the pre-attention LSTM state.

        Returns:
            The log-probabilities, or ``(log_probs, [state_h, state_c])`` when
            ``return_state`` is True.

        NOTE(review): only the pre-attention LSTM state is returned and accepted.
        When decoding one token per call (as translate() does), the
        post-attention LSTM therefore starts from its default initial state at
        every step, unlike a full-sequence call — confirm this is intended.
        """
        embedding_output = self.embedding(target)
        x,state_h,state_c = self.pre_attention_rnn(embedding_output,initial_state=state)
        x = self.attention(context,x)
        x = self.post_attention_rnn(x)
        logits = self.dense(x)

        if return_state:
            return logits,[state_h,state_c]

        return logits

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "units": self.units
        })
        return config




In [30]:
# Smoke test: decode the leftover decoder-input batch y against the encoder output.
decoder = Decoder(vocab_size,units_1)
output= decoder(output_1,y)



In [31]:
output.shape

TensorShape([64, 20, 12000])

#### Translator Model

In [32]:
class Translator(tf.keras.Model):
    """Encoder-decoder translation model.

    Args:
        vocab_size: vocabulary size passed to both the Encoder and the Decoder.
        units: layer width passed to both the Encoder and the Decoder.
    """
    def __init__(self,vocab_size =vocab_size_1,units = units_1):
        super().__init__()
        self.encoder = Encoder(vocab_size,units)
        self.decoder = Decoder(vocab_size,units)

    def call(self,inputs):
        """Run a ``(context, target)`` pair through the encoder and decoder.

        Returns the decoder output for every target position.
        """
        context,target = inputs
        return self.decoder(self.encoder(context),target)


In [33]:
# Build the full model and run the leftover batch (x, y) through it once.
# NOTE(review): this forward pass happens before load_weights() below, presumably
# so the model's variables exist when the weights are restored — confirm.
translator = Translator(vocab_size_1,units_1)
outputs = translator((x,y))
outputs.shape



TensorShape([64, 20, 12000])

In [35]:
# The Decoder's output layer already applies log_softmax. from_logits=True makes the
# loss apply softmax once more, which leaves already-normalized log-probabilities
# unchanged, so the loss value is still the usual cross-entropy.
translator.compile(optimizer = 'adam',loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics = ['accuracy'])
translator.summary()

In [37]:
# Restore the trained weights from disk.
# NOTE(review): the layer sizes and the vocabularies adapted above presumably have to
# match the ones used when this file was written — confirm.
translator.load_weights("model_weights/english_to_spanish.weights.h5")

  saveable.load_own_variables(weights_store.get(inner_path))


In [39]:
def translate(model,text,max_length = 30,beam_width= 1,english_vectorizer = english_vectorization):
    """Translate one English sentence into Spanish with beam search.

    Each hypothesis is a list ``[token_ids, score, decoder_state]`` where
    ``score`` is the sum of the log-probabilities of the chosen tokens.

    Args:
        model: object exposing ``encoder`` and ``decoder`` (e.g. a Translator).
        text: a single English sentence.
        max_length: maximum number of decoding steps.
        beam_width: number of hypotheses kept per step (1 = greedy decoding).
        english_vectorizer: layer that turns the sentence into token ids.

    Returns:
        Up to ``beam_width`` items ``[translation, 'score: <value>']``, best
        score first. If the step limit is reached, hypotheses that produced
        [EOS] in the last step are still returned, and if no hypothesis
        finished at all the best unfinished ones are returned instead of an
        empty list.
    """
    text = tf.convert_to_tensor(text)[tf.newaxis]
    context = english_vectorizer(text).to_tensor()
    context = model.encoder(context)

    k = beam_width
    eos_token = int(eos_id.numpy())
    token_dtype = eos_id.dtype

    # Zero initial state for the decoder's pre-attention LSTM; fall back to the
    # notebook-level units_1 if the decoder does not expose its width.
    state_units = getattr(model.decoder, "units", units_1)
    state = [tf.zeros((1,state_units)),tf.zeros((1,state_units))]

    def by_score(hypothesis):
        return hypothesis[1]

    def is_finished(hypothesis):
        return int(hypothesis[0][-1]) == eos_token

    sequences = [[[sos_id.numpy()],0.0,state]]
    final_sequences = []

    for _ in range(max_length):
        # Stop as soon as the beam is full or there is nothing left to expand.
        if len(final_sequences) >= k or not sequences:
            break

        # Keep only the k best hypotheses before expanding them.
        if len(sequences) > k:
            sequences = sorted(sequences, key = by_score, reverse = True)[:k]

        candidates = []
        for hypothesis in sequences:
            if is_finished(hypothesis):
                final_sequences.append(hypothesis)
                continue

            tokens, score, cur_state = hypothesis
            cur_token = tf.cast(tf.fill((1,1),tokens[-1]),token_dtype)

            next_tokens,next_state,next_logits = generate_next_token(decoder = model.decoder,
                                                                     context = context,
                                                                     next_token = cur_token,
                                                                     state = cur_state,
                                                                     beam_width = k)

            # New lists are built for every child, so hypotheses never share a
            # token list and no deep copy is needed.
            for token, log_prob in zip(next_tokens, next_logits):
                candidates.append([tokens + [token], score + log_prob, next_state])

        sequences = candidates

    if len(final_sequences) < k and sequences:
        # The step limit was hit: hypotheses expanded in the last step were never
        # checked for [EOS]. Collect the finished ones among the k best, and if
        # nothing finished at all return the best unfinished hypotheses.
        leftovers = sorted(sequences, key = by_score, reverse = True)[:k]
        final_sequences.extend(h for h in leftovers if is_finished(h))
        if not final_sequences:
            final_sequences = leftovers

    final_sequences.sort(key = by_score, reverse = True)

    results = []
    for tokens, score, _ in final_sequences[:k]:
        translation = tokens_to_text(tokens,id_to_word).numpy().decode()
        results.append([translation,f'score: {round(score,3)}'])
    return results

In [40]:
def generate_next_token(decoder,context,next_token,state,beam_width):
    """Run one decoder step and return the `beam_width` best next tokens.

    Args:
        decoder: callable as ``decoder(context, token, state, return_state=True)``.
        context: encoder output for the sentence being translated.
        next_token: the most recent token of the hypothesis.
        state: decoder state to continue from.
        beam_width: number of candidate tokens to return.

    Returns:
        ``(token_ids, new_state, scores)``; ids and scores are indexable with
        ``beam_width`` entries, best first.
    """
    step_output, state = decoder(context,next_token,state,return_state = True)

    # Only the prediction for the last position matters.
    last_step = step_output[:,-1,:]

    best = tf.nn.top_k(last_step,k = beam_width)
    best_scores = tf.squeeze(best.values).numpy()
    best_tokens = tf.squeeze(best.indices).numpy()

    if beam_width != 1:
        return best_tokens,state,best_scores

    # squeeze() leaves 0-d arrays when there is a single candidate; wrap them
    # so callers can still index the results.
    return [best_tokens],state,[best_scores]

In [41]:
# Split the shuffled pairs (training and validation together) into parallel lists.
eng_data = [x[0] for x in data]
spa_data = [x[1] for x in data]

In [48]:
# Pick one example pair to translate.
# NOTE(review): train_pairs is data[:len_train] with len_train = 95172, so index 12345
# is a training pair — use an index >= len_train to check quality on validation data.
p = 12345
print(eng_data[p])
print(spa_data[p])

It didn't take Tom long to realize he wasn't welcome there.
 No le tomó mucho a Tom darse cuenta de que no era bienvenido allí. 


In [50]:
# The score reported by translate() is a sum of log-probabilities (not a probability), because the decoder's output activation is tf.nn.log_softmax.

In [52]:
translate(translator,eng_data[p],beam_width=3)

[['[SOS] no escrib a tom mucho suficiente cuenta que no era bienvenido all . [EOS]',
  'score: -13.274'],
 ['[SOS] no escrib a tom mucho suficiente cuenta que no fue bienvenido all . [EOS]',
  'score: -14.214'],
 ['[SOS] no escrib a tom mucho suficiente cuenta que no era bienvenido ah . [EOS]',
  'score: -14.807']]