<a href="https://colab.research.google.com/github/Teganmosi/Translation-model/blob/main/spanish_to_english_translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
!pip install nltk


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [67]:
import os
import pickle
import sys

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import utils_preproc
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import GRU, Dense, Embedding, Input
from tensorflow.keras.models import Model, load_model

print(tf.__version__)

2.12.0


In [48]:
SEED = 0
MODEL_PATH = "translate_models/baseline"
DATA_URL = (
    "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
)
LOAD_CHECKPOINT = False

In [49]:
tf.random.set_seed(SEED)

In [50]:
path_to_zip = tf.keras.utils.get_file(
    "spa-eng.zip", origin=DATA_URL, extract=True
)

path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng/spa.txt")
print("Translation data stored at:", path_to_file)

Translation data stored at: /root/.keras/datasets/spa-eng/spa.txt


In [51]:
data = pd.read_csv(
    path_to_file, sep="\t", header=None, names=["english", "spanish"]
)

In [52]:
data.sample(10)

Unnamed: 0,english,spanish
43705,Give some meat to the dog.,Dale un cacho de carne al perro.
54629,Tom doesn't want to be here.,Tom no quiere estar aquí.
7916,I called him up.,Lo llamé.
107731,I will have him come here the day after tomorrow.,Pienso hacerlo venir aquí pasado mañana.
38244,Tom will quit on Monday.,Tomás va a renunciar el lunes.
73432,Men don't drive as well as women.,Los hombres no conducen tan bien como las muje...
41806,The girls began to laugh.,Las niñas empezaron a reírse.
84747,All of us were surprised at the news.,Las noticias nos sorprendieron a todos.
5793,He looked well.,Él tenía buen aspecto.
72009,He could not hold back his tears.,Él no podía contener sus lágrimas.


In [53]:
raw = [
    "No estamos comiendo.",
    "Está llegando el invierno.",
    "El invierno se acerca.",
    "Tom no comio nada.",
    "Su pierna mala le impidió ganar la carrera.",
    "Su respuesta es erronea.",
    "¿Qué tal si damos un paseo después del almuerzo?",
]

import re

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[^a-z?.!,]+', ' ', sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

processed = [preprocess_sentence(sentence) for sentence in raw]
print(processed)

['<start> no estamos comiendo . <end>', '<start> est llegando el invierno . <end>', '<start> el invierno se acerca . <end>', '<start> tom no comio nada . <end>', '<start> su pierna mala le impidi ganar la carrera . <end>', '<start> su respuesta es erronea . <end>', '<start> qu tal si damos un paseo despu s del almuerzo ? <end>']


In [68]:
def load_and_preprocess(path, num_examples):
    with open(path_to_file) as fp:
        lines = fp.read().strip().split("\n")


    sentence_pairs = [
        [utils_preproc.preprocess_sentence(sent) for sent in line.split("\t")]
        for line in lines[:num_examples]
    ]

    return zip(*sentence_pairs)

In [69]:
en, sp = load_and_preprocess(path_to_file, num_examples=10)

print(en[-1])
print(sp[-1])

<start> fire ! <end>
<start> incendio ! <end>


In [70]:
def load_and_integerize(path, num_examples=None):
    targ_lang, inp_lang = load_and_preprocess(path, num_examples)


    input_tensor, inp_lang_tokenizer = utils_preproc.tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = utils_preproc.tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [71]:
TEST_PROP = 0.2
NUM_EXAMPLES = 3000


In [72]:
input_tensor, target_tensor, inp_lang, targ_lang = load_and_integerize(
    path_to_file, NUM_EXAMPLES
)

In [73]:
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]

In [74]:
splits = train_test_split(
    input_tensor, target_tensor, test_size=TEST_PROP, random_state=SEED
)

input_tensor_train = splits[0]
input_tensor_val = splits[1]

target_tensor_train = splits[2]
target_tensor_val = splits[3]

In [75]:
(
    len(input_tensor_train),
    len(target_tensor_train),
    len(input_tensor_val),
    len(target_tensor_val),
)

(2400, 2400, 600, 600)

In [76]:
print("Input Language; int to word mapping")
print(input_tensor_train[0])
print(utils_preproc.int2word(inp_lang, input_tensor_train[0]), "\n")

print("Target Language; int to word mapping")
print(target_tensor_train[0])
print(utils_preproc.int2word(targ_lang, target_tensor_train[0]))

Input Language; int to word mapping
[ 1 38 87  3  2  0  0  0  0  0  0]
['<start>', 'nos', 'vamos', '.', '<end>', '', '', '', '', '', ''] 

Target Language; int to word mapping
[  1  14  22 249   3   2   0   0]
['<start>', 'we', 're', 'going', '.', '<end>', '', '']


In [77]:
def create_dataset(encoder_input, decoder_input):


    # shift ahead by 1
    target = tf.roll(decoder_input, -1, 1)

    # replace last column with 0s
    zeros = tf.zeros([target.shape[0], 1], dtype=tf.int32)
    target = tf.concat((target[:, :-1], zeros), axis=-1)

    dataset = tf.data.Dataset.from_tensor_slices(
        ((encoder_input, decoder_input), target)
    )

    return dataset

In [78]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64

In [79]:
train_dataset = (
    create_dataset(input_tensor_train, target_tensor_train)
    .shuffle(BUFFER_SIZE)
    .repeat()
    .batch(BATCH_SIZE, drop_remainder=True)
)


eval_dataset = create_dataset(input_tensor_val, target_tensor_val).batch(
    BATCH_SIZE, drop_remainder=True
)

In [80]:
EMBEDDING_DIM = 256
HIDDEN_UNITS = 1024

INPUT_VOCAB_SIZE = len(inp_lang.word_index) + 1
TARGET_VOCAB_SIZE = len(targ_lang.word_index) + 1

In [81]:
encoder_inputs = Input(shape=(None,), name="encoder_input")

# TODO 2a
encoder_inputs_embedded = Embedding(
    input_dim=INPUT_VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    input_length=max_length_inp,
)(encoder_inputs)

encoder_rnn = GRU(
    units=HIDDEN_UNITS,
    return_sequences=True,
    return_state=True,
    recurrent_initializer="glorot_uniform",
)

encoder_outputs, encoder_state = encoder_rnn(encoder_inputs_embedded)

In [82]:
decoder_inputs = Input(shape=(None,), name="decoder_input")

# TODO 2b
decoder_inputs_embedded = Embedding(
    input_dim=TARGET_VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    input_length=max_length_targ,
)(decoder_inputs)

decoder_rnn = GRU(
    units=HIDDEN_UNITS,
    return_sequences=True,
    return_state=True,
    recurrent_initializer="glorot_uniform",
)

decoder_outputs, decoder_state = decoder_rnn(
    decoder_inputs_embedded, initial_state=encoder_state
)

In [83]:
decoder_dense = Dense(TARGET_VOCAB_SIZE, activation="softmax")

predictions = decoder_dense(decoder_outputs)

In [84]:

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=predictions)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    488704      ['encoder_input[0][0]']          
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    235008      ['decoder_input[0][0]']          
                                                                                              

In [85]:
STEPS_PER_EPOCH = len(input_tensor_train) // BATCH_SIZE
EPOCHS = 1


history = model.fit(
    train_dataset,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=eval_dataset,
    epochs=EPOCHS,
)



In [86]:
if LOAD_CHECKPOINT:
    encoder_model = load_model(os.path.join(MODEL_PATH, "encoder_model.h5"))
    decoder_model = load_model(os.path.join(MODEL_PATH, "decoder_model.h5"))

else:

    encoder_model = Model(inputs=encoder_inputs, outputs=encoder_state)

    decoder_state_input = Input(
        shape=(HIDDEN_UNITS,), name="decoder_state_input"
    )

    # Reuses weights from the decoder_rnn layer
    decoder_outputs, decoder_state = decoder_rnn(
        decoder_inputs_embedded, initial_state=decoder_state_input
    )

    # Reuses weights from the decoder_dense layer
    predictions = decoder_dense(decoder_outputs)

    decoder_model = Model(
        inputs=[decoder_inputs, decoder_state_input],
        outputs=[predictions, decoder_state],
    )

In [87]:
def decode_sequences(input_seqs, output_tokenizer, max_decode_length=50):
    """
    Arguments:
    input_seqs: int tensor of shape (BATCH_SIZE, SEQ_LEN)
    output_tokenizer: Tokenizer used to conver from int to words

    Returns translated sentences
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seqs)

    # Populate the first character of target sequence with the start character.
    batch_size = input_seqs.shape[0]
    target_seq = tf.ones([batch_size, 1])

    decoded_sentences = [[] for _ in range(batch_size)]

    # Sampling loop
    for i in range(max_decode_length):
        output_tokens, decoder_state = decoder_model.predict(
            [target_seq, states_value]
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[:, -1, :], axis=-1)

        tokens = utils_preproc.int2word(output_tokenizer, sampled_token_index)

        for j in range(batch_size):
            decoded_sentences[j].append(tokens[j])

        # Update the target sequence (of length 1).
        target_seq = tf.expand_dims(tf.constant(sampled_token_index), axis=-1)

        # Update states
        states_value = decoder_state

    return decoded_sentences

In [88]:
sentences = [
    "No estamos comiendo.",
    "Está llegando el invierno.",
    "El invierno se acerca.",
    "Tom no comio nada.",
    "Su pierna mala le impidió ganar la carrera.",
    "Su respuesta es erronea.",
    "¿Qué tal si damos un paseo después del almuerzo?",
]

reference_translations = [
    "We're not eating.",
    "Winter is coming.",
    "Winter is coming.",
    "Tom ate nothing.",
    "His bad leg prevented him from winning the race.",
    "Your answer is wrong.",
    "How about going for a walk after lunch?",
]

machine_translations = decode_sequences(
    utils_preproc.preprocess(sentences, inp_lang), targ_lang, max_length_targ
)

for i in range(len(sentences)):
    print("-")
    print("INPUT:")
    print(sentences[i])
    print("REFERENCE TRANSLATION:")
    print(reference_translations[i])
    print("MACHINE TRANSLATION:")
    print(machine_translations[i])

-
INPUT:
No estamos comiendo.
REFERENCE TRANSLATION:
We're not eating.
MACHINE TRANSLATION:
['i', 'm', '.', '<end>', '', '', '', '']
-
INPUT:
Está llegando el invierno.
REFERENCE TRANSLATION:
Winter is coming.
MACHINE TRANSLATION:
['i', 'm', '.', '<end>', '', '', '', '']
-
INPUT:
El invierno se acerca.
REFERENCE TRANSLATION:
Winter is coming.
MACHINE TRANSLATION:
['i', 'm', '.', '<end>', '', '', '', '']
-
INPUT:
Tom no comio nada.
REFERENCE TRANSLATION:
Tom ate nothing.
MACHINE TRANSLATION:
['i', 'm', '.', '<end>', '', '', '', '']
-
INPUT:
Su pierna mala le impidió ganar la carrera.
REFERENCE TRANSLATION:
His bad leg prevented him from winning the race.
MACHINE TRANSLATION:
['i', 'm', '.', '<end>', '', '', '', '']
-
INPUT:
Su respuesta es erronea.
REFERENCE TRANSLATION:
Your answer is wrong.
MACHINE TRANSLATION:
['i', 'm', '.', '<end>', '', '', '', '']
-
INPUT:
¿Qué tal si damos un paseo después del almuerzo?
REFERENCE TRANSLATION:
How about going for a walk after lunch?
MACHINE TRANSL

In [89]:
if not LOAD_CHECKPOINT:
    os.makedirs(MODEL_PATH, exist_ok=True)


    model.save(os.path.join(MODEL_PATH, "model.h5"))
    encoder_model.save(os.path.join(MODEL_PATH, "encoder_model.h5"))
    decoder_model.save(os.path.join(MODEL_PATH, "decoder_model.h5"))

    with open(os.path.join(MODEL_PATH, "encoder_tokenizer.pkl"), "wb") as fp:
        pickle.dump(inp_lang, fp)

    with open(os.path.join(MODEL_PATH, "decoder_tokenizer.pkl"), "wb") as fp:
        pickle.dump(targ_lang, fp)



In [90]:
#Evaluation Metric (BLEU)

def bleu_1(reference, candidate):
    reference = list(filter(lambda x: x != "", reference))  # remove padding
    candidate = list(filter(lambda x: x != "", candidate))  # remove padding
    smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1
    return nltk.translate.bleu_score.sentence_bleu(
        reference, candidate, (1,), smoothing_function
    )

In [91]:
def bleu_4(reference, candidate):
    reference = list(filter(lambda x: x != "", reference))  # remove padding
    candidate = list(filter(lambda x: x != "", candidate))  # remove padding
    smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1
    return nltk.translate.bleu_score.sentence_bleu(
        reference, candidate, (0.25, 0.25, 0.25, 0.25), smoothing_function
    )

In [92]:
%%time
num_examples = len(input_tensor_val)
bleu_1_total = 0
bleu_4_total = 0


for idx in range(num_examples):

    reference_sentence = utils_preproc.int2word(
        targ_lang, target_tensor_val[idx][1:]
    )

    decoded_sentence = decode_sequences(
        input_tensor_val[idx : idx + 1], targ_lang, max_length_targ
    )[0]

    bleu_1_total += bleu_1(reference_sentence, decoded_sentence)
    bleu_4_total += bleu_4(reference_sentence, decoded_sentence)

print(f"BLEU 1: {bleu_1_total / num_examples}")
print(f"BLEU 4: {bleu_4_total / num_examples}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
BLEU 1: 0.4365935858115754
BLEU 4: 0.0856400257634858
CPU times: user 8min 23s, sys: 11.2 s, total: 8min 34s
Wall time: 9min 50s
