## Machine Translation Project(English to Spanish)

In [6]:
import pathlib
import random
import string
import tensorflow.strings as tf_strings
import tensorflow.data as tf_data
import re
from keras.layers import TextVectorization
import keras
import tensorflow as tf 
from keras import layers
import json



## Verify acces to the GPU

In [9]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16557475479207385386
xla_global_id: -1
]


In [10]:
source:"http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

In [18]:
text_file =  keras.utils.get_file(
    fname = "spa-eng.zip",
    origin = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract = True,
)

text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

with open(text_file, "r") as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [20]:
random.shuffle(text_pairs)

In [None]:
for i in range(5):
    print(text_pairs[i])

## Structure of the Dataset

In [23]:
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) -2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples]

print(f"{len(text_pairs)}total pairs")
print(f"{len(train_pairs)}total pairs")
print(f"{len(val_pairs)}total pairs")
print(f"{len(test_pairs)}total pairs")

118964total pairs
83276total pairs
17844total pairs
2total pairs


In [24]:
# parameters

strip_chars = string.punctuation + "Â¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64

## Vectorize the data

In [33]:
def custom_standardization(input_string):
    lowercase = tf_strings.lower(input_string)
    return tf_strings.regex_replace(lowercase,f"[{re.escape(strip_chars)}]","")

#vectorization
eng_vectorization = TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length,
)

spa_vectorization = TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length + 1,
    standardize = custom_standardization,
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]

eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

eng_vectorization_config = eng_vectorization.get_config()
eng_vectorization_config.pop('standardize',None)
eng_vocab = eng_vectorization.get_vocabulary()
with open('eng_vectorization_config.json','w',encoding='utf-8') as f:
    json.dump(eng_vectorization_config, f)
    
with open('eng_vocab.json','w', encoding='utf-8') as f:
    json.dump(eng_vocab, f)   
    
spa_vectorization_config = spa_vectorization.get_config()
spa_vectorization_config .pop('standardize',None)
spa_vocab = spa_vectorization.get_vocabulary()

with open('spa_vectorization_config.json','w',encoding='utf-8') as f:
    json.dump(spa_vectorization_config,f)

with open('spa_vectorization_config.json','w',encoding='utf-8') as f:
    json.dump(spa_vocab,f)    

def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return (
        {
            "encoder_inputs": eng, 
            "decoder_inputs": spa[:,:-1],
        }, 
        spa[:,1:],
        )
    
def make_dataset(pairs):
    eng_texts,spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf_data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
    
    

In [34]:
for inputs,targets in train_ds.take(1):
    print(inputs["encoder_inputs"].shape)
    print(targets.shape)

(64, 20)
(64, 20)
