## Get the data file

In [5]:
from homemadetransformer.config import DATA_DIR

In [7]:
print(DATA_DIR)

C:\Users\willi\Desktop\AIPortfolio\HomemadeTransformer\data


In [10]:
file_path = DATA_DIR / "raw" / "spa.txt"

with open(file_path, encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

In [11]:
def clean_pair(line):
    parts = line.strip().split("\t")

    if len(parts) < 2:
        print(f"PROBLEM: {line}")
        return None
    eng = parts[0].strip()
    spa = parts[1].strip().split("CC-BY")[0].strip()  # Remove attribution text
    return eng, spa

In [12]:
clean_pairs = [clean_pair(line) for line in lines]
clean_pairs = [pair for pair in clean_pairs if pair is not None]

In [13]:
clean_pairs[:10]

[('Go.', 'Ve.'),
 ('Go.', 'Vete.'),
 ('Go.', 'Vaya.'),
 ('Go.', 'Váyase.'),
 ('Hi.', 'Hola.'),
 ('Run!', '¡Corre!'),
 ('Run!', '¡Corran!'),
 ('Run!', '¡Huye!'),
 ('Run!', '¡Corra!'),
 ('Run!', '¡Corred!')]

## Simple Word Tokenizer

In [19]:
from itertools import chain

In [25]:
# Vocab objects
split_eng_sent = [["<SOS>"] + eng_sent.lower().strip().split() + ["<EOS>"] 
                  for eng_sent, spa_sent in clean_pairs]


split_spa_sent = [["<SOS>"] + spa_sent.lower().strip().split() + ["<EOS>"] 
                  for eng_sent, spa_sent in clean_pairs]

eng_vocab = set(chain.from_iterable(split_eng_sent))
spa_vocab = set(chain.from_iterable(split_spa_sent))

vocab_to_ind_eng = {word: i for i, word in enumerate(eng_vocab)}
vocab_to_ind_spa = {word: i for i, word in enumerate(spa_vocab)}

ind_to_word_eng = {i:w for w, i in vocab_to_ind_eng.items()}
ind_to_word_spa = {i:w for w, i in vocab_to_ind_spa.items()}

In [28]:
#Encode the sentences
def encode_sentence_vectors(sentence: list, vocab_to_ind: dict):
    return [vocab_to_ind[word] for word in sentence]


eng_senteces_encoded = [encode_sentence_vectors(sentence, vocab_to_ind_eng)
                        for sentence in split_eng_sent]

spa_senteces_encoded = [encode_sentence_vectors(sentence, vocab_to_ind_spa)
                        for sentence in split_spa_sent]

In [31]:
print(split_eng_sent[0])
print(eng_senteces_encoded[0])

['<SOS>', 'go.', '<EOS>']
[17490, 24991, 22338]
