# Build a Transformer

## Initial Setup

In [1]:
# Initial imports
import os
import subprocess
import sys
import tarfile

from collections import Counter
from pathlib import Path
from pprint import pprint
from typing import Dict, List, Tuple

import requests
import spacy

## Word Embedding

### Word Tokenization with the spaCy Library

In [2]:
# Location of the Multi30k training archive
url: str = (
    "https://raw.githubusercontent.com/neychev/"
    "small_DL_repo/master/datasets/Multi30k/training.tar.gz"
)

# Create the data folder if it doesn't exist
DATA_FOLDER: str = (
    "/llm_app/learning/build_a_text_to_image_generator_from_scratch/files"
)
os.makedirs(DATA_FOLDER, exist_ok=True)

# Download the training data if it doesn't exist
if not os.path.exists(f"{DATA_FOLDER}/training.tar.gz"):
    # The timeout keeps a stalled connection from hanging the kernel, and
    # raise_for_status() prevents an HTTP error page from being saved to disk
    # as if it were the archive.
    fb1 = requests.get(url, timeout=60)
    fb1.raise_for_status()
    with open(f"{DATA_FOLDER}/training.tar.gz", "wb") as f:
        f.write(fb1.content)

# Extract the archive into the data folder. The context manager closes the
# archive even when extraction raises.
with tarfile.open(f"{DATA_FOLDER}/training.tar.gz") as train:
    # The archive was fetched over the network, so apply the "data" extraction
    # filter (rejects absolute paths and path traversal) on Python versions
    # whose tarfile module provides extraction filters.
    if hasattr(tarfile, "data_filter"):
        train.extractall(DATA_FOLDER, filter="data")
    else:
        train.extractall(DATA_FOLDER)

In [3]:
# Disk usage of the downloaded data: one human-readable total (-s, -h) for
# every entry directly inside DATA_FOLDER
!du -sh {DATA_FOLDER}/*

2.1M	/llm_app/learning/build_a_text_to_image_generator_from_scratch/files/train.de
1.8M	/llm_app/learning/build_a_text_to_image_generator_from_scratch/files/train.en
1.2M	/llm_app/learning/build_a_text_to_image_generator_from_scratch/files/training.tar.gz


In [4]:
# Read each file in binary mode, decoding every line as UTF-8 and trimming
# the surrounding whitespace (which includes the trailing newline).
with open(str(Path(DATA_FOLDER) / "train.de"), "rb") as fb:
    trainde: List[str] = [raw.decode("utf-8").strip() for raw in fb]

with open(str(Path(DATA_FOLDER) / "train.en"), "rb") as fb:
    trainen: List[str] = [raw.decode("utf-8").strip() for raw in fb]

In [5]:
# Report how many entries each list holds
print(f">>> The length of the list trainde is {len(trainde)}")
print(f">>> The length of the list trainen is {len(trainen)}")

# The [-5:] slices select the LAST five entries of each list, so the labels
# say "last" to match what is actually displayed.
print()
print(">>> The last five elements of the list trainde are")
pprint(trainde[-5:])

print()
print(">>> The last five elements of the list trainen are")
pprint(trainen[-5:])

>>> The length of the list trainde is 29001
>>> The length of the list trainen is 29001

>>> The first five elements of the list trainde are
['Ein Bergsteiger übt an einer Kletterwand.',
 'Zwei Bauarbeiter arbeiten auf einer Straße vor einem Hauses.',
 'Ein älterer Mann sitzt mit einem Jungen mit einem Wagen vor einer Fassade.',
 'Ein Mann in Shorts und Hawaiihemd lehnt sich über das Geländer eines '
 'Lotsenboots, mit Nebel und Bergen im Hintergrund.',
 '']

>>> The first five elements of the list trainen are
['A rock climber practices on a rock climbing wall.',
 "Two male construction workers are working on a street outside someone's home",
 'An elderly man sits outside a storefront accompanied by a young boy with a '
 'cart.',
 'A man in shorts and a Hawaiian shirt leans over the rail of a pilot boat, '
 'with fog and mountains in the background.',
 '']


In [6]:
def _load_spacy_model(name: str):
    """Load the spaCy pipeline ``name``, downloading it first if it is missing.

    Args:
        name: Name of the spaCy pipeline package, e.g. ``"en_core_web_sm"``.

    Returns:
        The object returned by ``spacy.load(name)``.

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
        OSError: If the pipeline still cannot be loaded after the download.
    """
    try:
        return spacy.load(name)
    except OSError:
        # Run the download with the kernel's own interpreter (sys.executable)
        # so the package is installed into the environment this notebook is
        # running in; a bare "python" may resolve to a different one.
        # check=True surfaces a failed download instead of ignoring it.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", name], check=True
        )
        return spacy.load(name)


de_tokenizer = _load_spacy_model("de_core_news_sm")
en_tokenizer = _load_spacy_model("en_core_web_sm")

In [7]:
# Split the first German/English sentence pair into token strings
tokenized_de: List[str] = [token.text for token in de_tokenizer.tokenizer(trainde[0])]
tokenized_en: List[str] = [token.text for token in en_tokenizer.tokenizer(trainen[0])]

# One token list per line
print(tokenized_de, tokenized_en, sep="\n")

['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']
['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [8]:
# Split the second German/English sentence pair into token strings
tokenized_de1: List[str] = [token.text for token in de_tokenizer.tokenizer(trainde[1])]
tokenized_en1: List[str] = [token.text for token in en_tokenizer.tokenizer(trainen[1])]

# One token list per line
print(tokenized_de1, tokenized_en1, sep="\n")

['Mehrere', 'Männer', 'mit', 'Schutzhelmen', 'bedienen', 'ein', 'Antriebsradsystem', '.']
['Several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


In [9]:
# Indexes reserved for the two special vocabulary entries: the "PAD" key of
# each word dictionary is mapped to PAD, and tokens that are missing from a
# word dictionary are looked up as UNK.
PAD: int = 0
UNK: int = 1

In [10]:
# Tokenize every English sentence and wrap it in "BOS" / "EOS" markers
en_tokens: List[List[str]] = [
    ["BOS", *(tok.text for tok in en_tokenizer.tokenizer(x)), "EOS"]
    for x in trainen
]

# Tally how often each token occurs across all sentences
word_count = Counter()
for sentence in en_tokens:
    word_count.update(sentence)

# Keep at most the 50000 most frequent tokens; the two extra slots in the
# total account for the PAD and UNK entries added below.
frequency: List[Tuple[str, int]] = word_count.most_common(50000)
total_en_words: int = 2 + len(frequency)

# A dictionary mapping tokens to indexes (indexes 0 and 1 are reserved, so
# the most frequent token gets index 2)
en_word_dict: Dict[str, int] = {
    token: rank + 2 for rank, (token, _) in enumerate(frequency)
}
# NOTE: if the corpus itself contained the literal tokens "PAD" or "UNK",
# these assignments would overwrite their frequency-based index.
en_word_dict["PAD"] = PAD
en_word_dict["UNK"] = UNK

# Another dictionary to map indexes to tokens
en_idx_dict: Dict[int, str] = {
    index: token for token, index in en_word_dict.items()
}

In [11]:
# Look up the index of each token of the first English sentence; tokens that
# are not in the dictionary map to UNK
enidx: List[int] = [en_word_dict.get(token, UNK) for token in tokenized_en]

print(enidx)

[19, 25, 15, 1165, 804, 17, 57, 84, 334, 1329, 5]


In [12]:
# Map the indexes back to tokens; unknown indexes become "UNK"
entokens: List[str] = [en_idx_dict.get(index, "UNK") for index in enidx]
print(entokens)

# Join the tokens with single spaces, then remove the space in front of each
# of the listed punctuation characters
en_phrase: str = " ".join(entokens)
for x in """?:;.,'("-!&)%""":
    en_phrase = en_phrase.replace(" " + x, x)

print(en_phrase)

['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
Two young, White males are outside near many bushes.


In [13]:
# Do the same for German phrases: tokenize every sentence and wrap it in
# "BOS" / "EOS" markers
de_tokens: List[List[str]] = [
    ["BOS", *(tok.text for tok in de_tokenizer.tokenizer(x)), "EOS"]
    for x in trainde
]

# Tally how often each token occurs across all sentences
de_word_count = Counter()
for sentence in de_tokens:
    de_word_count.update(sentence)

# Keep at most the 50000 most frequent tokens; the two extra slots in the
# total account for the PAD and UNK entries added below.
defrequency: List[Tuple[str, int]] = de_word_count.most_common(50000)
total_de_words: int = 2 + len(defrequency)

# Token -> index (indexes 0 and 1 are reserved, so the most frequent token
# gets index 2)
de_word_dict: Dict[str, int] = {
    token: rank + 2 for rank, (token, _) in enumerate(defrequency)
}
# NOTE: if the corpus itself contained the literal tokens "PAD" or "UNK",
# these assignments would overwrite their frequency-based index.
de_word_dict["PAD"] = PAD
de_word_dict["UNK"] = UNK

# Index -> token
de_idx_dict: Dict[int, str] = {
    index: token for token, index in de_word_dict.items()
}

In [14]:
# Look up the index of each token of the first German sentence; tokens that
# are not in the dictionary map to UNK
deidx: List[int] = [de_word_dict.get(token, UNK) for token in tokenized_de]
print(deidx)

[21, 85, 257, 31, 87, 22, 94, 7, 16, 112, 5497, 3161, 4]


In [15]:
# Map the indexes back to tokens; unknown indexes become "UNK"
detokens: List[str] = [de_idx_dict.get(index, "UNK") for index in deidx]
print(detokens)

# Join the tokens with single spaces, then remove the space in front of each
# of the listed punctuation characters
de_phrase: str = " ".join(detokens)

for x in """?:;.,'("-!&)%""":
    de_phrase = de_phrase.replace(" " + x, x)

print(de_phrase)

['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.


### A Sequence Padding Function