# Transformer Model

In [None]:
# pytorch
import torch

# torchvision
import torchvision


# torchinfo
%pip install torchinfo
%pip install torchmetrics

# standard data handling

# plotting

# image

# system
from pathlib import Path

import requests

# timing and printing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Check the installed PyTorch and CUDA (GPU) versions


In [None]:
# installed pytorch and torchvision versions, one per line
# (a "+cuXXX" suffix in the version string names the CUDA build)
print(torch.__version__, torchvision.__version__, sep="\n")

2.0.1+cu118
0.15.2+cu118


### Check the available device

1. CPU (Default)
2. Cuda (GPU acceleration is accessible)


In [None]:
# device agnostic setup: use the GPU when CUDA is usable, otherwise stay on the cpu
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Available device is: {device}")

Available device is: cpu


### Import Helper files


In [None]:
# local filename -> raw URL of every helper module and of the training text
filenames = {
    "pytorch_helper_functions.py": "https://raw.githubusercontent.com/sl2000stat/PytorchIntroduction/master/pytorch_helper_functions.py",
    "training.py": "https://raw.githubusercontent.com/sl2000stat/PytorchIntroduction/master/training.py",
    "make_predictions.py": "https://raw.githubusercontent.com/sl2000stat/PytorchIntroduction/master/make_predictions.py",
    "validation.py": "https://raw.githubusercontent.com/sl2000stat/PytorchIntroduction/master/validation.py",
    "visualizing_images.py": "https://raw.githubusercontent.com/sl2000stat/PytorchIntroduction/master/visualizing_images.py",
    # get the data as text file
    "input.txt": "https://raw.githubusercontent.com/karpathy/ng-video-lecture/master/input.txt",
}

for filename, file_path in filenames.items():
    # files already on disk are not fetched again
    if Path(filename).is_file():
        print(f"{filename} already exists. Skipping download")
        continue

    # the timeout keeps a stalled connection from hanging the notebook
    response = requests.get(file_path, timeout=30)

    # fail loudly on 4xx/5xx: otherwise the error page would be saved under
    # `filename` and the "already exists" check would skip it on every re-run
    response.raise_for_status()

    with open(filename, "wb") as f:
        f.write(response.content)

    print(f"Downloaded {filename}.")

pytorch_helper_functions.py already exists. Skipping download
training.py already exists. Skipping download
make_predictions.py already exists. Skipping download
validation.py already exists. Skipping download
visualizing_images.py already exists. Skipping download
input.txt already exists. Skipping download


### Set the global Seed


In [None]:
# imported here rather than in the top import cell: pytorch_helper_functions.py
# is one of the files fetched by the download loop, so it may not exist on disk
# before that loop has run
from pytorch_helper_functions import set_global_seed


# set the global seed to 42 through the downloaded helper
# NOTE(review): which random number generators the helper seeds is not visible
# in this notebook — confirm in pytorch_helper_functions.py
set_global_seed(42)

### Replicate the Transformer Model from the Original Paper: "Attention Is All You Need"


### 1. Get the Data


In [None]:
# read the whole training corpus into memory as one utf-8 string
text = Path("input.txt").read_text(encoding="utf-8")

In [None]:
# vocabulary: every distinct character of the text, in sorted order
# (sorted() accepts the set directly and returns a list)
chars = sorted(set(text))

# vocab size = number of distinct characters
vocab_size = len(chars)

### 1.2 Encode the Text Data


In [None]:
# lookup tables: character -> integer id (stoi) and integer id -> character (itos)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Args:
        s: string to encode; every character must be a key of ``stoi``.

    Returns:
        list of ints, ``stoi[c]`` for each character ``c`` of ``s`` in order.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer ids back into a string.

    Args:
        l: iterable of integer ids; every id must be a key of ``itos``.

    Returns:
        the string formed by joining ``itos[i]`` for each id ``i`` in order.

    Raises:
        KeyError: if ``l`` contains an id that is not in ``itos``.
    """
    return "".join(itos[i] for i in l)


# print some examples (round trip: encode, then decode back)
print(encode("Hello There"))
print(decode(encode("Hello There")))

[20, 43, 50, 50, 53, 1, 32, 46, 43, 56, 43]
Hello There


### 1.3 Split the Text Data into Training and Validation Sets


In [None]:
# encode the full text and hold it as a tensor of int64 ids
data = torch.tensor(encode(text), dtype=torch.long)

# training / validation split: the first 90% is train, the remainder is val
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# report the size of both splits
print(f"Shape of Train Data: {train_data.shape} | Shape of Val Data: {val_data.shape}")

Shape of Train Data: torch.Size([1003854]) | Shape of Val Data: torch.Size([111540])


### 1.4 Batching The Data


In [None]:
def get_batch(data, BATCH_SIZE: int, BLOCK_SIZE: int, target_device=None):
    """Sample a random batch of input windows and their shifted-by-one targets.

    Every row of ``x`` is a contiguous window of ``BLOCK_SIZE`` elements taken
    from ``data`` at a random start offset. The matching row of ``y`` is the
    same window moved one position to the right, so ``y[b, t]`` is the element
    that follows ``x[b, t]`` in ``data``.

    Args:
        data: sequence to sample from. It is sliced and the slices are passed
            to ``torch.stack``, so a 1-D tensor is expected.
        BATCH_SIZE: number of windows (rows) in the batch.
        BLOCK_SIZE: length of each window.
        target_device: device the batch is moved to. When left as ``None`` the
            notebook-level ``device`` variable is used.

    Returns:
        tuple ``(x, y)`` of stacked windows, both moved to the target device;
        for 1-D ``data`` each has shape ``(BATCH_SIZE, BLOCK_SIZE)``.

    Raises:
        ValueError: if ``data`` has no more than ``BLOCK_SIZE`` elements, i.e.
            there is no room for a window plus its shifted target.
    """
    if len(data) <= BLOCK_SIZE:
        raise ValueError(
            f"data must contain more than BLOCK_SIZE={BLOCK_SIZE} elements, "
            f"got {len(data)}"
        )

    # fall back to the notebook-wide device only when the caller gave none
    if target_device is None:
        target_device = device

    # random start offsets; the exclusive upper bound len(data) - BLOCK_SIZE
    # guarantees the target window (shifted by one) still fits inside data
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE,)).tolist()
    x = torch.stack([data[i : i + BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i + 1 : i + BLOCK_SIZE + 1] for i in ix])

    # send the data to the device
    return x.to(target_device), y.to(target_device)

In [None]:
# batching hyperparameters
BATCH_SIZE = 64  # number of independent sequences processed in parallel
BLOCK_SIZE = 256  # maximum context length for predictions