<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# The Main Data Loading Pipeline Summarized

The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).

This notebook contains the main takeaway, the data loading pipeline without the intermediate steps.

Packages that are being used in this notebook:

In [1]:
# NBVAL_SKIP
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.4.0
tiktoken version: 0.7.0


In [3]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataloader

# reading the-verdict text to raw_text
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Encoding the raw_text
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

vocab_size = 50257
output_dim = 256
context_length = 1024

# torch.nn.Embeddings(vocab_size, output_dim)
# Creates a lookup table for mapping discrete input values top dense vector representations (embeddings)
# These embeddings capture semantic relationships and similarities ebtween the input values.
# Input = tensor of integer indices, where each index corresponds to a specific input value.
# Output = each vector correspopnds to the embedding of the input value at the respective index.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)

In [4]:
for batch in dataloader:
    x, y = batch
    print(f"x: {x}")
    print(f"y: {y}")

x: tensor([[ 4544,  9325,   701,     8],
        [  286,   262,  5977,   373],
        [ 1309,   502,   572,   438],
        [  508, 40987,  1637,   508],
        [  470,   765,   284,   467],
        [  257, 14093,   526,   198],
        [  340,  1392,   284,   307],
        [19713, 14676,    25,  9675]])
y: tensor([[ 9325,   701,     8,   373],
        [  262,  5977,   373, 29178],
        [  502,   572,   438,   392],
        [40987,  1637,   508,   651],
        [  765,   284,   467,   319],
        [14093,   526,   198,   198],
        [ 1392,   284,   307,  2081],
        [14676,    25,  9675,   284]])
x: tensor([[32796,  2637,  3244,   465],
        [  290,  7342,   502,    11],
        [ 2259,    26,   290,   673],
        [  389,   262, 33204,   345],
        [  257,  1310,  4295,   438],
        [ 7787,   673,  3636,   470],
        [  281, 22037,   286,   262],
        [ 1302,   994,   345,   460]])
y: tensor([[ 2637,  3244,   465,  6283],
        [ 7342,   502,    11,   262

In [3]:
for batch in dataloader:
    # x = input
    # y = the output aka the batch of predicted tokens
    x, y = batch
    
    token_embeddings = token_embedding_layer(x)

    # torch.arrage(x) returns a PyTorch tensor instead of list
    # x = torch.arange(10)
    # print(x)  # Output: tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    input_embeddings = token_embeddings + pos_embeddings

    break

In [4]:
print(input_embeddings.shape)

torch.Size([8, 4, 256])
