<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# The Main Data Loading Pipeline Summarized

The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).

This notebook contains the main takeaway of the chapter: the complete data loading pipeline, without the intermediate steps.

Packages that are being used in this notebook:

In [9]:
# NBVAL_SKIP
from importlib.metadata import version

# Report the installed version of each distribution this notebook relies on.
for label, dist_name in (
    ("torch version:", "torch"),
    ("tiktoken version:", "tiktoken"),
):
    print(label, version(dist_name))

torch version: 2.6.0+cu124
tiktoken version: 0.9.0


In [10]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs built from one text.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    across the token sequence in steps of ``stride``. For every window position
    the sample is ``(input_ids, target_ids)``, where ``target_ids`` is the same
    window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a sequence of integer token IDs.
        max_length: Number of tokens in every input/target window. Must be > 0.
        stride: Distance (in tokens) between the starts of consecutive windows.
            Must be > 0.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive, or if the
            tokenized text has fewer than ``max_length + 1`` tokens (so not even
            one full input/target pair can be formed).
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject window parameters that would otherwise yield empty or
        # misaligned samples instead of a clear error.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Fail fast: a too-short text would otherwise silently produce an
        # empty dataset.
        if len(token_ids) <= max_length:
            raise ValueError(
                "Number of tokenized inputs must be at least max_length + 1 "
                f"(got {len(token_ids)} tokens for max_length={max_length})"
            )

        # Use a sliding window to chunk the text into sequences of max_length.
        # The range stops at len(token_ids) - max_length so that the shifted
        # target window never runs past the end of the token sequence.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    The text is tokenized with the tiktoken ``"gpt2"`` encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are passed straight to
    ``torch.utils.data.DataLoader``.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Number of samples per batch.
        max_length: Token length of each input/target window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    windows = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Read the whole text file into a single in-memory string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Sizes of the two embedding tables created below.
# vocab_size: number of rows in the token table; presumably the vocabulary size
#   of the "gpt2" tokenizer used by create_dataloader_v1 -- TODO confirm.
# output_dim: width of every embedding vector (same for both tables, so the
#   token and positional embeddings can be added element-wise).
# context_length: number of rows in the positional table, i.e. the largest
#   number of positions it can index.
vocab_size = 50257
output_dim = 256
context_length = 1024


# NOTE(review): no random seed is set in the visible cells, so the layer weights
# (and the shuffled batch order) are expected to differ from run to run.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Batches of 8 samples, each a window of 4 tokens.
batch_size = 8
max_length = 4
# stride == max_length: consecutive windows start max_length tokens apart, so
# the input windows do not overlap. shuffle and drop_last keep their defaults
# (both True).
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length
)

In [11]:
# Embed only the first batch: the loop body runs once and then exits.
for batch in dataloader:
    x, y = batch  # input windows and their one-token-shifted targets

    # One positional vector per slot 0 .. max_length - 1 of the window.
    position_ids = torch.arange(max_length)
    pos_embeddings = pos_embedding_layer(position_ids)
    token_embeddings = token_embedding_layer(x)

    # The same positional vectors are added to every sequence in the batch.
    input_embeddings = token_embeddings + pos_embeddings
    break

In [13]:
# Show the shape first, then the values, each on its own line.
print(input_embeddings.shape, input_embeddings, sep="\n")

torch.Size([8, 4, 256])
tensor([[[-1.2851,  0.5053,  0.0845,  ..., -1.5528,  1.6002, -0.2618],
         [-0.1642, -1.0659,  1.0942,  ..., -0.1391,  0.2748, -1.6291],
         [ 0.2614, -0.8781, -0.0823,  ...,  1.1031, -0.8093, -0.4124],
         [ 0.3019, -0.1343, -1.9968,  ..., -0.2260, -1.3307, -1.4124]],

        [[-2.0559, -0.5012, -0.7567,  ...,  1.1022, -0.2524, -0.8624],
         [ 2.2392, -1.6364, -2.7276,  ..., -0.9090,  0.1952, -0.7978],
         [-0.0074, -2.0662,  0.3107,  ..., -0.5439,  2.0469, -1.1617],
         [ 0.1791, -1.3386,  0.2196,  ..., -2.7103,  0.3729, -1.2332]],

        [[-1.9953,  0.2921,  0.8364,  ...,  0.8426, -0.6159,  0.2916],
         [-0.0621,  0.4292, -0.5617,  ...,  0.9181, -0.3404, -0.9362],
         [ 0.3237,  1.1910, -0.6827,  ..., -1.0069,  2.0160, -0.7648],
         [ 1.8451,  1.3011, -1.4160,  ...,  0.2983, -1.8808, -1.3800]],

        ...,

        [[-1.4172, -1.8281,  0.1171,  ..., -1.0960,  1.7749, -1.2157],
         [ 1.2330, -0.4761, -0.54