# Chapter 7

## Initial Setup

In [1]:
# Uncomment to install TensorFlow (used below for OpenAI's pretrained weights).
# %pip install tensorflow

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Initial imports.
import sys
import torch

from typing import List, Dict, Tuple

from functools import partial
from torch.utils.data import DataLoader

In [4]:
# Directory that is added to the import search path so that `ancillar` can be imported below.
# NOTE(review): this is a hard-coded absolute path — adjust it to your own checkout location.
ancillar_path: str = "/llm_app/learning/build_large_language_models_from_scratch/"

# Append only when missing, so re-running this cell does not add duplicate entries.
if ancillar_path not in sys.path:
    sys.path.append(ancillar_path)

# Deliberately imported here rather than with the other imports: it has to run
# after the sys.path change in this cell.
import ancillar as aux

In [5]:
import tiktoken

from importlib.metadata import version

# Distributions whose installed versions are printed for reference.
pkgs = [
    "numpy",  # PyTorch & TensorFlow dependency
    "matplotlib",  # Plotting library
    "tiktoken",  # Tokenizer
    "torch",  # Deep learning library
    "tqdm",  # Progress bar
    "tensorflow",  # For OpenAI's pretrained weights
]
# `version` raises importlib.metadata.PackageNotFoundError for a package that is not installed.
for p in pkgs:
    print(f"{p} version: {version(p)}")

numpy version: 1.26.3
matplotlib version: 3.10.0
tiktoken version: 0.8.0
torch version: 2.5.1+cpu
tqdm version: 4.67.1
tensorflow version: 2.20.0


## Preparing a Dataset for Supervised Instruction Fine-tuning

In [6]:
# Local path for the instruction dataset, relative to the notebook's working directory.
file_path = "./instruction-data.json"
# Raw GitHub URL of the same JSON file (adjacent string literals are concatenated).
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# NOTE(review): presumably downloads to `file_path` when needed and returns the parsed
# JSON — confirm in `ancillar`. The result supports len() and integer indexing.
data = aux.download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [7]:
# Inspect an entry whose 'input' field is non-empty.
print(">>> Example entry:\n", data[50])

>>> Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [8]:
# Inspect an entry whose 'input' field is an empty string.
print(">>> Another example entry:\n", data[999])

>>> Another example entry:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [9]:
# Prompt text for an entry that has a non-empty 'input' field, followed by
# the response section that holds the expected answer.
model_input = aux.format_input(data[50])
desired_response = "\n\n### Response:\n" + data[50]["output"]

# Emit both parts back to back, exactly as one concatenated string.
print(model_input, desired_response, sep="")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [10]:
# Same formatting for an entry whose 'input' field is empty; the printed
# prompt then has no "### Input:" section.
model_input = aux.format_input(data[999])
desired_response = "\n\n### Response:\n" + data[999]["output"]

# Emit both parts back to back, exactly as one concatenated string.
print(model_input, desired_response, sep="")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


In [11]:
# Split sizes: 85% for training, 10% for testing, and whatever remains
# (about 5%) for validation.
train_portion = int(0.85 * len(data))
test_portion = int(0.1 * len(data))
val_portion = len(data) - (train_portion + test_portion)

# Three contiguous, non-overlapping slices in train -> test -> validation order.
train_data = data[:train_portion]
test_data = data[train_portion:][:test_portion]
val_data = data[len(data) - val_portion :]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 935
Validation set length: 55
Test set length: 110


## Organizing Data into Training Batches

In [12]:
# GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# "<|endoftext|>" is passed via `allowed_special` so it is encoded as a special
# token; the output below shows that it maps to the single ID 50256.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [13]:
# Three toy token-ID sequences of different lengths (5, 2 and 3 items).
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

# Group them into one tuple, used as the batch for the collate demos.
batch = (inputs_1, inputs_2, inputs_3)

In [14]:
# First collate draft: in the printed tensor, the shorter sequences are padded
# with 50256 up to the length of the longest sequence in the batch.
print(aux.custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [15]:
# Second collate draft: returns an (inputs, targets) pair. In the printed
# result each target row is the input row shifted one position to the left,
# with 50256 filling the end.
inputs, targets = aux.custom_collate_draft_2(batch)

# Print both tensors separated by one blank line.
print(inputs, targets, sep="\n\n")

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])

tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [16]:
# Final collate function: in the printed targets, every padding position after
# the first 50256 of a row is replaced by -100.
inputs, targets = aux.custom_collate_fn(batch)

# Print both tensors separated by one blank line.
print(inputs, targets, sep="\n\n")

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])

tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [17]:
# Two examples with two class scores each.
logits_1 = torch.tensor([[-1.0, 1.0], [-0.5, 1.5]])

# Index of the correct class for each example.
targets_1 = torch.tensor([0, 1])

# Cross-entropy averaged over both examples.
loss_1 = torch.nn.functional.cross_entropy(input=logits_1, target=targets_1)

print(loss_1)

tensor(1.1269)


In [18]:
# Same two examples as the previous demo plus a third row that repeats the second.
logits_2 = torch.tensor([[-1.0, 1.0], [-0.5, 1.5], [-0.5, 1.5]])

# The extra example is labelled with class 1, so the averaged loss changes.
targets_2 = torch.tensor([0, 1, 1])
loss_2 = torch.nn.functional.cross_entropy(input=logits_2, target=targets_2)

print(loss_2)

tensor(0.7936)


In [19]:
# Reuse the three-row logits, but label the third example with -100.
# -100 is cross_entropy's default `ignore_index` (spelled out here for clarity):
# that example is left out of the loss and of the average.
targets_3 = torch.tensor([0, 1, -100])
loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3, ignore_index=-100)

print(loss_3)
# Compare with a tolerance rather than `==`: the two losses come from
# different-sized inputs, and exact floating-point equality is not guaranteed.
print("loss_1 == loss_3:", torch.isclose(loss_1, loss_3))

tensor(1.1269)
loss_1 == loss_3: tensor(True)


## Creating Data Loaders for an Instruction Dataset

In [20]:
# Choose the compute device: CUDA first, then MPS (Apple Silicon), else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif not torch.backends.mps.is_available():
    # Neither CUDA nor MPS is available.
    device = torch.device("cpu")
else:
    # MPS is only used with PyTorch 2.9 or newer (for stable mps results);
    # older versions fall back to the CPU.
    major, minor = (int(part) for part in torch.__version__.split(".")[:2])
    device = torch.device("mps" if (major, minor) >= (2, 9) else "cpu")

print("Device:", device)

Device: cpu


In [21]:
# Display the selected device object as the cell's output.
device

device(type='cpu')

In [22]:
# Pre-bind `device` and `allowed_max_length`, so the DataLoaders below can pass this
# callable as `collate_fn` without supplying those keyword arguments themselves.
# NOTE(review): 1024 presumably caps the sequence length at the model's context
# size — confirm in `aux.custom_collate_fn`.
customized_collate_fn = partial(
    aux.custom_collate_fn,
    device=device,
    allowed_max_length=1024
)

print(customized_collate_fn)

functools.partial(<function custom_collate_fn at 0x7f28241d7740>, device=device(type='cpu'), allowed_max_length=1024)


In [23]:
# Number of DataLoader worker processes, shared by all three loaders below.
# You can try to increase this number if parallel Python processes are
# supported by your operating system.
num_workers = 0

# Batch size for data loaders.
batch_size = 8

In [24]:
# Wrap the training split in the project's dataset class.
# NOTE(review): InstructionDataset presumably tokenizes each entry with `tokenizer` —
# confirm in `ancillar`.
train_dataset = aux.InstructionDataset(train_data, tokenizer)

# Training loader: shuffled, and the last incomplete batch is dropped so every
# batch holds exactly `batch_size` examples.
# The dedicated, seeded generator makes the shuffle order reproducible on
# "Restart Kernel -> Run All" without touching PyTorch's global RNG state.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
    generator=torch.Generator().manual_seed(123),
)

In [25]:
# Validation split wrapped in the same dataset class as the training data.
val_dataset = aux.InstructionDataset(val_data, tokenizer)
# Evaluation loader: no shuffling and no dropped batch, so every validation
# example is visited once, in a fixed order.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [26]:
# Test split wrapped in the same dataset class as the training data.
test_dataset = aux.InstructionDataset(test_data, tokenizer)
# Evaluation loader: no shuffling and no dropped batch, so every test example
# is visited once, in a fixed order.
# Uses the `DataLoader` imported from torch.utils.data, like the training and
# validation loaders, instead of reaching for it through the `aux` module.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [27]:
# Each batch is an (inputs, targets) pair of tensors.
example_batch: Tuple[torch.Tensor, torch.Tensor]

# Walk through the first 9 batches of a single pass over the training loader.
# `zip` creates the loader's iterator once and stops as soon as `range(9)` is
# exhausted, so the loader is not rebuilt and reshuffled on every iteration.
for _, example_batch in zip(range(9), train_loader):

    # Unpack the batch into inputs and targets.
    inputs, targets = example_batch

    # Sequence length differs between batches, while inputs and targets of
    # one batch share the same shape.
    print(inputs.shape, targets.shape)

torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 70]) torch.Size([8, 70])


In [28]:
# Token IDs of the last sequence in the most recently drawn batch.
inputs[-1]

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 22743,   262,  1708, 23491,  7457,    13,
          198,   198, 21017, 23412,    25,   198,  2990,  3382,   284,   467,
          284,   262,  6918,    11,   475,   484,   423,   645,  1637,    13,
          198,   198, 21017, 18261,    25,   198,  2990,   765,   284,   467,
          284,   262,  6918,    11,   475,   484,   423,   645,  1637,    13])

In [29]:
# Matching targets: in the output below they are the input IDs shifted one
# position to the left, with 50256 as the final element.
targets[-1]

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431, 32543,   262,  2581,    13,   198,   198, 21017,
        46486,    25,   198, 22743,   262,  1708, 23491,  7457,    13,   198,
          198, 21017, 23412,    25,   198,  2990,  3382,   284,   467,   284,
          262,  6918,    11,   475,   484,   423,   645,  1637,    13,   198,
          198, 21017, 18261,    25,   198,  2990,   765,   284,   467,   284,
          262,  6918,    11,   475,   484,   423,   645,  1637,    13, 50256])

## Loading a Pretrained LLM