# Chapter 6

## Initial Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# More or less two minutes to install these packages.
!pip install tensorflow tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Initial imports.
import os
import sys
import zipfile
import tiktoken
import torch

import pandas as pd

from pathlib import Path

from typing import List, Dict, Any

import urllib.request
from importlib.metadata import version

from torch.utils.data import Dataset, DataLoader

In [4]:
# Directory that contains the local `ancillar` helper module.
ancillar_path = "/llm_app/notebooks/build_large_language_models_from_scratch/"

# Make that directory importable; the membership check avoids adding a
# duplicate entry when this cell is re-run.
if ancillar_path not in sys.path:
    sys.path.append(ancillar_path)

# Helper module, referenced as `aux` in the cells below.
import ancillar as aux

In [5]:
# Report the installed version of every package this notebook relies on.
pkgs = [
    "matplotlib",  # Plotting library.
    "numpy",       # PyTorch & TensorFlow dependency.
    "tiktoken",    # Tokenizer.
    "torch",       # Deep learning library.
    "tensorflow",  # For OpenAI's pretrained weights.
    "pandas",      # Dataset loading.
]

# One "<package> version: <x.y.z>" line per package, in list order.
version_report = [f"{pkg_name} version: {version(pkg_name)}" for pkg_name in pkgs]
print("\n".join(version_report))

matplotlib version: 3.10.0
numpy version: 1.26.3
tiktoken version: 0.8.0
torch version: 2.5.1+cpu
tensorflow version: 2.19.0
pandas version: 2.2.3


## Preparing the Dataset

In [6]:
# Primary download location of the zipped dataset.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# File the downloaded archive is written to (relative to the working directory).
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "/llm_app/notebooks/build_large_language_models_from_scratch/sms_spam_collection"
# Final location of the dataset file once it has been given a .tsv extension.
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

In [7]:
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """
    Listing 6.1 Downloading and unzipping the dataset.

    Downloads the archive at ``url`` to ``zip_path``, extracts it into
    ``extracted_path`` and renames the extracted ``SMSSpamCollection`` file
    to ``data_file_path``. Nothing is downloaded or extracted when
    ``data_file_path`` already exists.

    Args:
        url: Location of the zip archive.
        zip_path: Local path (str or Path) the archive is written to.
        extracted_path: Directory (str or Path) the archive is extracted into.
        data_file_path: Final path (str or Path) of the dataset file.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded
            (propagated from ``urllib.request.urlopen``).
    """

    # Accept plain strings as well as Path objects, like the other path arguments.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file in chunks, so the archive is never held in memory as a whole.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        while chunk := response.read(64 * 1024):
            out_file.write(chunk)

    # Unzipping the file.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

In [8]:
# Try the primary location first and fall back to the mirror on network errors.
# The mirror address gets its own name so that `url` keeps pointing at the
# primary location; re-running this cell therefore behaves the same every time.
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"

try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    download_and_unzip_spam_data(backup_url, zip_path, extracted_path, data_file_path)

/llm_app/notebooks/build_large_language_models_from_scratch/sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [9]:
# The file is tab-separated and has no header row, so the column names are
# supplied explicitly.
# NOTE(review): pandas' default quoting rules apply here; if a message
# contains an unbalanced double quote, adjacent rows could be merged. Worth
# checking the row count against the dataset's documentation — TODO confirm.
df = pd.read_csv(
    data_file_path, sep="\t", header=None, names=["Label", "Text"]
)

# Show it.
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
# Number of messages per class.
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [11]:
# Share of each class as a percentage of all rows.
print(df["Label"].value_counts() / df.shape[0] * 100)

Label
ham     86.593683
spam    13.406317
Name: count, dtype: float64


In [12]:
def create_balanced_dataset(df, random_state=123):
    """
    Listing 6.2 Creating a balanced dataset.

    Downsamples the larger of the "ham" / "spam" classes so that both
    classes contribute the same number of rows.

    Args:
        df: DataFrame with a "Label" column holding "ham" / "spam".
        random_state: Seed used when sampling rows.

    Returns:
        A new DataFrame containing the sampled "ham" rows followed by the
        "spam" rows, with the original index values kept.
    """

    # Select each class once.
    spam_df = df[df["Label"] == "spam"]
    ham_df = df[df["Label"] == "ham"]

    # Both classes are reduced to the size of the smaller one.
    num_per_class = min(len(spam_df), len(ham_df))

    # Randomly sample "ham" instances to match the target size.
    ham_subset = ham_df.sample(num_per_class, random_state=random_state)

    # "spam" is only sampled when it is the larger class; otherwise its rows
    # are kept as they are, in their original order.
    if len(spam_df) > num_per_class:
        spam_subset = spam_df.sample(num_per_class, random_state=random_state)
    else:
        spam_subset = spam_df

    # Combine the "ham" subset with the "spam" subset.
    return pd.concat([ham_subset, spam_subset])

In [13]:
# Build the class-balanced frame and confirm both classes have the same size.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [14]:
# Encode the string labels as integers: "ham" -> 0, "spam" -> 1.
# Values that are not keys of the mapping (i.e. labels that are already 0/1)
# pass through unchanged, so re-running this cell does not turn the column
# into NaN the way a plain `.map(dict)` would.
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(
    lambda label: label_to_id.get(label, label)
)
assert balanced_df["Label"].isin([0, 1]).all(), "Unexpected value in 'Label' column."

In [15]:
def random_split(df, train_frac, validation_frac, random_state=123):
    """
    Listing 6.3 Splitting the dataset.

    Shuffles ``df`` and cuts it into consecutive train / validation / test
    parts. The test part receives every row that is left after the first two.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows (0..1) assigned to the training part.
        validation_frac: Fraction of rows (0..1) assigned to the validation part.
        random_state: Seed used for shuffling.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.

    Raises:
        ValueError: If a fraction lies outside [0, 1] or both fractions
            together exceed 1.
    """

    if not (0 <= train_frac <= 1 and 0 <= validation_frac <= 1):
        raise ValueError("train_frac and validation_frac must lie in [0, 1].")
    # Small tolerance so that float sums such as 0.7 + 0.3 are not rejected.
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError("train_frac + validation_frac must not exceed 1.")

    # Shuffle the entire DataFrame.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate split indices.
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame.
    train_df = df[:train_end]
    validation_df = df[train_end: validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

In [16]:
# 70% of the rows for training, 10% for validation; the remaining rows form the test set.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1) 

In [17]:
# Store the three splits next to the raw dataset. The file locations are
# derived from `extracted_path` instead of repeating the absolute directory.
split_dir = Path(extracted_path)
train_file_path = str(split_dir / "train.csv")
validation_file_path = str(split_dir / "validation.csv")
test_file_path = str(split_dir / "test.csv")

# `index=False` keeps the row index out of the CSV files.
train_df.to_csv(train_file_path, index=False)
validation_df.to_csv(validation_file_path, index=False)
test_df.to_csv(test_file_path, index=False)

In [18]:
# Class distribution of the training split.
print(train_df["Label"].value_counts())

Label
0    528
1    517
Name: count, dtype: int64


In [19]:
# Class distribution of the validation split.
print(validation_df["Label"].value_counts())

Label
1    79
0    70
Name: count, dtype: int64


In [20]:
# Class distribution of the test split.
print(test_df["Label"].value_counts())

Label
1    151
0    149
Name: count, dtype: int64


## Creating Data Loaders

In [21]:
# Load the "gpt2" encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
# "<|endoftext|>" is passed via `allowed_special` so that it may appear in
# the text being encoded.
print(tokenizer.encode(
    "<|endoftext|> Parrots are green because they descend from dinossaurs <|endoftext|>", allowed_special={"<|endoftext|>"}
))

[50256, 2547, 24744, 389, 4077, 780, 484, 15350, 422, 16278, 793, 64, 1834, 220, 50256]


In [22]:
# Inspect the tokenizer's class.
type(tokenizer)

tiktoken.core.Encoding

In [23]:
class SpamDataset(Dataset):
    
    """
    Listing 6.4 Setting up a Pytorch Dataset class.
    """
    
    def __init__(
            self, 
            csv_file: str, 
            tokenizer: tiktoken.core.Encoding, 
            max_length: int | None = None, 
            pad_token_id: int = 50256
        ) -> None:

        # Read the CSV file.
        self.data: pd.DataFrame = pd.read_csv(csv_file)

        # Pre-tokenize texts.
        self.encoded_texts: List[List[int]] = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        # Set the largest encoded length if not provided.
        if max_length is None:
            self.max_length: int = self._longest_encoded_length()
       
        # If max_length is provided, set it.
        else:
            self.max_length: int = max_length
       
            # Truncate sequences if they are longer than max_length.
            self.encoded_texts: List[List[int]] = [
                encoded_text[:self.max_length] for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence.
        self.encoded_texts: List[List[int]] = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self) -> int:
        
        return len(self.data)

    def _longest_encoded_length(self) -> int:
        """
        Note: A more pythonic version to implement this method
        is the following, which is also used in the next chapter:
        return max(len(encoded_text) for encoded_text in self.encoded_texts)
        """

        max_length = 0
        for encoded_text in self.encoded_texts:
            
            encoded_length: int = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        
        return max_length

In [24]:
# No `max_length` is given, so the dataset uses the length of the longest
# encoded training text as the common sequence length.
train_dataset = SpamDataset(
    csv_file=train_file_path,
    max_length=None,
    tokenizer=tokenizer
)

# Show the largest encoded length.
print(train_dataset.max_length)

120


In [25]:
# Validation and test data reuse the training set's sequence length, so all
# three splits are padded (or truncated) to the same length.
val_dataset = SpamDataset(
    csv_file=validation_file_path,
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file=test_file_path,
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [26]:
# Seed PyTorch's global random number generator.
torch.manual_seed(123)

num_workers: int = 0  # This setting ensures compatibility with most computers.
batch_size: int = 8

# Settings shared by all three loaders.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)

# Training data is shuffled, and an incomplete final batch is dropped.
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **loader_kwargs)

# Evaluation loaders keep the stored order and every sample.
val_loader = DataLoader(val_dataset, shuffle=False, drop_last=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False, **loader_kwargs)

In [27]:
# Iterate over the whole training loader once; after the loop the two
# variables hold the last batch, whose shapes are printed below.
for input_batch, target_batch in train_loader:
    pass

print(">>> Input batch dimensions:", input_batch.shape)
print(">>> Label batch dimensions:", target_batch.shape)

>>> Input batch dimensions: torch.Size([8, 120])
>>> Label batch dimensions: torch.Size([8])


In [28]:
# Number of batches each loader yields per pass.
print(f">>> {len(train_loader)} training batches ...")
print(f">>> {len(val_loader)} validation batches ...")
print(f">>> {len(test_loader)} test batches ...")

>>> 130 training batches ...
>>> 19 validation batches ...
>>> 38 test batches ...


## Initializing a Model with Pretrained Weights

In [29]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Size-specific hyperparameters of the available GPT-2 variants.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings common to every variant, followed by the entries of the chosen one.
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size.
    "context_length": 1024,  # Context length.
    "drop_rate": 0.0,        # Dropout rate.
    "qkv_bias": True,        # Query-key-value bias.
    **model_configs[CHOOSE_MODEL],
}

print(f">>> Base configuration:\n\t{BASE_CONFIG}")

# The padded sequences must fit into the model's context window.
assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
    f"Dataset length {train_dataset.max_length} exceeds model's context "
    f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
    f"`max_length={BASE_CONFIG['context_length']}`"
)

>>> Base configuration:
	{'vocab_size': 50257, 'context_length': 1024, 'drop_rate': 0.0, 'qkv_bias': True, 'emb_dim': 768, 'n_layers': 12, 'n_heads': 12}


In [30]:
# Helper script that provides `download_and_load_gpt2`.
# NOTE(review): the script is fetched from the moving `main` branch and then
# imported (i.e. executed); consider pinning the URL to a reviewed commit.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)

filename = url.split('/')[-1]

# Fetch the script only when no local copy exists, so that re-running the
# notebook works offline and does not silently replace the imported code.
if not Path(filename).exists():
    urllib.request.urlretrieve(url, filename)

from gpt_download import download_and_load_gpt2

2025-05-28 02:44:43.934942: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-28 02:44:43.940971: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-28 02:44:43.957132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748400283.987441     120 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748400283.996632     120 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748400284.020252     120 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [31]:
# Extract the size tag from the model name, e.g. "gpt2-small (124M)" -> "124M".
model_size = CHOOSE_MODEL.split(" ")[-1].strip("()")

# Download the weights (skipped for files that are already present) and load them.
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

# Build the model from the configuration and copy the pretrained weights into it.
model = aux.GPTModel(BASE_CONFIG)
aux.load_weights_into_gpt(model, params)

# In evaluation mode, dropout layers are disabled and the model
# behaves deterministically. The trailing `;` suppresses the cell output.
model.eval();

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


2025-05-28 02:44:54.783389: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154389504 exceeds 10% of free system memory.


In [32]:
# Sanity check of the loaded weights: let the model continue a short prompt.
text_1 = "Every effort moves you"

token_ids = aux.generate_text_simple(
    model=model,
    idx=aux.text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)

print(aux.token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [33]:
# Ask the pretrained model to classify a message through a plain instruction
# prompt. In the recorded output it only repeats the prompt text instead of
# answering 'yes' or 'no'.
text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)

token_ids = aux.generate_text_simple(
    model=model,
    idx=aux.text_to_token_ids(text_2, tokenizer),
    max_new_tokens=75,
    context_size=BASE_CONFIG["context_length"]
)

print(aux.token_ids_to_text(token_ids, tokenizer))

Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $


## Adding a Classification Head