# Chapter 6

## Initial Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
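# Installing these packages takes roughly two minutes. Uncomment the line below to run it;
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install tensorflow tqdm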

In [3]:
# Initial imports.
import os
import zipfile
import tiktoken
import torch

import pandas as pd

from pathlib import Path

from typing import List, Dict, Any

import urllib.request
from importlib.metadata import version

from torch.utils.data import Dataset

In [4]:
# Show the version of the packages we are using.
pkgs = [
    "matplotlib",  # Plotting library.
    "numpy",       # PyTorch & TensorFlow dependency.
    "tiktoken",    # Tokenizer.
    "torch",       # Deep learning library.
    "tensorflow",  # For OpenAI's pretrained weights.
    "pandas",      # Dataset loading.
]

# One "<package> version: <installed version>" line per entry.
for p in pkgs:
    print(p, "version:", version(p))

matplotlib version: 3.10.0
numpy version: 1.26.3
tiktoken version: 0.8.0
torch version: 2.5.1+cpu
tensorflow version: 2.19.0
pandas version: 2.2.3


## Preparing the Dataset

In [5]:
# Primary download location of the zipped dataset.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# File the downloaded archive is written to (a relative path, so it lands in the working directory).
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
extracted_path = "/llm_app/notebooks/build_large_language_models_from_scratch/sms_spam_collection"
# Final location of the dataset file once it has been given a .tsv extension.
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

In [6]:
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """
    Listing 6.1 Downloading and unzipping the dataset.

    Downloads the archive at ``url`` to ``zip_path``, extracts it into
    ``extracted_path`` and renames the extracted ``SMSSpamCollection`` file to
    ``data_file_path``. Nothing is downloaded when ``data_file_path`` already exists.

    Args:
        url: Location of the zip archive (anything ``urllib.request.urlopen`` accepts).
        zip_path: File the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into (``str`` or ``Path``).
        data_file_path: Final path of the dataset file (``str`` or ``Path``).

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the download fails (includes ``HTTPError``).
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
        OSError: If the extracted file cannot be renamed.
    """
    import shutil

    # Accept plain strings as well as Path objects for both locations.
    extracted_path = Path(extracted_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file. Stream it to disk in chunks instead of holding
    # the whole response body in memory.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

    # Unzipping the file.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension.
    original_file_path = extracted_path / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

In [7]:
# Attempt the primary URL; on a network-level failure, retry from the backup mirror.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (TimeoutError, urllib.error.URLError, urllib.error.HTTPError) as download_error:
    print(f"Primary URL failed: {download_error}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

/llm_app/notebooks/build_large_language_models_from_scratch/sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [8]:
# The file is tab-separated and has no header row, so the column names are given here.
# NOTE(review): with pandas' default quoting, double quotes inside a message are
# treated as field quotes, which may merge rows — confirm the row count against
# the dataset's documentation.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)

# Show it.
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
# Number of rows per label.
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [10]:
# Same counts expressed as a percentage of all rows.
print(df["Label"].value_counts() / df.shape[0] * 100)

Label
ham     86.593683
spam    13.406317
Name: count, dtype: float64


In [11]:
def create_balanced_dataset(df):
    """
    Listing 6.2 Creating a balanced dataset.

    Downsamples the larger of the two classes so that "ham" and "spam" have
    the same number of rows.

    Args:
        df: DataFrame with a "Label" column holding "ham" / "spam".

    Returns:
        A new DataFrame with the ham rows first, followed by the spam rows.
        The original index values are kept.
    """

    spam_df = df[df["Label"] == "spam"]
    ham_df = df[df["Label"] == "ham"]

    # Size of the smaller class; both classes are reduced to this many rows.
    target_size = min(len(spam_df), len(ham_df))

    # Randomly sample "ham" instances (fixed seed for reproducibility).
    ham_subset = ham_df.sample(target_size, random_state=123)

    # "spam" is kept untouched when it is already the smaller class, so the
    # usual spam-is-minority case yields exactly the rows it always did.
    if len(spam_df) == target_size:
        spam_subset = spam_df
    else:
        spam_subset = spam_df.sample(target_size, random_state=123)

    # Combine the "ham" subset with "spam".
    balanced_df = pd.concat([ham_subset, spam_subset])

    return balanced_df

In [12]:
# Build the class-balanced frame and confirm both labels now have the same count.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [13]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-encoded labels is a no-op instead of turning them into NaN.
balanced_df["Label"] = balanced_df["Label"].map(
    lambda label: {"ham": 0, "spam": 1}.get(label, label)
)

In [14]:
def random_split(df, train_frac, validation_frac):
    """
    Listing 6.3 Splitting the dataset.

    Shuffles the rows with a fixed seed, then cuts the result into three
    consecutive parts: train, validation and test (whatever remains).

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training part.
        validation_frac: Fraction of rows assigned to the validation part.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.
    """

    # Shuffle all rows and renumber them from zero.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    # Row positions at which the train and validation parts stop.
    train_stop = int(total_rows * train_frac)
    validation_stop = train_stop + int(total_rows * validation_frac)

    return (
        shuffled.iloc[:train_stop],
        shuffled.iloc[train_stop:validation_stop],
        shuffled.iloc[validation_stop:],
    )

In [15]:
# 70% of the rows for training, 10% for validation; the remaining rows form the test set.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1) 

In [16]:
# Write each split to its own CSV file, without the index column.
# NOTE(review): absolute, machine-specific directory — adjust when running elsewhere.
train_df.to_csv("/llm_app/notebooks/build_large_language_models_from_scratch/train.csv", index=None)
validation_df.to_csv("/llm_app/notebooks/build_large_language_models_from_scratch/validation.csv", index=None)
test_df.to_csv("/llm_app/notebooks/build_large_language_models_from_scratch/test.csv", index=None)

In [17]:
# Label distribution of the training split.
print(train_df["Label"].value_counts())

Label
0    528
1    517
Name: count, dtype: int64


In [18]:
# Label distribution of the validation split.
print(validation_df["Label"].value_counts())

Label
1    79
0    70
Name: count, dtype: int64


In [19]:
# Label distribution of the test split.
print(test_df["Label"].value_counts())

Label
1    151
0    149
Name: count, dtype: int64


## Creating Data Loaders

In [20]:
# Load the "gpt2" encoding and tokenize a sample sentence.
# `allowed_special` permits the literal "<|endoftext|>" marker in the input; in the
# printed ids it appears as 50256 at both ends.
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|> Parrots are green because they descend from dinossaurs <|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256, 2547, 24744, 389, 4077, 780, 484, 15350, 422, 16278, 793, 64, 1834, 220, 50256]


In [21]:
# Inspect the tokenizer's class.
type(tokenizer)

tiktoken.core.Encoding

In [22]:
class SpamDataset(Dataset):
    
    """
    Listing 6.4 Setting up a Pytorch Dataset class.
    """
    
    def __init__(
            self, 
            csv_file: str, 
            tokenizer: tiktoken.core.Encoding, 
            max_length: int | None = None, 
            pad_token_id: int = 50256
        ) -> None:

        # Read the CSV file.
        self.data: pd.DataFrame = pd.read_csv(csv_file)

        # Pre-tokenize texts.
        self.encoded_texts: List[List[int]] = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        # Set the largest encoded length if not provided.
        if max_length is None:
            self.max_length: int = self._longest_encoded_length()
       
        # If max_length is provided, set it.
        else:
            self.max_length: int = max_length
       
            # Truncate sequences if they are longer than max_length.
            self.encoded_texts: List[List[int]] = [
                encoded_text[:self.max_length] for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence.
        self.encoded_texts: List[List[int]] = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self) -> int:
        
        return len(self.data)

    def _longest_encoded_length(self) -> int:
        """
        Note: A more pythonic version to implement this method
        is the following, which is also used in the next chapter:
        return max(len(encoded_text) for encoded_text in self.encoded_texts)
        """

        max_length = 0
        for encoded_text in self.encoded_texts:
            
            encoded_length: int = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        
        return max_length

In [23]:
# Read the training split from the same absolute location it was written to,
# so this cell does not depend on the kernel's working directory.
# max_length=None makes the dataset use the longest encoded training text.
train_dataset = SpamDataset(
    csv_file="/llm_app/notebooks/build_large_language_models_from_scratch/train.csv",
    max_length=None,
    tokenizer=tokenizer
)

# Show the largest encoded length.
print(train_dataset.max_length)

120


In [24]:
# Validation and test texts are truncated/padded to the training set's
# sequence length so that all three splits share one input size.
val_dataset = SpamDataset(
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
    csv_file="/llm_app/notebooks/build_large_language_models_from_scratch/validation.csv",
)
test_dataset = SpamDataset(
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
    csv_file="/llm_app/notebooks/build_large_language_models_from_scratch/test.csv",
)