# Chapter 6

## Initial Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Installing these packages takes roughly two minutes.
# !pip install tensorflow tqdm

In [None]:
from importlib.metadata import PackageNotFoundError, version

# Distributions this chapter relies on; each one is looked up by its
# installed distribution name.
pkgs = [
    "matplotlib",  # Plotting library.
    "numpy",       # PyTorch & TensorFlow dependency.
    "tiktoken",    # Tokenizer.
    "torch",       # Deep learning library.
    "tensorflow",  # For OpenAI's pretrained weights.
    "pandas"       # Dataset loading.
]

# Report every package, even when some are missing: one absent distribution
# should not abort the whole report.
for p in pkgs:
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        print(f"{p} is not installed")

In [None]:
# Initial imports.
import os
import zipfile
import tiktoken
import urllib.request

import pandas as pd

from pathlib import Path

## Preparing the Dataset

In [None]:
# Where the archive comes from and where it is stored locally.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"

# Directory the archive is unpacked into, and the renamed data file inside it.
extracted_path = "/llm_app/notebooks/build_large_language_models_from_scratch/sms_spam_collection"
data_file_path = Path(extracted_path).joinpath("SMSSpamCollection.tsv")

In [None]:
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """
    Listing 6.1 Downloading and unzipping the dataset.

    Download the archive at ``url`` to ``zip_path``, extract it into
    ``extracted_path`` and rename the extracted ``SMSSpamCollection`` file to
    ``data_file_path``. Nothing is downloaded when ``data_file_path`` already
    exists.

    Args:
        url: Location of the zip archive.
        zip_path: Local file the archive is written to.
        extracted_path: Directory the archive is extracted into (str or Path).
        data_file_path: Final location of the data file (str or Path).

    Returns:
        None. Progress is reported with ``print``.
    """
    import shutil

    # Accept plain strings as well as Path objects for both locations.
    extracted_path = Path(extracted_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file: stream it to disk instead of holding the whole
    # response in memory.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

    # Unzipping the file.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension.
    original_file_path = extracted_path / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

In [None]:
# Fetch from the primary URL first; on a network-level failure retry once
# against the backup URL. HTTPError is a subclass of URLError, so catching
# URLError covers both.
try:
    download_and_unzip_spam_data(
        url=url,
        zip_path=zip_path,
        extracted_path=extracted_path,
        data_file_path=data_file_path,
    )
except (urllib.error.URLError, TimeoutError) as err:
    print(f"Primary URL failed: {err}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(
        url=url,
        zip_path=zip_path,
        extracted_path=extracted_path,
        data_file_path=data_file_path,
    )

In [None]:
# Read the tab-separated data file. header=None tells pandas the first line
# is data, so the two column names are supplied here.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)

# Keep the frame as the cell's last expression so the notebook displays it.
df

In [None]:
# Count how many rows carry each label.
print(df["Label"].value_counts())

In [None]:
# Share of each label, as a percentage of all rows.
print(df["Label"].value_counts() / len(df) * 100)

In [None]:
def create_balanced_dataset(df):
    """
    Listing 6.2 Creating a balanced dataset.

    Return a frame with the same number of "ham" and "spam" rows by
    downsampling the larger class to the size of the smaller one. Sampling
    uses a fixed seed (123), so the result is reproducible.

    Args:
        df: DataFrame with a "Label" column holding "ham" / "spam".

    Returns:
        A new DataFrame: the sampled "ham" rows followed by the "spam" rows.
    """

    ham_rows = df[df["Label"] == "ham"]
    spam_rows = df[df["Label"] == "spam"]

    # Balance to the smaller class, so the function also works when "ham"
    # has fewer rows than "spam".
    num_per_class = min(len(ham_rows), len(spam_rows))

    # Randomly sample "ham" instances down to the per-class size.
    ham_subset = ham_rows.sample(num_per_class, random_state=123)

    # When "spam" is the smaller class, keep all of its rows in their
    # original order; only downsample it when it is the larger class.
    if len(spam_rows) > num_per_class:
        spam_rows = spam_rows.sample(num_per_class, random_state=123)

    # Combine the "ham" subset with "spam".
    return pd.concat([ham_subset, spam_rows])

In [None]:
# Build the balanced frame and print its per-label row counts.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

In [None]:
# Encode the string labels as integers: "ham" -> 0, "spam" -> 1.
# NOTE: Series.map turns every value that is not a key of the dict into NaN,
# so re-running this cell (when the column already holds 0/1) would replace
# all labels with NaN.
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

In [None]:
def random_split(df, train_frac, validation_frac):
    """
    Listing 6.3 Splitting the dataset.

    Shuffle ``df`` with a fixed seed (123), renumber its rows, and cut it
    into three consecutive pieces: train, validation and test. The test
    piece is whatever remains after the first two.
    """

    # Shuffled copy with a fresh 0..n-1 index.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    n_rows = len(shuffled)

    # Row positions at which the frame is cut.
    n_train = int(n_rows * train_frac)
    n_validation = int(n_rows * validation_frac)
    cut = n_train + n_validation

    return (
        shuffled.iloc[:n_train],
        shuffled.iloc[n_train:cut],
        shuffled.iloc[cut:],
    )

In [None]:
# 70% train, 10% validation; the test split receives the remaining rows.
train_df, validation_df, test_df = random_split(
    balanced_df,
    train_frac=0.7,
    validation_frac=0.1,
)

In [None]:
# Write each split to its own CSV file, without the row index column.
train_df.to_csv(
    "/llm_app/notebooks/build_large_language_models_from_scratch/train.csv",
    index=False,
)
validation_df.to_csv(
    "/llm_app/notebooks/build_large_language_models_from_scratch/validation.csv",
    index=False,
)
test_df.to_csv(
    "/llm_app/notebooks/build_large_language_models_from_scratch/test.csv",
    index=False,
)

In [None]:
# Per-label row counts in the training split.
print(train_df["Label"].value_counts())

In [None]:
# Per-label row counts in the validation split.
print(validation_df["Label"].value_counts())

In [None]:
# Per-label row counts in the test split.
print(test_df["Label"].value_counts())

## Creating Data Loaders

In [None]:
# GPT-2 byte-pair-encoding tokenizer.
tokenizer = tiktoken.get_encoding("gpt2")

# Encode a sample sentence wrapped in <|endoftext|> markers; the marker has
# to be listed in allowed_special to be accepted as a special token.
print(
    tokenizer.encode(
        "<|endoftext|> Parrots are green because they descend from dinossaurs <|endoftext|>",
        allowed_special={"<|endoftext|>"},
    )
)