# Chapter 7

## Initial Setup

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to local .py helper modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Initial imports.
import sys

In [3]:
# Directory holding the local `ancillar` helper module used by this notebook.
# NOTE(review): this is a hard-coded absolute path; it presumably matches the
# author's container layout -- confirm or make configurable before sharing.
ancillar_path = "/llm_app/notebooks/build_large_language_models_from_scratch/"

# Register the directory only once so re-running this cell does not pile up
# duplicate entries on the import search path.
if sys.path.count(ancillar_path) == 0:
    sys.path.extend([ancillar_path])

# Must come after the search-path update above, otherwise the module cannot
# be located.
import ancillar as aux

In [None]:
from importlib.metadata import PackageNotFoundError, version

# Distributions whose installed versions are reported for reproducibility.
pkgs = [
    "numpy",  # PyTorch & TensorFlow dependency
    "matplotlib",  # Plotting library
    "tiktoken",  # Tokenizer
    "torch",  # Deep learning library
    "tqdm",  # Progress bar
    "tensorflow",  # For OpenAI's pretrained weights
]
for p in pkgs:
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        # A single missing distribution should not abort the whole report and
        # hide the versions of the packages listed after it.
        print(f"{p} version: not installed")

numpy version: 1.26.3
matplotlib version: 3.10.0
tiktoken version: 0.8.0
torch version: 2.5.1+cpu
tqdm version: 4.67.1
tensorflow version: 2.19.0


## Preparing a Dataset for Supervised Instruction Fine-tuning

In [None]:
# Remote location of the instruction dataset, assembled from the repository
# root and the chapter-relative path.
url = "".join(
    [
        "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch",
        "/main/ch07/01_main-chapter-code/instruction-data.json",
    ]
)
# Local path handed to the loader together with the URL.
file_path = "./instruction-data.json"

# NOTE(review): presumably the helper downloads to `file_path` when needed and
# returns the parsed JSON -- confirm in the ancillar module.
data = aux.download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [None]:
# Show one raw record (index 50); per the recorded output it has a non-empty
# 'input' field alongside 'instruction' and 'output'.
print("Example entry:\n", data[50])

Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [None]:
# Show a second raw record (index 999); per the recorded output its 'input'
# field is an empty string.
print("Another example entry:\n", data[999])

Another example entry:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [None]:
# Render the record that has a non-empty 'input' field as prompt + target.
entry_with_input = data[50]

# Prompt text produced by the helper; per the recorded output it contains the
# '### Instruction:' and '### Input:' sections.
model_input = aux.format_input(entry_with_input)
# Target text the model should produce, appended after the prompt.
desired_response = f"\n\n### Response:\n{entry_with_input['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [None]:
# Render the record whose 'input' field is empty as prompt + target.
entry_without_input = data[999]

# Prompt text produced by the helper; per the recorded output no '### Input:'
# section appears for this record.
model_input = aux.format_input(entry_without_input)
# Target text the model should produce, appended after the prompt.
desired_response = f"\n\n### Response:\n{entry_without_input['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


In [10]:
# Split sizes: 85% for training, 10% for testing, and whatever remains
# (about 5%) for validation so the three parts always cover every entry.
total_entries = len(data)
train_portion = int(total_entries * 0.85)
test_portion = int(total_entries * 0.1)
val_portion = total_entries - train_portion - test_portion

# Contiguous slices taken in the order train -> test -> validation; the data
# is not shuffled here.
test_end = train_portion + test_portion
train_data = data[:train_portion]
test_data = data[train_portion:test_end]
val_data = data[test_end:]

# Report the size of each split.
for label, subset in (
    ("Training set length:", train_data),
    ("Validation set length:", val_data),
    ("Test set length:", test_data),
):
    print(label, len(subset))

Training set length: 935
Validation set length: 55
Test set length: 110


## Organizing Data into Training Batches

## Exercise 7.1: Changing Prompt Styles