# Chapter 5

## Initial Setup

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%time

# Load packages.
import os
import sys
import torch
import tiktoken

import urllib.request

CPU times: user 1.93 s, sys: 388 ms, total: 2.32 s
Wall time: 1.76 s


In [3]:
# Add this directory to the import search path so that modules stored in it
# (presumably `ancillar`, imported below) can be found. The membership check
# keeps re-runs of this cell from appending the same entry twice.
ancillar_path = "/llm_app/notebooks/build_large_language_models_from_scratch/"
if ancillar_path not in sys.path:
    sys.path.append(ancillar_path)

# Local helper module; referenced throughout the notebook via the `aux` alias.
import ancillar as aux

In [4]:
# Hyperparameters handed to aux.GPTModel in the next section.
# NOTE(review): the name suggests the 124M-parameter GPT-2 setup -- confirm
# against aux.GPTModel, which is not shown in this notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size.
    "context_length": 256, # Shortened context length (orig: 1024).
    "emb_dim": 768,        # Embedding dimension.
    "n_heads": 12,         # Number of attention heads.
    "n_layers": 12,        # Number of layers.
    "drop_rate": 0.1,      # Dropout rate.
    "qkv_bias": False      # Query-key-value bias.
}

# Seed PyTorch's global random number generator for repeatable results.
# The trailing semicolon hides the returned generator object from the cell output.
torch.manual_seed(123);

## Evaluating Generative Text Models

### Using GPT to Generate Text

In [5]:
# Build the model from the configuration above. No weights are loaded in this
# cell, so the model presumably starts from its initial (untrained) parameters
# -- TODO confirm in aux.GPTModel.
model = aux.GPTModel(GPT_CONFIG_124M)

# Disable dropout during inference.
model.eval();

In [6]:
# Listing 5.1: Utility functions for text to token ID conversion.

def text_to_token_ids(text, tokenizer):
    """Encode `text` into a single-item batch of token IDs.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs (e.g. a tiktoken encoding).

    Returns:
        An int64 tensor of shape ``(1, num_tokens)``; the leading axis is a
        batch dimension of size 1.
    """
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

    # Pin the dtype: torch.tensor([]) would otherwise default to float32, so an
    # empty encoding would produce a tensor that cannot serve as token indices.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long)

    # Add batch dimension.
    return encoded_tensor.unsqueeze(0)


def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs, either 1-D ``(num_tokens,)`` or with a
            leading batch dimension of size 1, ``(1, num_tokens)``.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of IDs.
    """
    # Remove the batch dimension only when there is one. Squeezing a 1-D tensor
    # of length 1 would collapse it to a 0-d tensor, whose .tolist() is a bare
    # int rather than the list that decode() expects.
    flat = token_ids.squeeze(0) if token_ids.dim() > 1 else token_ids

    return tokenizer.decode(flat.tolist())

In [7]:
%%time

# Prompt that the model is asked to continue.
start_context = "Every effort moves you"
# tiktoken's "gpt2" encoding; this tokenizer object is reused by later cells.
tokenizer = tiktoken.get_encoding("gpt2")

# Request 10 new tokens after the prompt. How the next token is chosen is
# decided inside aux.generate_text_simple, which is not shown in this notebook.
token_ids = aux.generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren
CPU times: user 2.17 s, sys: 46.8 ms, total: 2.22 s
Wall time: 833 ms


### Calculating the Text Generation Loss

In [8]:
# Two toy examples given directly as token IDs. Each row of `targets` is the
# matching row of `inputs` shifted one position to the left, i.e. the token
# that should follow at every position.
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [9]:
# Disables gradient tracking since we are not training yet.
with torch.no_grad():
    logits = model(inputs)

# Softmax over the last axis converts the logits at every position into a
# probability distribution over the vocabulary.
probas = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary
print(probas.shape)                    # Shape: (batch_size, num_tokens, vocab_size)

torch.Size([2, 3, 50257])


In [10]:
# Most probable token ID at each position. keepdim=True keeps the reduced
# vocabulary axis as a trailing dimension of size 1.
# NOTE(review): this rebinds `token_ids`, which previously held the sequence
# generated from `start_context`.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)

print(f">>> `token_ids` shape:\n{token_ids.shape}\n")
print(f">>> Token IDs:\n{token_ids}")

>>> `token_ids` shape:
torch.Size([2, 3, 1])

>>> Token IDs:
tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [11]:
# Compare the target text of the first example with the text decoded from the
# model's most probable tokens; flatten() removes the trailing size-1 axis
# left by keepdim=True.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [12]:
# For each example, read out the probability the model assigned to the correct
# target token at positions 0, 1 and 2. The index list and the target IDs are
# paired element-wise, so the result holds one probability per position.
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4539e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6774e-05, 4.7559e-06])


In [13]:
# Concatenate the target-token probabilities of both examples into one 1-D
# tensor and take the natural logarithm of each entry.
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))

print(f">>> Logarithm of token probabilities:\n{log_probas}")

>>> Logarithm of token probabilities:
tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])


In [14]:
# Average the log probabilities over all target tokens (a single scalar for
# both examples together, not one value per token).
avg_log_probas = torch.mean(log_probas)

print(f">>> Average log probability:\n{avg_log_probas}")

>>> Average log probability:
-10.79397201538086


In [15]:
# Flip the sign so the score becomes a positive quantity to minimise; this is
# the same number that torch.nn.functional.cross_entropy reports further below.
neg_avg_log_probas = avg_log_probas * -1

print(f">>> Negative average log probability:\n{neg_avg_log_probas}")

>>> Negative average log probability:
10.79397201538086


In [16]:
# Shapes before flattening: logits carry a trailing vocabulary axis that the
# targets do not have.
print(">>> Logits shape:", logits.shape)
print(">>> Targets shape:", targets.shape)

>>> Logits shape: torch.Size([2, 3, 50257])
>>> Targets shape: torch.Size([2, 3])


In [17]:
# Merge the batch and token axes so that each row of logits lines up with
# exactly one target ID.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

print(">>> Flattened logits:", logits_flat.shape)
print(">>> Flattened targets:", targets_flat.shape)

>>> Flattened logits: torch.Size([6, 50257])
>>> Flattened targets: torch.Size([6])


In [18]:
# cross_entropy applies log-softmax to the logits and averages the negative
# log-likelihood of the targets, reproducing the value computed by hand above.
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)

print(loss)

tensor(10.7940)


In [19]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = torch.exp(loss)

print(perplexity)

tensor(48726.1953)


### Calculating the Training and Validation Set Losses

In [20]:
file_path = "/llm_app/notebooks/build_large_language_models_from_scratch/the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Prefer the cached copy on disk; only when it is missing, download the text
# once and store it at file_path so later runs can skip the network request.
if os.path.exists(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()
else:
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)

In [21]:
# Size of the corpus, measured in characters and in tokens produced by the
# tokenizer created earlier in the notebook.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


In [22]:
# Train / validation ratio.
train_ratio = 0.90
# The split point is a character offset into the raw text (not a token index).
split_idx = int(train_ratio * len(text_data))

# The leading part of the text is used for training, the remainder for validation.
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

print(">>> Length of training data:", len(train_data))
print(">>> Length of validation data:", len(val_data))

>>> Length of training data: 18431
>>> Length of validation data: 2048


In [23]:
# Re-seed the global RNG before the loaders are built, since the training
# loader is created with shuffle=True.
torch.manual_seed(123)

# Training loader. max_length and stride are both set to the model's context
# length, so consecutive windows presumably do not overlap -- TODO confirm in
# aux.create_dataloader_v1. drop_last=True presumably discards a final batch
# that has fewer than batch_size samples.
train_loader = aux.create_dataloader_v1(
    txt=train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# Validation loader: same windowing, but no shuffling and no dropped batch.
val_loader = aux.create_dataloader_v1(
    txt=val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [24]:
# Sanity check: print the shape of every (input, target) batch that each
# loader yields.
print(">>> Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\n>>> Validation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

>>> Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

>>> Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [25]:
def calc_loss_batch(
        input_batch, target_batch, model, device
    ):
    """Return the cross-entropy loss of `model` on one batch.

    Both tensors are moved to `device` before the forward pass. The first two
    axes of the logits and all axes of the targets are flattened, so every
    token position is scored as one classification example.
    """
    # Moving the data to `device` is what allows the computation to run on a GPU.
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)

    predictions = model(inputs_on_device)

    per_token_logits = predictions.flatten(0, 1)
    per_token_targets = targets_on_device.flatten()

    return torch.nn.functional.cross_entropy(per_token_logits, per_token_targets)

In [26]:
def calc_loss_loader(
        data_loader, model, device, num_batches=None
    ):
    """Average the per-batch loss over the first batches of a data loader.

    Args:
        data_loader: Sized iterable yielding (input_batch, target_batch) pairs.
        model: Model forwarded to calc_loss_batch.
        device: Device forwarded to calc_loss_batch.
        num_batches: Maximum number of batches to evaluate. None (default)
            evaluates every batch; values larger than len(data_loader) are
            reduced to len(data_loader).

    Returns:
        Mean of the evaluated batch losses as a Python float, or NaN when
        there is nothing to evaluate (empty loader or num_batches <= 0).
    """
    available = len(data_loader)

    # Iterate over all batches if no fixed num_batches is specified; otherwise
    # never ask for more batches than the data loader holds.
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # Nothing to average: report NaN instead of dividing by zero below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.

    # zip() stops once range() is exhausted, so exactly num_batches batches are
    # drawn from the loader and no extra batch is fetched just to be discarded.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()

    return total_loss / num_batches

In [27]:
%%time

# Use a CUDA GPU when one is available and fall back to the CPU otherwise, so
# the same code runs unchanged on either kind of machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(">>> Device:", device)
model.to(device)

# For reproducibility due to the shuffling in the data loader.
torch.manual_seed(123)

# Disable gradient tracking for efficiency because we are not training yet.
# For comparison, a uniform guess over the 50257-token vocabulary would score
# ln(50257), roughly 10.82.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print(">>> Training loss:", train_loss)
print(">>> Validation loss:", val_loss)

>>> Device: cpu


>>> Training loss: 10.987582206726074
>>> Validation loss: 10.98110580444336
CPU times: user 18.9 s, sys: 5.2 s, total: 24.1 s
Wall time: 6.09 s


## Training an LLM