# Scaling Laws

## Importing libraries

In [1]:
import os
import copy
from dataclasses import dataclass
import torch
from torch.utils.data import DataLoader
import wandb
from models.mlp.mlp import MLP, MLPConfig
from models.gpt.gpt import GPT, GPTConfig
from src.utils import load_text, set_seed, configure_device
from src.tokenizer import CharTokenizer, BPETokenizer
from src.train import split_text, TextDataset, setup_optimizer, setup_scheduler, train_steps

## Configuration

In [2]:
@dataclass
class CONFIG:
    root_dir: str = os.getcwd() + "/../"
    dataset_path: str = 'data/raw/shakespeare.txt'
    device: torch.device = torch.device('cpu')  # Automatic device configuration

    # wandb
    project: str = "LLM101-Scaling-Laws"

    # Tokenizer
    tokenizer: str = "char"  # char or bpe

    # Model
    model: str = "gpt"  # gpt or mlp
    if model == "mlp":
        context_size: int = 16
        d_embed: int = 256
        d_ff: int = 1024
    elif model == "gpt":
        context_size: int = 4
        n_layer: int = 2
        n_head: int = 2
        d_embed: int = 128
        d_ff: int = 512
        dropout: float = 0.2
        flash_attention: bool = False

    # Training
    val_size: float = 0.05
    max_steps: int = 1000
    val_interval: int = 500
    batch_size: int = 64
    optimizer: str = "AdamW"  # AdamW or SGD
    learning_rate: float = 0.001
    weight_decay: float = 0.01
    scheduler: str = "cosine"  # cosine or linear
    warmup_ratio: float = 0.1
    grad_clip: float = 1.0
    mixed_precision: bool = False
    seed: int = 101

## Weights & Biases

In [3]:
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m. Use [1m`wandb login --relogin`[0m to force relogin


Wandb run initialized: y42hgz1e


## Reproducibility

In [4]:
set_seed(CONFIG.seed)

Random seed set to 101


## Device

In [5]:
CONFIG.device = configure_device()

Running on mps


## Tokenizer

In [6]:
# Initialize tokenizer
if CONFIG.tokenizer == "char":
    tokenizer = CharTokenizer()
elif CONFIG.tokenizer == "bpe":
    tokenizer = BPETokenizer()
else:
    raise ValueError("Invalid tokenizer type. Choose 'char' or 'bpe'.")

## Text to build vocabulary
vocab_text = load_text(CONFIG.root_dir + CONFIG.dataset_path)

## Build vocabulary
tokenizer.build_vocab(vocab_text)

Loaded text data from /Users/pathfinder/Documents/GitHub/LLM101/notebooks/../data/raw/shakespeare.txt (length: 1115394 characters).


## Experiments

In [7]:
def run_experiment(exp_name, model_config=None, dataset_fraction=1.0):
    """
    Runs one experiment with given model configuration and a fraction of the training dataset.

    Parameters:
      - exp_name: A string name for the experiment (will be logged in wandb).
      - model_config: A dictionary to update model hyper-parameters (e.g. for GPT: n_layer, n_head, d_embed, d_ff).
                      If None, the global CONFIG values will be used.
      - dataset_fraction: A float in (0,1] specifying the fraction of the training text to use.
                          For example, 0.5 uses half of the full training text.

    This function:
      1. Makes a local copy of the global CONFIG.
      2. Updates the model hyper-parameters if provided.
      3. Initializes a new wandb run with the local config.
      4. Loads and (optionally) truncates the training text.
      5. Builds the datasets, model, optimizer, and scheduler.
      6. Trains the model using your provided `train_steps` function.
      7. Logs all metrics to wandb.
    """
    # Copy the global config to modify it locally
    local_config = copy.deepcopy(CONFIG)
    local_config.experiment_name = exp_name  # save experiment name in config

    # If we provided new model parameters, update them.
    if model_config is not None:
        if local_config.model == "mlp":
            local_config.d_embed = model_config.get("d_embed", local_config.d_embed)
            local_config.d_ff    = model_config.get("d_ff", local_config.d_ff)
        elif local_config.model == "gpt":
            local_config.n_layer = model_config.get("n_layer", local_config.n_layer)
            local_config.n_head  = model_config.get("n_head", local_config.n_head)
            local_config.d_embed = model_config.get("d_embed", local_config.d_embed)
            local_config.d_ff    = model_config.get("d_ff", local_config.d_ff)

    # Start a new wandb run for this experiment.
    run = wandb.init(
        project=local_config.project,
        name=exp_name,
        reinit=True,
        dir=CONFIG.root_dir
    )

    # Load the full text dataset and split into training/validation.
    text = load_text(local_config.root_dir + local_config.dataset_path)
    train_text, val_text = split_text(text, local_config.val_size)

    # For dataset scaling experiments, restrict the training data.
    train_text = train_text[:int(len(train_text) * dataset_fraction)]

    # Create the dataset and dataloaders.
    train_dataset = TextDataset(train_text, tokenizer, local_config.context_size)
    val_dataset   = TextDataset(val_text, tokenizer, local_config.context_size)
    train_loader  = DataLoader(train_dataset, batch_size=local_config.batch_size, shuffle=True)
    val_loader    = DataLoader(val_dataset, batch_size=local_config.batch_size, shuffle=False)

    # Initialize the model.
    if local_config.model == "mlp":
        model_instance = MLP(MLPConfig(
            vocab_size    = tokenizer.vocab_size,
            context_size  = local_config.context_size,
            d_embed       = local_config.d_embed,
            d_ff          = local_config.d_ff
        ))
    elif local_config.model == "gpt":
        model_instance = GPT(GPTConfig(
            vocab_size    = tokenizer.vocab_size,
            context_size  = local_config.context_size,
            n_layer       = local_config.n_layer,
            n_head        = local_config.n_head,
            d_embed       = local_config.d_embed,
            d_ff          = local_config.d_ff,
            dropout       = local_config.dropout
        ))
    else:
        raise ValueError("Invalid model type. Choose 'mlp' or 'gpt'.")

    model_instance.to(local_config.device)

    # Setup optimizer and learning rate scheduler.
    optimizer = setup_optimizer(model_instance, local_config.optimizer, local_config.learning_rate, local_config.weight_decay)
    scheduler = setup_scheduler(optimizer, local_config.scheduler, local_config.warmup_ratio, len(train_loader) * local_config.max_steps)

    # (Re)seed for reproducibility.
    set_seed(local_config.seed)

    # Train the model. (Your train_steps function should log metrics to wandb via the `run` handle.)
    train_steps(
        model          = model_instance,
        train_loader   = train_loader,
        val_loader     = val_loader,
        optimizer      = optimizer,
        scheduler      = scheduler,
        max_steps      = local_config.max_steps,
        val_interval   = local_config.val_interval,
        grad_clip      = local_config.grad_clip,
        device         = local_config.device,
        wandb_run      = run
    )

    # Finish the wandb run.
    run.finish()

### Parameters N

In [8]:
# Here we vary model size (e.g. for a GPT model) while using the full dataset.
model_scales = [
    {"n_layer": 2, "n_head": 2, "d_embed": 128, "d_ff": 512},
    {"n_layer": 4, "n_head": 4, "d_embed": 256, "d_ff": 1024},
    {"n_layer": 6, "n_head": 6, "d_embed": 512, "d_ff": 2048},
]

for idx, m_conf in enumerate(model_scales):
    exp_name = f"model_scale_{idx+1}_layers_{m_conf['n_layer']}"
    run_experiment(exp_name=exp_name, model_config=m_conf, dataset_fraction=1.0)

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Loaded text data from /Users/pathfinder/Documents/GitHub/LLM101/notebooks/../data/raw/shakespeare.txt (length: 1115394 characters).
Optimizer 'adamw' initialized with learning rate: 0.001
Scheduler 'cosine' initialized with warmup steps: 1655700
Random seed set to 101


Training:  50%|████▉     | 495/1000 [00:18<00:15, 32.23it/s, loss=4.4224]
Validation:   0%|          | 0/872 [00:00<?, ?it/s][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.4153][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.4188][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.4125][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3978][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.4118][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3848][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3939][A
Validation:   1%|          | 7/872 [00:00<00:14, 61.56it/s, loss=4.3939][A
Validation:   1%|          | 7/872 [00:00<00:14, 61.56it/s, loss=4.3910][A
Validation:   1%|          | 7/872 [00:00<00:14, 61.56it/s, loss=4.3981][A
Validation:   1%|          | 7/872 [00:00<00:14, 61.56it/s, loss=4.3906][A
Validation:   1%|          | 7/872 [00:00<00:14, 61.56it/s, loss=4.3885][A
Validation:   1

Validation Loss: 4.4062, Perplexity: 81.9559


Training: 100%|█████████▉| 996/1000 [00:42<00:00, 30.26it/s, loss=4.4031]
Validation:   0%|          | 0/872 [00:00<?, ?it/s][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3591][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3564][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3497][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3373][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3472][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3205][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3288][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3249][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3324][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3244][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3230][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3192][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, los

Validation Loss: 4.3427, Perplexity: 76.9160


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning Rate,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▆▆▆▇▇▇▇▇▇▇███
Perplexity,█▁
Train Loss,▃▅▄▇▅█▆▃▆▅▄▇▃▆▃▄▅█▃▆▃▂▇▄▇▆▅▄▅▃▆▅▄▂▂▂▆▃▁▄
Validation Loss,█▁

0,1
Learning Rate,0.0
Perplexity,76.91597
Train Loss,4.35915
Validation Loss,4.34271


Loaded text data from /Users/pathfinder/Documents/GitHub/LLM101/notebooks/../data/raw/shakespeare.txt (length: 1115394 characters).
Optimizer 'adamw' initialized with learning rate: 0.001
Scheduler 'cosine' initialized with warmup steps: 1655700
Random seed set to 101


Training:  50%|████▉     | 498/1000 [00:32<00:24, 20.64it/s, loss=4.3743]
Validation:   0%|          | 0/872 [00:00<?, ?it/s][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.4042][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3793][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3700][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3395][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3422][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3496][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3526][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3426][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.3422][A
Validation:   1%|          | 9/872 [00:00<00:10, 83.28it/s, loss=4.3422][A
Validation:   1%|          | 9/872 [00:00<00:10, 83.28it/s, loss=4.3451][A
Validation:   1%|          | 9/872 [00:00<00:10, 83.28it/s, loss=4.3424][A
Validation:   1%|          | 9/

Validation Loss: 4.3147, Perplexity: 74.7898


Training: 100%|█████████▉| 997/1000 [01:11<00:00, 17.87it/s, loss=4.2781]
Validation:   0%|          | 0/872 [00:00<?, ?it/s][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.1574][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.1234][A
Validation:   0%|          | 0/872 [00:00<?, ?it/s, loss=4.1060][A
Validation:   0%|          | 3/872 [00:00<00:30, 28.34it/s, loss=4.1060][A
Validation:   0%|          | 3/872 [00:00<00:30, 28.34it/s, loss=4.0835][A
Validation:   0%|          | 3/872 [00:00<00:30, 28.34it/s, loss=4.0679][A
Validation:   0%|          | 3/872 [00:00<00:30, 28.34it/s, loss=4.0773][A
Validation:   0%|          | 3/872 [00:00<00:30, 28.34it/s, loss=4.0764][A
Validation:   0%|          | 3/872 [00:00<00:30, 28.34it/s, loss=4.0640][A
Validation:   1%|          | 8/872 [00:00<00:21, 39.42it/s, loss=4.0640][A
Validation:   1%|          | 8/872 [00:00<00:21, 39.42it/s, loss=4.0640][A
Validation:   1%|          | 8/872 [00:00<00:21, 39.42it/s,

Validation Loss: 4.0512, Perplexity: 57.4647


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Learning Rate,▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
Perplexity,█▁
Train Loss,▇▇▇▇▇▇▇█████▇▇▇█▇▇▇▆▇▆▆▆▇▆▆▆▇▅▄▄▄▄▄▂▁▂▂▂
Validation Loss,█▁

0,1
Learning Rate,0.0
Perplexity,57.46467
Train Loss,4.02171
Validation Loss,4.05117


Loaded text data from /Users/pathfinder/Documents/GitHub/LLM101/notebooks/../data/raw/shakespeare.txt (length: 1115394 characters).


ValueError: d_embed must be divisible by n_head

### Dataset D

In [None]:
# Here we keep the model fixed (using the default CONFIG values) but reduce the amount of training data.
dataset_scales = [0.25, 0.5, 1.0]  # 25%, 50%, and 100% of the training text

for idx, ds_frac in enumerate(dataset_scales):
    exp_name = f"dataset_scale_{int(ds_frac*100)}pct"
    run_experiment(exp_name=exp_name, model_config=None, dataset_fraction=ds_frac)

### Compute C

In [None]:
# In these experiments we change both the model size and the dataset fraction.
# (For instance, one might try a smaller model with more data vs. a larger model with less data under a fixed compute budget.)
model_vs_dataset_experiments = [
    {"model_config": {"n_layer": 2, "n_head": 2, "d_embed": 128, "d_ff": 512}, "dataset_fraction": 1.0},
    {"model_config": {"n_layer": 4, "n_head": 4, "d_embed": 256, "d_ff": 1024}, "dataset_fraction": 0.5},
    {"model_config": {"n_layer": 6, "n_head": 6, "d_embed": 512, "d_ff": 2048}, "dataset_fraction": 0.25},
]

for idx, exp in enumerate(model_vs_dataset_experiments):
    exp_name = f"model_vs_dataset_{idx+1}"
    run_experiment(exp_name=exp_name, model_config=exp["model_config"], dataset_fraction=exp["dataset_fraction"])
