## 🚀 Topic for Training Speed & Memory
    - AMP (mixed precision) - autocast + GradScaler, TF32/bfloat16 toggles
    - `torch.compile`(inductor) - when/why/how, safe fallbacks
    - Gradient checkpointing - huge memory saving, small speed tax
    - Dataloader/batching tuning - keeping the GPU fed
    - Quick profiling - tokens/sec, step-time, and `torch.profiler`.
    - Memory checks - peak memory, OOM patterns, tips
    NOTE: Supports: CPU, GPU

0) Setup & Imports

In [43]:
# 0) Setup & global safety guards
!pip -q install torch tqdm tokenizers
import os
os.environ['TORCH_COMPILE_DISABLE'] = '1'  # ensure PT2 compile stays off
try:
    import torch
    torch._dynamo.reset()
except Exception:
    pass

import math, time, glob, urllib.request
from typing import List, Optional
import torch
from torch import nn
from torch.nn import functional as F
from torch.cuda.amp import autocast, GradScaler
from tqdm import trange
print('PyTorch:', torch.__version__, '| CUDA available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
PyTorch: 2.9.0 | CUDA available: False


In [44]:
USE_BPE = False                  # set True to use BPE stream when tokenizer JSON is present
TOKENIZER_PATH = 'bpe/tokenizer.json'
DATA_DIR = None                  # e.g., '/path/to/text_or_code_corpus'
AUTO_DOWNLOAD_TINY_SHAKESPEARE = True

USE_AMP = torch.cuda.is_available()
USE_CKPT = True                  # gradient checkpointing via checkpoint_sequential

BLOCK_SIZE = 256
VOCAB_SIZE_SYN = 8000            # synthetic stream vocab size
BATCH_SIZE = 128 if torch.cuda.is_available() else 32
GRAD_ACCUM_STEPS = 2 if torch.cuda.is_available() else 1

LR = 3e-4
MAX_STEPS = 400
WARMUP_STEPS = 50
USE_COSINE = True
CLIP_NORM = 1.0

1) Model & Data - Minimal but Realistic
    - This is a small decode-only Transformer and two data options:
        - Synthetic dataset (fast to run, great for profiling throughput)
        - Optional BPEDataset hook**: If you have bpe/tokenizer.json and text file from other can enable it below

In [45]:
class CausalSelfAttention(nn.Module):
    def __init__(self, n_embed, n_head, block_size, dropout=0.1):
        super().__init__()
        assert n_embed % n_head == 0
        self.n_head = n_head
        self.qkv = nn.Linear(n_embed, 3 * n_embed, bias=False)
        self.proj = nn.Linear(n_embed, n_embed, bias=False)
        self.attn_drop = nn.Dropout(dropout)
        self.resid_drop = nn.Dropout(dropout)
        mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('mask', mask.view(1,1,block_size,block_size))
    def forward(self, x):
        B,T,C = x.shape
        qkv = self.qkv(x)
        q,k,v = qkv.chunk(3, dim=-1)
        nh = self.n_head
        q = q.view(B,T,nh,-1).transpose(1,2)
        k = k.view(B,T,nh,-1).transpose(1,2)
        v = v.view(B,T,nh,-1).transpose(1,2)
        att = (q @ k.transpose(-2,-1)) / math.sqrt(k.size(-1))
        att = att.masked_fill(self.mask[:,:,:T,:T]==0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v
        y = y.transpose(1,2).contiguous().view(B,T,-1)
        y = self.resid_drop(self.proj(y))
        return y

class Block(nn.Module):
    def __init__(self, n_embed, n_head, block_size, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embed)
        self.attn = CausalSelfAttention(n_embed, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embed)
        self.mlp = nn.Sequential(
            nn.Linear(n_embed, 4*n_embed), nn.GELU(), nn.Linear(4*n_embed, n_embed), nn.Dropout(dropout)
        )
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, n_embed=384, n_head=6, n_layer=6, block_size=256,
                 dropout=0.1, grad_checkpointing=True):
        super().__init__()
        self.block_size = block_size
        self.grad_checkpointing = grad_checkpointing
        self.tok_emb = nn.Embedding(vocab_size, n_embed)
        self.pos_emb = nn.Embedding(block_size, n_embed)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([Block(n_embed, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.head = nn.Linear(n_embed, vocab_size, bias=False)
        self.apply(self._init)
    def _init(self, m):
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None: nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        B,T = idx.shape
        pos = torch.arange(0, T, device=idx.device)
        x = self.tok_emb(idx) + self.pos_emb(pos)[None,:,:]
        x = self.drop(x)
        if self.grad_checkpointing and self.training:
            import torch.utils.checkpoint as ckpt
            seq = nn.Sequential(*self.blocks)
            x = ckpt.checkpoint_sequential(seq, segments=len(self.blocks), input=x)
        else:
            for blk in self.blocks:
                x = blk(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.view(B*T))
        return logits, loss

2) Data - Synthetic or BPE stream
    - If `bpe/tokenizer.json` exists but don't have local text files, we can auto-download Tiny Shakespeare when `AUTO_DOWNLOAD_TINY_SHAKESPEARE=True`.


In [46]:
def make_stream(n_tokens:int, vocab_size:int):
    return torch.randint(0, vocab_size, (n_tokens,), dtype=torch.long)

def get_batch(stream: torch.Tensor, block_size:int, batch_size:int, device='cpu'):
    hi = len(stream) - block_size - 1
    if hi <= 0:
        raise RuntimeError(f"Stream too small for block_size {block_size}. len={len(stream)}")
    idx = torch.randint(0, hi, (batch_size,))
    x = torch.stack([stream[i:i+block_size] for i in idx])
    y = torch.stack([stream[i+1:i+1+block_size] for i in idx])
    return x.to(device), y.to(device)

def maybe_build_bpe_stream():
    if not USE_BPE or not os.path.exists(TOKENIZER_PATH):
        return None
    from tokenizers import Tokenizer
    tok = Tokenizer.from_file(TOKENIZER_PATH)
    eos_id = tok.token_to_id('<eos>')
    texts: List[str] = []
    if DATA_DIR and os.path.isdir(DATA_DIR):
        for ext in ['*.txt','*.md','*.py','*.js','*.ts','*.java','*.go','*.rs','*.c','*.cpp']:
            for p in glob.glob(os.path.join(DATA_DIR, '**', ext), recursive=True):
                try:
                    s = open(p, 'r', encoding='utf-8', errors='ignore').read()
                    if len(s) >= 100:
                        texts.append(s)
                except Exception:
                    pass
    if not texts and AUTO_DOWNLOAD_TINY_SHAKESPEARE:
        os.makedirs('data', exist_ok=True)
        url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
        urllib.request.urlretrieve(url, 'data/input.txt')
        texts.append(open('data/input.txt','r',encoding='utf-8').read())
        print('[BPE] Downloaded tiny Shakespeare as fallback')
    if not texts:
        print('[BPE] No texts found; falling back to synthetic stream.')
        return None
    ids = []
    for t in texts:
        enc = tok.encode(t).ids
        if enc:
            ids.extend(enc)
            if eos_id is not None:
                ids.append(eos_id)
    data = torch.tensor(ids, dtype=torch.long)
    n = int(0.9 * len(data))
    train_stream, val_stream = data[:n], data[n:]
    vocab_size = tok.get_vocab_size()
    print(f'[BPE] Using BPE stream | vocab_size={vocab_size} | lengths: {len(train_stream)}, {len(val_stream)}')
    return train_stream, val_stream, vocab_size

# Build streams
bpe = maybe_build_bpe_stream()
if bpe is None:
    VOCAB_SIZE = VOCAB_SIZE_SYN
    train_stream = make_stream(200_000, VOCAB_SIZE)
    val_stream   = make_stream(20_000,  VOCAB_SIZE)
else:
    train_stream, val_stream, VOCAB_SIZE = bpe

3) Speed & Memory Toggles

In [47]:
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    try:
        torch.set_float32_matmul_precision('medium')
    except Exception:
        pass

model = MiniGPT(
    vocab_size=VOCAB_SIZE,
    n_embed=384, n_head=6, n_layer=6,
    block_size=BLOCK_SIZE, dropout=0.1,
    grad_checkpointing=USE_CKPT
).to(device)

opt = torch.optim.AdamW(model.parameters(), lr=LR)
scaler = GradScaler(enabled=USE_AMP)

def lr_factor(step: int) -> float:
    if step < WARMUP_STEPS:
        return max(1e-8, (step+1)/max(1, WARMUP_STEPS))
    if not USE_COSINE:
        return 1.0
    progress = (step - WARMUP_STEPS)/max(1, MAX_STEPS - WARMUP_STEPS)
    min_factor = 0.1
    return min_factor + 0.5*(1-min_factor)*(1 + math.cos(math.pi*progress))

def eval_loss(n_iter=10) -> float:
    model.eval(); s=0.0
    with torch.no_grad():
        for _ in range(n_iter):
            xb, yb = get_batch(val_stream, BLOCK_SIZE, BATCH_SIZE, device)
            _, loss = model(xb, yb)
            s += float(loss.item())
    model.train(); return s/max(1, n_iter)

# Warmup few steps (stabilize kernels)
for _ in range(3):
    xb, yb = get_batch(train_stream, BLOCK_SIZE, BATCH_SIZE, device)
    dtype = torch.bfloat16 if (USE_AMP and torch.cuda.is_available()) else torch.float32
    with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
        _, loss = model(xb, yb)
    loss.backward(); opt.zero_grad(set_to_none=True)

torch.cuda.synchronize() if torch.cuda.is_available() else None
t_start = time.perf_counter()
tokens_processed = 0

for step in trange(MAX_STEPS):
    fac = lr_factor(step)
    for pg in opt.param_groups: pg['lr'] = LR * fac
    opt.zero_grad(set_to_none=True)
    micro_bs = max(1, BATCH_SIZE // max(1, GRAD_ACCUM_STEPS))
    for _ in range(GRAD_ACCUM_STEPS):
        xb, yb = get_batch(train_stream, BLOCK_SIZE, micro_bs, device)
        dtype = torch.bfloat16 if (USE_AMP and torch.cuda.is_available()) else torch.float32
        with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
            _, loss = model(xb, yb)
            loss = loss / max(1, GRAD_ACCUM_STEPS)
        scaler.scale(loss).backward()
        tokens_processed += xb.numel()
    scaler.unscale_(opt)
    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
    scaler.step(opt); scaler.update()
    if (step+1) % 100 == 0 or (step+1) == MAX_STEPS:
        vl = eval_loss(5)
        print(f"step {step+1}/{MAX_STEPS} | val_loss {vl:.4f}")

torch.cuda.synchronize() if torch.cuda.is_available() else None
elapsed = time.perf_counter() - t_start
tok_per_sec = int(tokens_processed / max(elapsed, 1e-6))
print(f"Elapsed: {elapsed:.2f}s | Tokens processed: {tokens_processed:,} | ~{tok_per_sec:,} tok/s")

  scaler = GradScaler(enabled=USE_AMP)
  with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
  with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
 25%|██▌       | 100/400 [03:30<13:46,  2.75s/it]

step 100/400 | val_loss 9.0689


 50%|█████     | 200/400 [07:04<09:19,  2.80s/it]

step 200/400 | val_loss 9.1168


 75%|███████▌  | 300/400 [10:40<04:43,  2.84s/it]

step 300/400 | val_loss 9.1727


100%|██████████| 400/400 [14:18<00:00,  2.15s/it]

step 400/400 | val_loss 9.1950
Elapsed: 858.85s | Tokens processed: 3,276,800 | ~3,815 tok/s





4) Training Loop - Warmup + cosine, AMP, grad accumulation

In [42]:
opt = torch.optim.AdamW(model.parameters(), lr=LR)
scaler = GradScaler(enabled=USE_AMP)

def lr_factor(step: int) -> float:
    if step < WARMUP_STEPS:
        return max(1e-8, (step+1)/max(1, WARMUP_STEPS))
    if not USE_COSINE:
        return 1.0
    progress = (step - WARMUP_STEPS)/max(1, MAX_STEPS - WARMUP_STEPS)
    min_factor = 0.1
    return min_factor + 0.5*(1-min_factor)*(1 + math.cos(math.pi*progress))

def eval_loss(n_iter=10) -> float:
    model.eval(); s=0.0
    with torch.no_grad():
        for _ in range(n_iter):
            xb, yb = get_batch(val_stream, BLOCK_SIZE, BATCH_SIZE, device)
            _, loss = model(xb, yb)
            s += float(loss.item())
    model.train(); return s/max(1, n_iter)

# Warmup few steps
for _ in range(3):
    xb, yb = get_batch(train_stream, BLOCK_SIZE, BATCH_SIZE, device)
    dtype = torch.bfloat16 if (USE_AMP and torch.cuda.is_available()) else torch.float32
    with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
        _, loss = model(xb, yb)
    loss.backward(); opt.zero_grad(set_to_none=True)

torch.cuda.synchronize() if torch.cuda.is_available() else None
t_start = time.perf_counter()
tokens_processed = 0

for step in trange(MAX_STEPS):
    fac = lr_factor(step)
    for pg in opt.param_groups: pg['lr'] = LR * fac
    opt.zero_grad(set_to_none=True)
    micro_bs = max(1, BATCH_SIZE // max(1, GRAD_ACCUM_STEPS))
    for _ in range(GRAD_ACCUM_STEPS):
        xb, yb = get_batch(train_stream, BLOCK_SIZE, micro_bs, device)
        dtype = torch.bfloat16 if (USE_AMP and torch.cuda.is_available()) else torch.float32
        with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
            _, loss = model(xb, yb)
            loss = loss / max(1, GRAD_ACCUM_STEPS)
        scaler.scale(loss).backward()
        tokens_processed += xb.numel()
    scaler.unscale_(opt)
    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
    scaler.step(opt); scaler.update()
    if (step+1) % 100 == 0 or (step+1) == MAX_STEPS:
        vl = eval_loss(5)
        print(f"step {step+1}/{MAX_STEPS} | val_loss {vl:.4f}")

torch.cuda.synchronize() if torch.cuda.is_available() else None
elapsed = time.perf_counter() - t_start
tok_per_sec = int(tokens_processed / max(elapsed, 1e-6))
print(f"Elapsed: {elapsed:.2f}s | Tokens processed: {tokens_processed:,} | ~{tok_per_sec:,} tok/s")

  scaler = GradScaler(enabled=USE_AMP)
  with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
  return _C._get_float32_matmul_precision()
E1018 23:41:46.166000 1990 .venv/lib/python3.13/site-packages/torch/_inductor/select_algorithm.py:2820] [3/0] Exception C++ compile error
E1018 23:41:46.166000 1990 .venv/lib/python3.13/site-packages/torch/_inductor/select_algorithm.py:2820] [3/0] 
E1018 23:41:46.166000 1990 .venv/lib/python3.13/site-packages/torch/_inductor/select_algorithm.py:2820] [3/0] Command:
E1018 23:41:46.166000 1990 .venv/lib/python3.13/site-packages/torch/_inductor/select_algorithm.py:2820] [3/0] clang++ /var/folders/lf/6_tqrqqs0w5_3q_cz1dtnmtr0000gn/T/torchinductor_ankushraj/iu/ciuqmknzavcklpqyrj5lmnhbkvb5utmnjske4rvv6cvjxsz6kyww.main.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_NEON -D AT_BUILD_ARM_VEC256_WITH_SLEEF -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-

InductorError: CppCompileError: C++ compile error

Command:
clang++ /var/folders/lf/6_tqrqqs0w5_3q_cz1dtnmtr0000gn/T/torchinductor_ankushraj/q5/cq5plabjg5pvzrcrbtktyiynoyjujrplw7pbtmfiecohqrunp5zx.main.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_NEON -D AT_BUILD_ARM_VEC256_WITH_SLEEF -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -shared -fPIC -undefined dynamic_lookup -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -Werror=ignored-optimization-argument -Xclang -fopenmp -include /var/folders/lf/6_tqrqqs0w5_3q_cz1dtnmtr0000gn/T/torchinductor_ankushraj/precompiled_headers/c7mv6jl773tdyux4rvslwqz7eud43eht3ks5rt5kvi2w5u5s4rr4.h -I/opt/homebrew/opt/python@3.13/Frameworks/Python.framework/Versions/3.13/include/python3.13 -I/Users/ankushraj/Desktop/Rising Sun Labs Resource/R-S-L-Repositories/SunForgeLLM/SunForgeLLM/.venv/lib/python3.13/site-packages/torch/include -I/Users/ankushraj/Desktop/Rising Sun Labs Resource/R-S-L-Repositories/SunForgeLLM/SunForgeLLM/.venv/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -o /var/folders/lf/6_tqrqqs0w5_3q_cz1dtnmtr0000gn/T/torchinductor_ankushraj/q5/cq5plabjg5pvzrcrbtktyiynoyjujrplw7pbtmfiecohqrunp5zx.main.so -lomp -lc10 -L/opt/homebrew/opt/python@3.13/Frameworks/Python.framework/Versions/3.13/lib -L/Users/ankushraj/Desktop/Rising Sun Labs Resource/R-S-L-Repositories/SunForgeLLM/SunForgeLLM/.venv/lib/python3.13/site-packages/torch/lib

Output:
clang++: error: no such file or directory: 'Sun'
clang++: error: no such file or directory: 'Labs'
clang++: error: no such file or directory: 'Resource/R-S-L-Repositories/SunForgeLLM/SunForgeLLM/.venv/lib/python3.13/site-packages/torch/lib'


Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


5) Optional: quick timing, profiler, memory

In [48]:
torch.cuda.synchronize() if torch.cuda.is_available() else None
t0 = time.perf_counter()
xb, yb = get_batch(train_stream, BLOCK_SIZE, BATCH_SIZE, device)
dtype = torch.bfloat16 if (USE_AMP and torch.cuda.is_available()) else torch.float32
with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
    _, loss = model(xb, yb)
loss.backward(); opt.zero_grad(set_to_none=True)
torch.cuda.synchronize() if torch.cuda.is_available() else None
print('One-step seconds:', time.perf_counter() - t0)

  with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):


One-step seconds: 2.2943923339998946


In [49]:
try:
    from torch.profiler import profile, ProfilerActivity
    acts = [ProfilerActivity.CPU]
    if torch.cuda.is_available(): acts.append(ProfilerActivity.CUDA)
    with profile(activities=acts, record_shapes=True, profile_memory=True) as prof:
        for _ in range(3):
            xb, yb = get_batch(train_stream, BLOCK_SIZE, BATCH_SIZE, device)
            dtype = torch.bfloat16 if (USE_AMP and torch.cuda.is_available()) else torch.float32
            with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
                _, loss = model(xb, yb)
            loss.backward(); opt.zero_grad(set_to_none=True)
    print(prof.key_averages().table(sort_by='self_cuda_time_total' if torch.cuda.is_available() else 'cpu_time_total', row_limit=20))
except Exception as e:
    print('Profiler not available or failed:', repr(e))

[W1019 00:15:36.508983000 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
  with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
autograd::engine::evaluate_function: CheckpointFunct...         0.17%      11.534ms        49.80%        3.283s     218.852ms     -78.55 MB    -360.00 MB            15  
                             CheckpointFunctionBackward         0.62%      40.800ms        49.62%        3.271s     218.083ms     281.45 MB      -3.69 GB            15  
                                          aten::dropout         0.12%       7.706ms        28.14%        1.855s      18.188ms       3.66 GB      -1.05

In [50]:
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    xb, yb = get_batch(train_stream, BLOCK_SIZE, BATCH_SIZE, device)
    dtype = torch.bfloat16 if USE_AMP else torch.float32
    with autocast(enabled=USE_AMP, dtype=dtype if USE_AMP else None):
        _, loss = model(xb, yb)
    loss.backward(); opt.zero_grad(set_to_none=True)
    torch.cuda.synchronize()
    peak_mb = torch.cuda.max_memory_allocated() / (1024**2)
    print(f'Peak CUDA memory this cell: {peak_mb:.1f} MB')