## 🚀 Topic for Training Speed & Memory
    - AMP (mixed precision) - autocast + GradScaler, TF32/bfloat16 toggles
    - `torch.compile`(inductor) - when/why/how, safe fallbacks
    - Gradient checkpointing - huge memory saving, small speed tax
    - Dataloader/batching tuning - keeping the GPU fed
    - Quick profiling - tokens/sec, step-time, and `torch.profiler`.
    - Memory checks - peak memory, OOM patterns, tips
    NOTE: Supports: CPU, GPU

0) Setup & Imports

In [5]:
import urllib.request
from functools import reduce

from sympy.geometry.entity import scale
from sympy.physics.units import micro
!pip -q install torch tqdm tokenizers
import urllib
import os, math, time, glob, urllib.request
from typing import List, Optional
import torch
from torch import nn
from torch.nn import functional as F
from torch.cuda.amp import autocast, GradScaler
from tqdm import trange
print('PyTorch: ', torch.__version__, '| CUDA available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
PyTorch:  2.9.0 | CUDA available: False
Device:  cpu


1) Model & Data - Minimal but Realistic
    - This is a small decode-only Transformer and two data options:
        - Synthetic dataset (fast to run, great for profiling throughput)
        - Optional BPEDataset hook**: If you have bpe/tokenizer.json and text file from other can enable it below

In [6]:
class CausalSelfAttention(nn.Module):
    def __init__(self, n_embed, n_head, block_size, dropout=0.1):
        super().__init__()
        assert n_embed % n_head == 0
        self.n_head = n_head
        self.qkv = nn.Linear(n_embed, 3 * n_embed, bias=False)
        self.proj = nn.Linear(n_embed, n_embed, bias=False)
        self.attn_drop = nn.Dropout(dropout)
        self.resid_drop = nn.Dropout(dropout)
        mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('mask', mask.view(1,1,block_size, block_size))
    def forward(self, x):
        B,T,C = x.shape
        qkv = self.qkv(x)
        q,k,v = qkv.chunk(3, dim=-1)
        nh = self.n_head
        q = q.view(B,T,nh,-1).transpose(1,2)
        k = k.view(B,T,nh,-1).transpose(1,2)
        v = v.view(B,T,nh,-1).transpose(1,2)
        att = (q @ k.transpose(-2, -1)) / math.sqrt(k.size(-1))
        att = att.masked_fill(self.mask[:,:,:T,:T]==0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v
        y = y.transpose(1,2).contiguous().view(B,T,-1)
        y = self.resid_drop(self.proj(y))
        return y

class Block(nn.Module):
    def __init__(self, n_embed, n_head, block_size, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embed)
        self.attn = CausalSelfAttention(n_embed, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embed)
        self.mlp = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed), nn.GELU(), nn.Linear(4 * n_embed, n_embed), nn.Dropout(dropout),
        )
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, n_embed=384, n_head=6, n_layer=6, block_size=256, dropout=0.1, grad_checkpointing = False):
        super().__init__()
        self.block_size= block_size
        self.grad_checkpointing = grad_checkpointing
        self.tok_emp = nn.Embedding(vocab_size, n_embed)
        self.pos_emp = nn.Embedding(block_size, n_embed)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([Block(n_embed, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.head = nn.Linear(n_embed, vocab_size, bias = False)
        self.apply(self.__init)
    def __init(self, m):
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None: nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        B,T = idx.shape
        pos = torch.arange(0, T, device=idx.device)
        x = self.tok_emp(idx)+self.pos_emp(pos)[None, :, :]
        x = self.drop(x)
        if self.grad_checkpointing and self.training:
            import torch.utils.checkpoint as ckpt
            for blk in self.blocks:
                x = ckpt.checkpoint(blk, x, use_reentrant=False)
        else:
            for blk in self.blocks:
                x = blk(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.view(B*T))
        return logits, loss

2) Data - Synthetic or BPE stream
    - If `bpe/tokenizer.json` exists but don't have local text files, we can auto-download Tiny Shakespeare when `AUTO_DOWNLOAD_TINY_SHAKESPEARE=True`.


In [8]:
USE_BPE = True
TOKENIZER_PATH = 'bpe/tokenizer.json'
DATA_DIR = None  # e.g., '/path/to/corpus'
AUTO_DOWNLOAD_TINY_SHAKESPEARE = True

VOCAB_SIZE = 8000
BLOCK_SIZE = 256
BATCH_SIZE = 128 if torch.cuda.is_available() else 32
TRAIN_TOKENS = 200_000
VAL_TOKENS   = 20_000

def make_stream(n_tokens:int, vocab_size:int):
    return torch.randint(0, vocab_size, (n_tokens,), dtype=torch.long)

train_stream = make_stream(TRAIN_TOKENS, VOCAB_SIZE)
val_stream   = make_stream(VAL_TOKENS,   VOCAB_SIZE)

def get_batch(stream: torch.Tensor, block_size:int, batch_size:int, device='cpu'):
    hi = len(stream) - block_size - 1
    idx = torch.randint(0, hi, (batch_size,))
    x = torch.stack([stream[i:i+block_size] for i in idx])
    y = torch.stack([stream[i+1:i+1+block_size] for i in idx])
    return x.to(device), y.to(device)

if USE_BPE and os.path.exists(TOKENIZER_PATH):
    from tokenizers import Tokenizer
    tok = Tokenizer.from_file(TOKENIZER_PATH)
    eos_id = tok.token_to_id('<eos>')
    texts: List[str] = []
    if DATA_DIR and os.path.isdir(DATA_DIR):
        for ext in ['*.txt','*.md','*.py','*.js','*.ts','*.java','*.go','*.rs','*.c','*.cpp']:
            for p in glob.glob(os.path.join(DATA_DIR, '**', ext), recursive=True):
                try:
                    s = open(p, 'r', encoding='utf-8', errors='ignore').read()
                    if len(s) >= 100:
                        texts.append(s)
                except Exception:
                    pass
    if not texts and AUTO_DOWNLOAD_TINY_SHAKESPEARE:
        os.makedirs('data', exist_ok=True)
        url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
        urllib.request.urlretrieve(url, 'data/input.txt')
        texts.append(open('data/input.txt','r',encoding='utf-8').read())
        print('[BPE] Downloaded tiny Shakespeare as fallback')
    if texts:
        ids = []
        for t in texts:
            enc = tok.encode(t).ids
            if enc:
                ids.extend(enc)
                if eos_id is not None:
                    ids.append(eos_id)
        data = torch.tensor(ids, dtype=torch.long)
        n = int(0.9 * len(data))
        train_stream, val_stream = data[:n], data[n:]
        VOCAB_SIZE = tok.get_vocab_size()
        print('[BPE] Using BPE stream | vocab_size=', VOCAB_SIZE, '| lengths:', len(train_stream), len(val_stream))
    else:
        print('[BPE] No texts found and AUTO_DOWNLOAD_TINY_SHAKESPEARE=False → using synthetic stream')
else:
    print('BPE not enabled or tokenizer missing → using synthetic stream')

[BPE] Downloaded tiny Shakespeare as fallback
[BPE] Using BPE stream | vocab_size= 8000 | lengths: 248230 27582


3) Speed & Memory Toggles

In [12]:
use_amp  = (device.type == 'cuda' if torch.cuda.is_available() else 'cpu')
use_ckpt = True
use_compile = True if hasattr(torch, 'compile') else False

if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    try:
        torch.set_float32_matmul_precision('medium')
    except Exception:
        pass

model = MiniGPT(VOCAB_SIZE, n_embed=384, n_head=6, n_layer=6, block_size=BLOCK_SIZE, dropout=0.1, grad_checkpointing=use_ckpt).to(device)
if use_compile and device.type in ('cuda','cpu'):
    try:
        model = torch.compile(model, mode='max-autotune')
        print('torch.compile: enabled')
    except Exception as e:
        print('torch.compile: disabled →', repr(e))
        use_compile = False
else:
    print('torch.compile: not available on this backend')

torch.compile: enabled


4) Training Loop - Warmup + cosine, AMP, grad accumulation

In [14]:
LR = 3e-4
MAX_STEPS = 400
WARMUP_STEPS = 50
USE_COSINE = True
GRAD_ACCUM_STEPS = 2 if device.type == 'cuda' else 1
opt = torch.optim.AdamW(model.parameters(), lr=LR)
scaler = GradScaler(enabled=use_amp)

def lr_factor(step: int) -> float:
    if step < WARMUP_STEPS:
        return max(1e-8, (step+1)/max(1, WARMUP_STEPS))
    if not USE_COSINE:
        return 1.0
    progress = (step - WARMUP_STEPS)/max(1, MAX_STEPS - WARMUP_STEPS)
    min_factor = 0.1
    return min_factor + 0.5*(1-min_factor)*(1 + math.cos(math.pi*progress))

def eval_loss(n_iter=10):
    model.eval(); s=0.0
    for _ in range(n_iter):
        xb, yb = get_batch(val_stream, BLOCK_SIZE, BATCH_SIZE, device)
        with torch.no_grad():
            _, loss = model(xb, yb)
        s += float(loss.item())
    model.train(); return s/max(1,n_iter)

for _ in range(5):
    xb, yb = get_batch(train_stream, BLOCK_SIZE, BATCH_SIZE, device)
    with autocast(enabled=use_amp, dtype=torch.bfloat16 if use_amp else None):
        _, loss = model(xb, yb)
    loss.backward(); opt.zero_grad(set_to_none=True)

torch.cuda.synchronize() if torch.cuda.is_available() else None
t_start = time.perf_counter()
tokens_processed = 0

best = float('inf')
for step in trange(MAX_STEPS):
    fac = lr_factor(step)
    for pg in opt.param_groups: pg['lr'] = LR*fac
    opt.zero_grad(set_to_none=True)
    micro_bs = max(1, BATCH_SIZE // max(1, GRAD_ACCUM_STEPS))
    for _ in range(GRAD_ACCUM_STEPS):
        xb, yb = get_batch(train_stream, BLOCK_SIZE, micro_bs, device)
        with autocast(enabled=use_amp, dtype=torch.bfloat16 if use_amp else None):
            _, loss = model(xb, yb)
            loss = loss / max(1, GRAD_ACCUM_STEPS)
        scaler.scale(loss).backward()
        tokens_processed += xb.numel()
    scaler.unscale_(opt)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(opt); scaler.update()
    if (step+1) % 100 == 0:
        vl = eval_loss(5)
        print(f"val_loss: {vl:.4f}")

torch.cuda.synchronize() if torch.cuda.is_available() else None
elapsed = time.perf_counter() - t_start
tok_per_sec = tokens_processed / elapsed
print(f"Elapsed: {elapsed:.2f}s | Tokens processed: {tokens_processed:,} | Tokens/sec: {int(tok_per_sec):,}")

  scaler = GradScaler(enabled=use_amp)
  super().__init__(
  with autocast(enabled=use_amp, dtype=torch.bfloat16 if use_amp else None):


TypeError: layer_norm(): argument 'input' (position 1) must be Tensor, not NoneType