In [1]:
import train

Data Loaded
Tokens per iteration will be: 40,960
Initializing a new model from scratch
Number of parameters: 19.18M
Attention Selected
Training Started


Evaluating train: 100%|██████████| 200/200 [00:17<00:00, 11.17it/s]
Evaluating val: 100%|██████████| 200/200 [00:18<00:00, 11.00it/s]


step 0: train loss 2.9258, val loss 2.9747


Training step 0: 100%|██████████| 40/40 [00:10<00:00,  3.90it/s]


iter 0: loss 3.8065, time 46633.08ms, mfu -100.00%


Training step 1: 100%|██████████| 40/40 [00:10<00:00,  3.78it/s]


iter 1: loss 4.0380, time 10759.00ms, mfu -100.00%


Training step 2: 100%|██████████| 40/40 [00:10<00:00,  3.77it/s]


iter 2: loss 2.5838, time 10878.55ms, mfu -100.00%


Training step 3: 100%|██████████| 40/40 [00:10<00:00,  3.69it/s]


iter 3: loss 2.4303, time 11147.20ms, mfu -100.00%


Training step 4: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 4: loss 2.4450, time 11823.15ms, mfu -100.00%


Training step 5: 100%|██████████| 40/40 [00:10<00:00,  3.64it/s]


iter 5: loss 1.9745, time 11325.09ms, mfu 0.16%


In [4]:
# Imports
import os
import math
import time
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from contextlib import nullcontext
import torch.distributed as dist
from tqdm import tqdm
import wandb

from tokenise import TextDataset, pad_sequences
from attention import SparseAttention
from customGPT import Block, GPT

# Constants
# Module-level configuration read by the functions defined below in this cell.

# -- I/O, evaluation and logging --
OUT_DIR = 'out'                 # directory main() writes 'ckpt.pt' into / resumes from
EVAL_INTERVAL = 2000            # run estimate_loss every this many iterations
LOG_INTERVAL = 1                # print the training loss every this many iterations
EVAL_ITERS = 200                # batches averaged per split inside estimate_loss
EVAL_ONLY = False               # when True, main() stops right after the evaluation at iteration 0
ALWAYS_SAVE_CHECKPOINT = True   # save at every evaluation, even if the val loss did not improve
INIT_FROM = 'scratch'           # 'scratch', 'resume', or a name starting with 'gpt2'

# -- Data and batching --
DATA_PATH = 'wiki_medical_terms'   # NOTE(review): main() passes a hard-coded hf:// URL to load_data instead of this
GRAD_ACCUMULATION_STEPS = 5 * 8    # micro-batches accumulated per optimizer step (divided by world size under DDP)
BATCH_SIZE = 2                     # DataLoader batch size (sequences per micro-batch)
BLOCK_SIZE = 512                   # seq_length given to TextDataset and block_size given to the model

# -- Model --
LOCAL_ATTN_CTX = 32     # forwarded to GPTConfig.local_attn_ctx; presumably the local attention window — confirm in attention.py
ATTN_MODE = "local"     # forwarded to GPTConfig.attn_mode; accepted values are defined by SparseAttention
N_LAYER = 6
N_HEAD = 8
N_EMBD = 512
DROPOUT = 0.0
BIAS = False

# -- Optimizer and learning-rate schedule --
LEARNING_RATE = 6e-4    # peak learning rate handed to get_lr / the optimizer
MAX_ITERS = 5           # main() loops while iter_num <= MAX_ITERS, i.e. MAX_ITERS + 1 iterations from a fresh start
WEIGHT_DECAY = 1e-1
BETA1 = 0.9
BETA2 = 0.95
GRAD_CLIP = 1.0         # gradient-norm clipping threshold; 0.0 disables clipping
DECAY_LR = True         # when False, LEARNING_RATE is used unchanged on every iteration
WARMUP_ITERS = 200      # linear warm-up length used by get_lr
LR_DECAY_ITERS = 1000   # iteration at which the cosine decay in get_lr reaches MIN_LR
MIN_LR = 6e-5           # learning-rate floor after decay

# -- Distributed / experiment tracking --
BACKEND = 'gloo'                       # backend passed to dist.init_process_group
PROJECT_NAME = "Attention Benchmark"   # not referenced by the code in this cell; presumably a wandb project name — confirm
RUN_NAME = 'Sparse Attention-mps'      # not referenced by the code in this cell; presumably a wandb run name — confirm

# System configuration
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'   # Apple MPS when available, otherwise CPU
DTYPE = 'float32'   # GradScaler in main() is only enabled when this is 'float16' on a CUDA device
COMPILE = False     # not referenced by the code in this cell

# Functions and Classes
def load_data(data_path):
    """Create the shuffled DataLoader used for both training and evaluation.

    Args:
        data_path: Location of a parquet file readable by ``pd.read_parquet``.

    Returns:
        A ``DataLoader`` over a ``TextDataset`` built from the column at
        position 1 of the parquet table, tokenised with the "gpt2" model
        setting, using ``BLOCK_SIZE`` as sequence length, ``BATCH_SIZE`` as
        batch size and ``pad_sequences`` as collate function.
    """
    frame = pd.read_parquet(data_path)
    # The article text is taken positionally: second column of the table.
    texts = frame.iloc[:, 1]
    return DataLoader(
        TextDataset(texts, model="gpt2", seq_length=BLOCK_SIZE),
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=pad_sequences,
    )

def init_ddp():
    """Detect a distributed (DDP) launch and derive per-process settings.

    A DDP run is recognised by the ``RANK`` environment variable. In that case
    the process group is initialised with ``BACKEND`` and the global gradient
    accumulation count is split evenly across ranks.

    Returns:
        Tuple ``(ddp, master_process, device, seed_offset, ddp_world_size,
        grad_accumulation_steps)`` where ``device`` is a device string such as
        ``'mps'``, ``'cuda:0'`` or ``'cpu'``.

    Raises:
        AssertionError: if ``GRAD_ACCUMULATION_STEPS`` is not divisible by the
            world size in a DDP run.
    """
    ddp = int(os.environ.get('RANK', -1)) != -1
    if ddp:
        dist.init_process_group(backend=BACKEND)
        ddp_rank = int(os.environ['RANK'])
        ddp_local_rank = int(os.environ['LOCAL_RANK'])
        ddp_world_size = int(os.environ['WORLD_SIZE'])
        if torch.backends.mps.is_available():
            # No per-rank index and no set_device call here: torch.mps has no
            # set_device, and MPS is addressed as a single device.
            device = 'mps'
        elif torch.cuda.is_available():
            device = f'cuda:{ddp_local_rank}'
            torch.cuda.set_device(device)
        else:
            device = 'cpu'
        master_process = ddp_rank == 0  # only rank 0 evaluates, logs and checkpoints
        seed_offset = ddp_rank          # gives every rank a different RNG seed
        assert GRAD_ACCUMULATION_STEPS % ddp_world_size == 0
        grad_accumulation_steps = GRAD_ACCUMULATION_STEPS // ddp_world_size
    else:
        # Single-process run: previously `device` was left unassigned on this
        # path, which made the return statement raise UnboundLocalError.
        device = DEVICE
        master_process = True
        seed_offset = 0
        ddp_world_size = 1
        grad_accumulation_steps = GRAD_ACCUMULATION_STEPS
    return ddp, master_process, device, seed_offset, ddp_world_size, grad_accumulation_steps

def configure_optimizers(model, weight_decay, learning_rate, betas, device_type):
    """Ask the model to build its own optimizer.

    All arguments after ``model`` are forwarded positionally, in the same
    order, to ``model.configure_optimizers``; its return value is passed back
    unchanged.
    """
    optimizer = model.configure_optimizers(
        weight_decay,
        learning_rate,
        betas,
        device_type,
    )
    return optimizer

def estimate_loss(model, dataloader, eval_iters, device, ctx):
    """Average the model loss over ``eval_iters`` batches for each split.

    Args:
        model: Callable as ``model(X, Y)`` returning ``(logits, loss)``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'targets'`` tensors.
        eval_iters: Number of batches averaged per split.
        device: Device the batch tensors are moved to.
        ctx: Context manager wrapped around every forward pass.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to the mean loss as a
        0-dim tensor.

    Note:
        Both splits draw from the same ``dataloader``; the 'val' figure is
        therefore not measured on held-out data.
    """
    out = {}
    batch_iter = iter(dataloader)

    def next_batch():
        # Reuse one iterator and only rebuild it when an epoch is exhausted,
        # instead of constructing a fresh iterator for every single batch.
        nonlocal batch_iter
        try:
            return next(batch_iter)
        except StopIteration:
            batch_iter = iter(dataloader)
            return next(batch_iter)

    model.eval()
    try:
        # Evaluation needs no gradients; disabling autograd avoids building
        # the graph for every forward pass.
        with torch.no_grad():
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters)
                with tqdm(total=eval_iters, desc=f"Evaluating {split}") as pbar:
                    for k in range(eval_iters):
                        batch = next_batch()
                        X, Y = batch['input_ids'].to(device), batch['targets'].to(device)
                        with ctx:
                            logits, loss = model(X, Y)
                        losses[k] = loss.item()
                        pbar.update(1)
                out[split] = losses.mean()
    finally:
        # Put the model back into training mode even if evaluation failed.
        model.train()
    return out

def get_lr(it, learning_rate, warmup_iters, lr_decay_iters, min_lr):
    """Learning rate for iteration ``it``: linear warm-up, then cosine decay.

    Args:
        it: Current iteration number.
        learning_rate: Peak learning rate reached at the end of warm-up.
        warmup_iters: Number of linear warm-up iterations.
        lr_decay_iters: Iteration from which ``min_lr`` is returned.
        min_lr: Floor of the schedule.

    Returns:
        The learning rate to use at iteration ``it``.

    Note:
        With ``warmup_iters > 0`` the rate at ``it == 0`` is exactly 0, so the
        very first optimizer step does not move the weights.
    """
    # 1) Linear warm-up from 0 towards learning_rate.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) At or beyond the decay horizon: floor value. Using >= (not >) gives
    #    the same result at it == lr_decay_iters, where the cosine term is 0,
    #    and avoids a division by zero when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) Cosine decay from learning_rate down to min_lr in between.
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_lr + coeff * (learning_rate - min_lr)

class GPTConfig:
    """Plain attribute container holding the hyper-parameters given to GPT.

    Every constructor argument is stored unchanged under an attribute of the
    same name; no validation is performed.
    """

    def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd, dropout, local_attn_ctx, attn_mode, bias=True):
        # Vocabulary and context length.
        self.vocab_size, self.block_size = vocab_size, block_size
        # Transformer dimensions.
        self.n_layer, self.n_head, self.n_embd = n_layer, n_head, n_embd
        # Regularisation and bias switch.
        self.dropout, self.bias = dropout, bias
        # Attention-pattern settings handed through to the attention module.
        self.local_attn_ctx, self.attn_mode = local_attn_ctx, attn_mode

def main():
    """Train the sparse-attention GPT model with gradient accumulation.

    Steps:
      1. Load the dataset and detect a DDP launch.
      2. Build the model (from scratch, from ``OUT_DIR/ckpt.pt``, or from
         GPT-2 weights) according to ``INIT_FROM``.
      3. Loop while ``iter_num <= MAX_ITERS``: set the learning rate,
         periodically evaluate and checkpoint on the master process,
         accumulate gradients over the micro-batches, clip, and step.

    Raises:
        ValueError: if ``INIT_FROM`` is not 'scratch', 'resume' or 'gpt2*'.
    """
    # Load data
    dataloader = load_data("hf://datasets/gamino/wiki_medical_terms/wiki_medical_terms.parquet")
    print('Data Loaded')

    # Initialize DDP
    ddp, master_process, device, seed_offset, ddp_world_size, gradient_accumulation_steps = init_ddp()

    # Set seeds and device properties
    torch.manual_seed(1337 + seed_offset)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    ctx = nullcontext()

    # Training state; the 'resume' branch below overrides it from the checkpoint.
    iter_num = 0
    best_val_loss = 1e9
    checkpoint = None

    # Initialize model
    model_args = dict(n_layer=N_LAYER, n_head=N_HEAD, n_embd=N_EMBD, block_size=BLOCK_SIZE,
                      bias=BIAS, vocab_size=50000, dropout=DROPOUT,
                      local_attn_ctx=LOCAL_ATTN_CTX, attn_mode=ATTN_MODE)
    if INIT_FROM == 'scratch':
        print("Initializing a new model from scratch")
        gptconf = GPTConfig(**model_args)
        model = GPT(SparseAttention, gptconf)
    elif INIT_FROM == 'resume':
        print(f"Resuming training from {OUT_DIR}")
        ckpt_path = os.path.join(OUT_DIR, 'ckpt.pt')
        checkpoint = torch.load(ckpt_path, map_location=device)
        checkpoint_model_args = checkpoint['model_args']
        # Architecture-defining values must match the checkpoint being loaded.
        for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
            model_args[k] = checkpoint_model_args[k]
        gptconf = GPTConfig(**model_args)
        # Same constructor call as the 'scratch' branch (attention class first).
        model = GPT(SparseAttention, gptconf)
        model.load_state_dict(checkpoint['model'])
        iter_num = checkpoint['iter_num']
        best_val_loss = checkpoint['best_val_loss']
    elif INIT_FROM.startswith('gpt2'):
        print(f"Initializing from OpenAI GPT-2 weights: {INIT_FROM}")
        model = GPT.from_pretrained(INIT_FROM, model_args)
        if BLOCK_SIZE < model.config.block_size:
            model.crop_block_size(BLOCK_SIZE)
    else:
        raise ValueError(f"Unknown INIT_FROM value: {INIT_FROM!r}")

    model.to(device)

    # Initialize optimizer and GradScaler
    optimizer = configure_optimizers(model, WEIGHT_DECAY, LEARNING_RATE, (BETA1, BETA2), device)
    if checkpoint is not None:
        # Continue with the saved optimizer state when resuming.
        optimizer.load_state_dict(checkpoint['optimizer'])
    checkpoint = None  # release the loaded checkpoint
    scaler = GradScaler(enabled=(DTYPE == 'float16' and 'cuda' in device))

    # Wrap model in DDP if necessary; keep a handle on the unwrapped module so
    # checkpoints do not carry the DDP wrapper's key prefix.
    raw_model = model
    if ddp:
        # device_ids is only meaningful for CUDA modules.
        ddp_device_ids = [int(os.environ['LOCAL_RANK'])] if 'cuda' in device else None
        model = DDP(model, device_ids=ddp_device_ids)

    # One long-lived batch iterator, rebuilt only when an epoch is exhausted,
    # instead of constructing a new DataLoader iterator for every micro-batch.
    batch_iter = iter(dataloader)

    def next_batch():
        nonlocal batch_iter
        try:
            return next(batch_iter)
        except StopIteration:
            batch_iter = iter(dataloader)
            return next(batch_iter)

    # Training loop
    t0 = time.time()

    while iter_num <= MAX_ITERS:
        lr = get_lr(iter_num, LEARNING_RATE, WARMUP_ITERS, LR_DECAY_ITERS, MIN_LR) if DECAY_LR else LEARNING_RATE
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        if iter_num % EVAL_INTERVAL == 0 and master_process:
            losses = estimate_loss(model, dataloader, EVAL_ITERS, device, ctx)
            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            if losses['val'] < best_val_loss or ALWAYS_SAVE_CHECKPOINT:
                best_val_loss = losses['val']
                if iter_num > 0:
                    checkpoint_out = {
                        'model': raw_model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'model_args': model_args,
                        'iter_num': iter_num,
                        'best_val_loss': best_val_loss
                    }
                    print(f"saving checkpoint to {OUT_DIR}")
                    # The output directory may not exist on a first run.
                    os.makedirs(OUT_DIR, exist_ok=True)
                    torch.save(checkpoint_out, os.path.join(OUT_DIR, 'ckpt.pt'))

        if iter_num == 0 and EVAL_ONLY:
            break

        with tqdm(total=gradient_accumulation_steps, desc=f"Training step {iter_num}") as pbar:
            for micro_step in range(gradient_accumulation_steps):
                batch = next_batch()
                X, Y = batch['input_ids'].to(device), batch['targets'].to(device)
                with ctx:
                    logits, loss = model(X, Y)
                    # Scale so the accumulated gradient is the mean over micro-batches.
                    loss = loss / gradient_accumulation_steps
                scaler.scale(loss).backward()
                pbar.update(1)

        if GRAD_CLIP != 0.0:
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        t1 = time.time()
        dt = t1 - t0
        t0 = t1
        if iter_num % LOG_INTERVAL == 0 and master_process:
            # Reported loss is the last micro-batch's loss, rescaled back.
            print(f"iter {iter_num}: loss {loss.item() * gradient_accumulation_steps:.4f}, time {dt * 1000:.2f}ms")

        iter_num += 1

    if ddp:
        dist.destroy_process_group()

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--data_path DATA_PATH]
                             [--batch_size BATCH_SIZE]
                             [--block_size BLOCK_SIZE] [--n_layer N_LAYER]
                             [--n_head N_HEAD] [--n_embd N_EMBD]
                             [--dropout DROPOUT]
                             [--learning_rate LEARNING_RATE]
                             [--max_iters MAX_ITERS] [--grad_clip GRAD_CLIP]
                             [--eval_interval EVAL_INTERVAL]
                             [--eval_iters EVAL_ITERS]
                             [--log_interval LOG_INTERVAL]
                             [--init_from INIT_FROM] [--out_dir OUT_DIR]
                             [--backend BACKEND] [--device DEVICE]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/nielspace/Library/Jupyter/runtime/kernel-v2-83046Y7ecPvgzXVVZ.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
