In [1]:
import torch 
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import transformers
import tiktoken

import math

from tqdm import tqdm

import os
import time

In [2]:
input_file = 'tiny_shakespeare.txt'
data_dir = os.path.join(os.getcwd(), 'data')
input_file_path = os.path.join(data_dir, input_file)
with open(input_file_path, 'r') as f:
    data = f.read()

In [3]:
class TextDataset(Dataset):
    def __init__(self, data, model="gpt2", seq_length=400):
        tokenizer = tiktoken.get_encoding(model)
        self.tokens = tokenizer.encode(data)
        self.seq_length = seq_length
        
        self.x, self.y = self.create_sequences()

    def create_sequences(self):
        x, y = [], []
        for i in range(0, len(self.tokens) - self.seq_length, self.seq_length):
            x.append(self.tokens[i:i+self.seq_length])
            y.append(self.tokens[i+1:i+1+self.seq_length])
        return x, y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        input_seq = torch.tensor(self.x[idx], dtype=torch.long)
        target_seq = torch.tensor(self.y[idx], dtype=torch.long)
        sample = {'input': input_seq, 'target': target_seq}
        return sample

In [4]:
def pad_sequences(batch):
    input_seqs = [item['input'] for item in batch]
    target_seqs = [item['target'] for item in batch]

    input_padded = torch.nn.utils.rnn.pad_sequence(input_seqs, batch_first=True, padding_value=0)
    target_padded = torch.nn.utils.rnn.pad_sequence(target_seqs, batch_first=True, padding_value=0)

    return {'input': input_padded, 'target': target_padded}

In [5]:
dataset = TextDataset(data)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_sequences)

In [6]:
dataset.tokens

[5962,
 22307,
 25,
 198,
 8421,
 356,
 5120,
 597,
 2252,
 11,
 3285,
 502,
 2740,
 13,
 198,
 198,
 3237,
 25,
 198,
 5248,
 461,
 11,
 2740,
 13,
 198,
 198,
 5962,
 22307,
 25,
 198,
 1639,
 389,
 477,
 12939,
 2138,
 284,
 4656,
 621,
 284,
 1145,
 680,
 30,
 198,
 198,
 3237,
 25,
 198,
 4965,
 5634,
 13,
 12939,
 13,
 198,
 198,
 5962,
 22307,
 25,
 198,
 5962,
 11,
 345,
 760,
 327,
 1872,
 385,
 1526,
 28599,
 318,
 4039,
 4472,
 284,
 262,
 661,
 13,
 198,
 198,
 3237,
 25,
 198,
 1135,
 760,
 470,
 11,
 356,
 760,
 470,
 13,
 198,
 198,
 5962,
 22307,
 25,
 198,
 5756,
 514,
 1494,
 683,
 11,
 290,
 356,
 1183,
 423,
 11676,
 379,
 674,
 898,
 2756,
 13,
 198,
 3792,
 470,
 257,
 15593,
 30,
 198,
 198,
 3237,
 25,
 198,
 2949,
 517,
 3375,
 319,
 470,
 26,
 1309,
 340,
 307,
 1760,
 25,
 1497,
 11,
 1497,
 0,
 198,
 198,
 12211,
 22307,
 25,
 198,
 3198,
 1573,
 11,
 922,
 4290,
 13,
 198,
 198,
 5962,
 22307,
 25,
 198,
 1135,
 389,
 17830,
 3595,
 4290,
 11,
 262,
 1458,


# Transformers

In [6]:
#configuration
class GPTConfig:
    def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd, dropout, bias=True):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

In [7]:
#attention
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        self.n_head = config.n_head
        self.d_k = config.n_embd // config.n_head
        self.scale = self.d_k ** -0.5

        self.qkv_proj = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.out_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv_proj(x).reshape(B, T, 3, self.n_head, self.d_k).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn_scores = (q @ k.transpose(-2, -1)) * self.scale
        attn_scores = attn_scores.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.attn_dropout(attn_probs)

        attn_output = (attn_probs @ v).transpose(1, 2).reshape(B, T, C)
        attn_output = self.resid_dropout(self.out_proj(attn_output))
        return attn_output

In [8]:
#transformer block
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

In [9]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None, "Config must include vocab_size"
        assert config.block_size is not None, "Config must include block_size"
        self.config = config

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'wpe': nn.Embedding(config.block_size, config.n_embd),
            'drop': nn.Dropout(config.dropout),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embd, eps=1e-5),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.tie_weights()

        self.apply(self._init_weights)
        self.init_residuals()
        print(f"Number of parameters: {self.get_num_params()/1e6:.2f}M")

    def tie_weights(self):
        self.transformer.wte.weight = self.lm_head.weight

    def init_residuals(self):
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Sequence length {t} exceeds block size {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        logits = self.lm_head(x)
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            return logits, loss
        else:
            logits = logits[:, [-1], :]  # Use only the last token's logits
            return logits, None

    def crop_block_size(self, block_size):
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:, :, :block_size, :block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}, "Invalid model type"
        override_args = override_args or {}
        assert all(k == 'dropout' for k in override_args), "Only 'dropout' can be overridden"

        print(f"Loading weights from pretrained model: {model_type}")

        config_args = cls.get_config_args(model_type)
        if 'dropout' in override_args:
            print(f"Overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']

        config = GPTConfig(**config_args)
        model = GPT(config)
        model.load_pretrained_weights(model_type)
        return model

    @staticmethod
    def get_config_args(model_type):
        config_map = {
            'gpt2': {'n_layer': 12, 'n_head': 12, 'n_embd': 768},
            'gpt2-medium': {'n_layer': 24, 'n_head': 16, 'n_embd': 1024},
            'gpt2-large': {'n_layer': 36, 'n_head': 20, 'n_embd': 1280},
            'gpt2-xl': {'n_layer': 48, 'n_head': 25, 'n_embd': 1600},
        }
        config_args = config_map[model_type]
        config_args.update({'vocab_size': 50257, 'block_size': 1024, 'bias': True, 'dropout': 0.1})
        return config_args

    def load_pretrained_weights(self, model_type):
        model_hf = transformers.GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd = self.state_dict()
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        for k, v in sd_hf.items():
            if k in sd:
                if any(k.endswith(w) for w in transposed):
                    sd[k].copy_(v.t())
                else:
                    sd[k].copy_(v)

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)
        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        N = self.get_num_params()
        L, H, Q, T = self.config.n_layer, self.config.n_head, self.config.n_embd // self.config.n_head, self.config.block_size
        flops_per_token = 6 * N + 12 * L * H * Q * T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        flops_achieved = flops_per_iter * (1.0 / dt)
        flops_promised = 312e12
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


In [10]:
config = GPTConfig(
    vocab_size=50257, 
    block_size=1024, 
    n_layer=12, 
    n_head=12, 
    n_embd=768, 
    dropout=0.1, 
    bias=True)

In [11]:
customGPT = GPT(config)

Number of parameters: 123.65M


# Training

In [12]:
#training dependencies
import numpy as np
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import GradScaler
from contextlib import nullcontext
import torch.distributed as dist
from torch.distributed import init_process_group, destroy_process_group

In [1]:
# Default config values designed to train a GPT-2 (124M) on OpenWebText
# I/O
out_dir = 'out'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False
always_save_checkpoint = True
init_from = 'scratch'  # 'scratch' or 'resume' or 'gpt2*'

# Data
# data_path = 'data/openwebtext.txt'  # Path to your text file
gradient_accumulation_steps = 5 * 8
batch_size = 12
block_size = 1024

# Model
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0
bias = False

# AdamW optimizer
learning_rate = 6e-4
max_iters = 600000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# Learning rate decay settings
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

# DDP settings
backend = 'gloo'

# System
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
dtype = 'float32'  # MPS currently supports only float32
compile = False  # Disable compilation for now as PyTorch 2.0 is not yet stable on MPS
# -----------------------------------------------------------------------------

# Collect configuration keys
config_keys = [k for k, v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
config = {k: globals()[k] for k in config_keys}

# Various initializations and derived attributes
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'mps:{ddp_local_rank}' if torch.backends.mps.is_available() else f'cuda:{ddp_local_rank}'
    torch.mps.set_device(device)
    master_process = ddp_rank == 0
    seed_offset = ddp_rank
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    master_process = True
    seed_offset = 0
    ddp_world_size = 1

tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"Tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)

torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'mps' if 'mps' in device else 'cpu'
ptdtype = torch.float32  # Currently, MPS supports only float32
ctx = nullcontext()

NameError: name 'torch' is not defined

In [2]:
# Initialize iteration number and best validation loss
iter_num = 0
best_val_loss = 1e9

# Model initialization
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=len(dataset.tokens), dropout=dropout)

if init_from == 'scratch':
    print("Initializing a new model from scratch")
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)

if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size

model.to(device)

NameError: name 'dataset' is not defined

In [15]:
# Initialize a GradScaler
scaler = GradScaler(enabled=(dtype == 'float16' and 'cuda' in device))

# Configure the optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None  # Free up memory

# Wrap model into DDP container if needed
if ddp:
    print(f"Starting parallel process with rank {ddp_rank}")
    model = DDP(model, device_ids=[ddp_local_rank])


# Function to estimate loss over splits
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        with tqdm(total=eval_iters, desc=f"Evaluating {split}") as pbar:
            for k in range(eval_iters):
                batch = next(iter(dataloader))
                X, Y = batch['input'].to(device), batch['target'].to(device)
                with ctx:
                    logits, loss = model(X, Y)
                losses[k] = loss.item()
                pbar.update(1)
        out[split] = losses.mean()
    model.train()
    return out

# Learning rate decay scheduler
def get_lr(it):
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

In [16]:
raw_model = model.module if ddp else model
local_iter_num = 0
running_mfu = -1.0
t0 = time.time()

while iter_num <= max_iters:
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    if iter_num == 0 and eval_only:
        break

    with tqdm(total=gradient_accumulation_steps, desc=f"Training step {iter_num}") as pbar:
        for micro_step in range(gradient_accumulation_steps):
            if ddp:
                model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
            batch = next(iter(dataloader))
            X, Y = batch['input'].to(device), batch['target'].to(device)
            with ctx:
                logits, loss = model(X, Y)
                loss = loss / gradient_accumulation_steps
            scaler.scale(loss).backward()
            pbar.update(1)

    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5:
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt * 1000:.2f}ms, mfu {running_mfu * 100:.2f}%")
    iter_num += 1
    local_iter_num += 1

if ddp:
    destroy_process_group()

Evaluating train: 100%|██████████| 200/200 [04:33<00:00,  1.37s/it]
Evaluating val: 100%|██████████| 200/200 [04:57<00:00,  1.49s/it]


step 0: train loss 12.9407, val loss 12.9414


Training step 0: 100%|██████████| 40/40 [18:01<00:00, 27.04s/it]


iter 0: loss 12.9594, time 1656955.16ms, mfu -100.00%


Training step 1:   0%|          | 0/40 [00:02<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 15.47 GB, other allocations: 1006.20 MB, max allowed: 18.13 GB). Tried to allocate 2.01 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).