In [14]:
pip install tiktoken



In [4]:
class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B  # batch size
        self.T = T  # sequence length

        # Load tokens from disk and store them in memory
        with open('input.txt', 'r') as f:
            text = f.read()

        # Initialize tokenizer
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f'Loaded {len(self.tokens):,} tokens')
        print(f'1 epoch = {len(self.tokens) // (B * T):,} batches')

        # Initialize position
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T

        # Get the next batch of tokens
        buf = self.tokens[self.current_position:self.current_position + B*T + 1]

        # Reset position if we're at the end
        if len(buf) < B*T + 1:
            self.current_position = 0
            buf = self.tokens[:B*T + 1]

        # Create input and target tensors
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)

        # Update position
        self.current_position += B*T

        return x, y


In [16]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken

In [6]:
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        # att = F.softmax(att, dim=-1)
        # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True) # Flash attention

        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


In [7]:
class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

In [8]:
class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

In [9]:
@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50304 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension
    dropout: float = 0.1  # Added dropout parameter


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing
        self.transformer.wte.weight = self.lm_head.weight

        # weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)



    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)

        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"

        print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer


In [10]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

# Optimize training configuration for Colab
def get_training_config(device):
    if device == 'cuda':  # Colab GPU configuration
        block_size = 512  # Reduced from 1024 for Colab memory
        return {
            'batch_size': 8,            # Reduced batch size for Colab GPU
            'seq_length': block_size,
            'max_lr': 3e-4,             # Slightly reduced learning rate
            'min_lr': 1e-5,
            'warmup_steps': 1000,
            'max_steps': 100000,
            'gradient_accumulation_steps': 4,  # Increased for effective batch size
            'log_interval': 50,
            'model_config': GPTConfig(
                block_size=block_size,
                n_layer=8,              # Reduced from 12
                n_head=8,              # Reduced from 12
                n_embd=512,            # Reduced from 768
                dropout=0.1
            )
        }
    elif device == 'mps':
        return {
            'batch_size': 4,
            'seq_length': 128,
            'max_lr': 5e-4,
            'min_lr': 1e-5,
            'warmup_steps': 1000,
            'max_steps': 100000,
            'gradient_accumulation_steps': 8,
            'log_interval': 50,
            'model_config': GPTConfig(
                block_size=256,
                n_layer=6,
                n_head=8,
                n_embd=384,
                dropout=0.1
            )
        }
    else:  # CPU configuration
        return {
            'batch_size': 4,
            'seq_length': 256,
            'max_lr': 1e-4,
            'min_lr': 1e-5,
            'warmup_steps': 1000,
            'max_steps': 100000,
            'gradient_accumulation_steps': 8,
            'log_interval': 50,
            'model_config': GPTConfig(
                block_size=256,
                n_layer=6,
                n_head=6,
                n_embd=384,
                dropout=0.1
            )
        }


using device: cuda


In [11]:
# Optimized learning rate scheduler
class CosineWarmupScheduler:
    def __init__(self, max_lr, min_lr, warmup_steps, max_steps):
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps

    def get_lr(self, step):
        if step < self.warmup_steps:
            return self.max_lr * (step + 1) / self.warmup_steps
        if step > self.max_steps:
            return self.min_lr
        decay_ratio = (step - self.warmup_steps) / (self.max_steps - self.warmup_steps)
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
        return self.min_lr + coeff * (self.max_lr - self.min_lr)

# Add model saving function
def save_checkpoint(model, optimizer, config, step, loss, best_loss, save_path):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'config': config,
        'step': step,
        'loss': loss,
        'best_loss': best_loss
    }
    torch.save(checkpoint, save_path)
    #print(f"Checkpoint saved: {save_path}")

In [12]:
# Simplify train_model function
def train_model(model, train_loader, config, device):
    optimizer = model.configure_optimizers(
        weight_decay=0.1,
        learning_rate=config['max_lr'],
        device_type=device
    )

    scheduler = CosineWarmupScheduler(
        max_lr=config['max_lr'],
        min_lr=config['min_lr'],
        warmup_steps=config['warmup_steps'],
        max_steps=config['max_steps']
    )

    model.train()
    total_tokens = 0
    best_loss = float('inf')

    # Create checkpoints directory if it doesn't exist
    os.makedirs('checkpoints', exist_ok=True)

    print("\n=== Starting Training ===")
    print(f"Training for {config['max_steps']:,} steps")
    print(f"Logging every {config['log_interval']} steps\n")

    for step in range(config['max_steps']):
        t0 = time.time()
        optimizer.zero_grad(set_to_none=True)
        accumulated_loss = 0

        for micro_step in range(config['gradient_accumulation_steps']):
            x, y = train_loader.next_batch()
            x, y = x.to(device), y.to(device)

            if device == 'mps':
                logits, loss = model(x, y)
            else:
                with torch.autocast(device_type=device, dtype=torch.bfloat16):
                    logits, loss = model(x, y)

            loss = loss / config['gradient_accumulation_steps']
            loss.backward()
            accumulated_loss += float(loss.detach().cpu().item())

            del logits, loss
            if device == 'mps':
                torch.mps.empty_cache()

        # Gradient clipping
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Learning rate update
        lr = scheduler.get_lr(step)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        # Modified logging frequency
        if step % config['log_interval'] == 0:
            elapsed = time.time() - t0
            print(f"Step {step:6d}/{config['max_steps']:6d} | "
                  f"Loss: {accumulated_loss:.4f} | "
                  f"LR: {lr:.2e} | "
                  f"Best: {best_loss:.4f} | "
                  f"Time: {elapsed:.2f}s")

        # Track best loss
        if accumulated_loss < best_loss:
            best_loss = accumulated_loss

        if accumulated_loss < 0.099999:
            print(f"\n🎉 Target loss achieved at step {step:,}!")
            save_checkpoint(
                model=model,
                optimizer=optimizer,
                config=config,
                step=step,
                loss=accumulated_loss,
                best_loss=best_loss,
                save_path=f'checkpoints/target_achieved_model.pt'
            )
            break

    print("\n=== Training Complete ===")
    print(f"Best Loss: {best_loss:.6f}")
    print(f"Total Steps: {step + 1:,}")
    return model

# Add load checkpoint function
def load_checkpoint(path, device):
    checkpoint = torch.load(path, map_location=device)
    config = checkpoint['config']
    model = GPT(config['model_config']).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])

    optimizer = model.configure_optimizers(
        weight_decay=0.1,
        learning_rate=config['max_lr'],
        device_type=device
    )
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    return model, optimizer, config, checkpoint['step'], checkpoint['loss'], checkpoint['best_loss']

# Add detailed logging function
def log_config(config, model, device):
    print("\n=== Training Configuration ===")
    print(f"Device: {device}")
    print("\nModel Architecture:")
    print(f"- Layers: {config['model_config'].n_layer}")
    print(f"- Heads: {config['model_config'].n_head}")
    print(f"- Embedding Dim: {config['model_config'].n_embd}")
    print(f"- Block Size: {config['model_config'].block_size}")
    print(f"- Vocab Size: {config['model_config'].vocab_size}")

    print("\nTraining Parameters:")
    print(f"- Batch Size: {config['batch_size']}")
    print(f"- Sequence Length: {config['seq_length']}")
    print(f"- Gradient Accumulation Steps: {config['gradient_accumulation_steps']}")
    print(f"- Effective Batch Size: {config['batch_size'] * config['gradient_accumulation_steps']}")
    print(f"- Learning Rate: {config['max_lr']}")

    # Calculate model size
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    total_size = param_size + buffer_size

    print("\nModel Statistics:")
    print(f"- Total Parameters: {total_params:,}")
    print(f"- Trainable Parameters: {trainable_params:,}")
    print(f"- Model Size: {total_size/1024/1024:.2f} MB")
    print("="*30 + "\n")

In [17]:
if __name__ == "__main__":
    # Get configuration and initialize model
    config = get_training_config(device)
    model = GPT(config['model_config']).to(device)

    # Log detailed configuration
    log_config(config, model, device)

    # Initialize data loader
    train_loader = DataLoaderLite(
        B=config['batch_size'],
        T=config['seq_length']
    )

    # Train model
    torch.set_float32_matmul_precision('high')
    model = train_model(model, train_loader, config, device)


=== Training Configuration ===
Device: cuda

Model Architecture:
- Layers: 8
- Heads: 8
- Embedding Dim: 512
- Block Size: 512
- Vocab Size: 50304

Training Parameters:
- Batch Size: 8
- Sequence Length: 512
- Gradient Accumulation Steps: 4
- Effective Batch Size: 32
- Learning Rate: 0.0003

Model Statistics:
- Total Parameters: 51,237,888
- Trainable Parameters: 51,237,888
- Model Size: 203.46 MB

Loaded 338,025 tokens
1 epoch = 82 batches
num decayed parameter tensors: 34, with 51,183,616 parameters
num non-decayed parameter tensors: 66, with 54,272 parameters
using fused AdamW: True

=== Starting Training ===
Training for 100,000 steps
Logging every 50 steps

Step      0/100000 | Loss: 10.9322 | LR: 3.00e-07 | Best: inf | Time: 3.96s
Step     50/100000 | Loss: 9.3276 | LR: 1.53e-05 | Best: 9.3480 | Time: 2.35s
Step    100/100000 | Loss: 8.3049 | LR: 3.03e-05 | Best: 8.3672 | Time: 2.41s
Step    150/100000 | Loss: 7.3769 | LR: 4.53e-05 | Best: 7.1525 | Time: 2.42s
Step    200/100000