In [1]:
import mmap
import os
import random

import torch
import torch.nn as nn
from torch.nn import functional as F

# Pick the best available accelerator and fall back to the CPU, so the notebook
# also runs on machines that have neither Apple-Silicon (MPS) nor CUDA support.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# Hyperparameters
# -- batching --
batch_size = 64        # sequences sampled per batch
block_size = 16        # context length, in characters
# -- optimisation --
learning_rate = 3e-4   # AdamW step size
max_iters = 5000       # optimisation steps per training run
# -- evaluation --
eval_interval = 500    # report losses every this many steps
eval_iters = 200       # batches averaged per split when estimating the loss
# -- model --
n_embd = 384           # embedding width
n_head = 6             # attention heads per block
n_layer = 6            # number of transformer blocks
dropout = 0.2          # dropout probability

In [3]:
# Generate Vocabulary
# The vocabulary is the sorted collection of distinct characters in the vocab file.
with open('openwebtext/char_vocab.txt', 'r', encoding='utf-8') as vocab_file:
    text = vocab_file.read()

chars = sorted(set(text))
vocab_size = len(chars)

In [4]:
# Character-Level Tokenizer
# Two lookup tables over the sorted vocabulary: id -> character and character -> id.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: i for i, ch in int_to_string.items()}


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises KeyError if ``s`` contains a character missing from the vocabulary.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``."""
    return ''.join(int_to_string[i] for i in l)

In [5]:
# Memory Map Implementation
def get_random_chunk(split):
    """Read one random contiguous window of a split file and tokenize it.

    The file is memory-mapped so only the requested window is read instead of
    loading the whole corpus.

    Args:
        split: "train" selects openwebtext/train_split.txt; any other value
            selects openwebtext/val_split.txt.

    Returns:
        A 1-D ``torch.long`` tensor of token ids with at most
        ``batch_size*block_size - 1`` entries. It can be shorter because bytes
        that do not decode as UTF-8 are dropped and carriage returns are removed.

    Raises:
        ValueError: if the file holds fewer than ``batch_size*block_size`` bytes.
        KeyError: from ``encode`` when the window contains a character that is
            not in the vocabulary.
    """
    filename = "openwebtext/train_split.txt" if split == "train" else "openwebtext/val_split.txt"
    chunk_bytes = batch_size * block_size

    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            if file_size < chunk_bytes:
                raise ValueError(
                    f"{filename} has {file_size} bytes; at least {chunk_bytes} are required"
                )
            start_pos = random.randint(0, file_size - chunk_bytes)

            mm.seek(start_pos)
            block = mm.read(chunk_bytes - 1)

    # The window may start or end in the middle of a multi-byte UTF-8 sequence;
    # errors='ignore' drops those partial characters. Carriage returns ('\r')
    # are stripped so Windows-style line endings collapse to '\n'.
    decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')

    data = torch.tensor(encode(decoded_block), dtype=torch.long)  # convert to usable tensors

    return data
    
# Input-Target Parallel Implementation
def get_batch(split):
    """Sample a batch of (input, target) windows from a random chunk of ``split``.

    Returns a pair of ``(batch_size, block_size)`` tensors on ``device``; each
    target row is its input row shifted one position to the right.
    """
    chunk = get_random_chunk(split)
    # batch_size random window starts, each leaving room for block_size + 1 tokens
    starts = torch.randint(len(chunk) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(chunk[start:start + block_size])
        targets.append(chunk[start + 1:start + block_size + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [6]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [7]:
class Head(nn.Module):
    """A single causal self-attention head.

    Reads the module-level ``n_embd``, ``block_size`` and ``dropout`` settings.
    Input is (B, T, C) with C == n_embd; output is (B, T, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        # Three independent bias-free projections from the embedding to head_size.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix, built once and stored as a buffer.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, head_size)
        queries = self.query(x)   # (B, T, head_size)

        # Scaled dot-product scores between every pair of positions: (B, T, T).
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale

        # Positions above the diagonal (the future) are set to -inf so that the
        # softmax assigns them zero weight.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the value vectors: (B, T, T) @ (B, T, head_size).
        values = self.value(x)
        return weights @ values

In [8]:
class MultiHeadAttention(nn.Module):
    """Several attention heads whose outputs are concatenated and projected.

    Reads the module-level ``n_embd`` and ``dropout`` settings.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_list)
        # Learned projection from the concatenated heads back to n_embd.
        concat_width = num_heads * head_size
        self.proj = nn.Linear(concat_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Each head maps x to (B, T, head_size); join them on the feature axis.
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # (B, T, num_heads*head_size)
        return self.dropout(self.proj(merged))

In [9]:
class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4*n_embd, ReLU, project back, then dropout.

    The dropout probability comes from the module-level ``dropout`` setting.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

In [10]:
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward.

    Each sub-layer is wrapped in a residual connection, and LayerNorm is
    applied after the residual sum.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Split the embedding evenly across the heads.
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))

In [11]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Architecture: token embedding + learned position embedding, a stack of
    ``n_layer`` transformer blocks, a final LayerNorm and a linear head that
    produces one logit per vocabulary entry. The sizes come from the
    module-level ``n_embd``, ``n_head``, ``n_layer`` and ``block_size``.

    Args:
        vocab_size: number of distinct token ids.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Blocks are applied one after another.
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)  # hidden representations -> vocab logits

        self.apply(self._init_weights)  # visits every sub-module recursively

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)  # trailing underscore: in-place
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of the ids to predict.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = index.shape

        tok_emb = self.token_embedding_table(index)  # (B, T, C)
        # Build the positions on the input's own device so the lookup works
        # wherever the model and its input live, independent of any global.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # broadcast over the batch -> (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, V = logits.shape
            # cross_entropy expects one row of logits per prediction.
            logits = logits.view(B * T, V)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    idx_cond = idx[:, -block_size:]  # the model sees at most block_size tokens
                    logits, _ = self(idx_cond)
                    logits = logits[:, -1, :]  # prediction at the last position -> (B, V)
                    probs = F.softmax(logits, dim=-1)  # (B, V)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# Build the model on the selected device. nn.Module.to returns the module
# itself, so `m` is simply a second name for the same object.
model = GPTLanguageModel(vocab_size).to(device)
m = model

In [12]:
# Training Loop
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`, so the builtin iter() is not shadowed for later cells.
for step in range(max_iters):

    # Periodically report the loss averaged over eval_iters batches per split.
    if step % eval_interval == 0:
        losses = estimate_loss()
        # Double-quoted f-string: reusing the outer quote character inside the
        # braces is a SyntaxError on Python versions before 3.12.
        print(f"step: {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    # Call the module rather than .forward() so registered hooks are honoured.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step: 0: train loss 10.4944, val loss 10.4983
step: 500: train loss 2.2251, val loss 2.2543
step: 1000: train loss 2.1088, val loss 2.0992
step: 1500: train loss 2.0458, val loss 2.0427
step: 2000: train loss 1.9942, val loss 1.9630
step: 2500: train loss 1.9859, val loss 1.9635
step: 3000: train loss 1.8906, val loss 1.8843
step: 3500: train loss 1.9922, val loss 1.9795
step: 4000: train loss 1.8998, val loss 1.8604
step: 4500: train loss 1.8572, val loss 1.8683


In [13]:
# Save the model
# The checkpoint goes under model_weights/ because that is the path the
# loading cell reads from; the directory is created on first use.
filename = 'model_weights/model-v1-01.pt'
os.makedirs(os.path.dirname(filename), exist_ok=True)
torch.save(model.state_dict(), filename)
print("Model saved to", filename)

Model saved to model-v1-01.pt


In [15]:
# Load the model weights/params
model = GPTLanguageModel(vocab_size)

# map_location='cpu' lets a checkpoint saved from an accelerator (MPS/CUDA) be
# read on a machine without that backend; the freshly built model is on the CPU
# at this point and is moved to `device` below.
model.load_state_dict(
    torch.load('model_weights/model-v1-01.pt', map_location='cpu', weights_only=True)
)
print("Model loaded successfully!")

m = model.to(device)

Model loaded successfully!


In [None]:
# Run multiple iterations
num_iterations = 5

for iteration in range(num_iterations):
    # Training Loop
    # NOTE(review): a new AdamW instance per round discards the optimizer's
    # moment estimates from the previous round - confirm this restart is intended.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iters):
        if step % eval_interval == 0:
            losses = estimate_loss()
            print(f'step: {step}: train loss {losses["train"]:.4f}, val loss {losses["val"]:.4f}')

        xb, yb = get_batch('train')
        # Call the module rather than .forward() so registered hooks are honoured.
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Save model after each full training iteration
    filename = f'model-01-iter{iteration+1}.pt'
    torch.save(model.state_dict(), filename)
    print(f"Model saved to {filename}")

print("All iterations completed!")

In [16]:
# Generate the new tokens
# Start from a single token with id 0, sample 500 more, and decode the first
# (only) sequence of the batch back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)


While is nob. Sendivey a swirth our roces. Wate.” But the emale point, countrey plose) Boy rap for autors that's misiument. In heard it procies was no amoung for likes has got on annoning is not Connecknobby, pots. 2047 came fore throughld versional continued paticing hourning in augh Her Crible than "us there. Univers plannehing in in 1400. Chebinhe musing has make, one. It call artly -emnerian styles's News poin New potent such weall as could it was least food our comperizanise. I'm mamaging I
