In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from hellaswag import render_example, iterate_examples
from dataclasses import dataclass
import os



class CausalSelfAttention(nn.Module):
    """Multi-head self-attention restricted to current and earlier positions.

    Queries, keys and values for all heads are produced by one fused linear
    layer; the attention itself is delegated to
    ``F.scaled_dot_product_attention`` with ``is_causal=True``.

    Args:
        config: object exposing ``n_embd`` (channel count) and ``n_head``
            (number of heads). ``n_embd`` must be a multiple of ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # one fused projection that yields q, k and v for every head
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # projection applied after the heads have been concatenated again
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # marker looked up via hasattr() in GPT._init_weights to shrink the init std
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        bsz, seq_len, width = x.size()
        head_dim = width // self.n_head  # C = n_head * head_dim

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim): heads behave like a batch dim
            return t.view(bsz, seq_len, self.n_head, head_dim).transpose(1, 2)

        query, key, value = self.c_attn(x).split(self.n_embd, dim=2)
        attended = F.scaled_dot_product_attention(
            to_heads(query), to_heads(key), to_heads(value), is_causal=True
        )
        # (B, n_head, T, head_dim) -> (B, T, C): put the heads back side by side
        merged = attended.transpose(1, 2).contiguous().view(bsz, seq_len, width)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4x ``n_embd``, tanh-approximated GELU, project back.

    Args:
        config: object exposing ``n_embd``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_width)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden_width, config.n_embd)
        # marker looked up via hasattr() in GPT._init_weights to shrink the init std
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Return ``c_proj(gelu(c_fc(x)))``; the last dimension stays ``n_embd``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

class Block(nn.Module):
    """One transformer layer: attention then MLP, each behind a LayerNorm and a residual add.

    Args:
        config: object exposing ``n_embd``; it is also handed unchanged to
            ``CausalSelfAttention`` and ``MLP``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after both residual updates; the shape is unchanged."""
        # normalisation sits on the branch input, so the residual path stays untouched
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

@dataclass
class GPTConfig:
    """Size hyperparameters read by ``GPT`` and its sub-modules.

    The default ``n_head``/``n_embd`` pair matches the GPT-2 (124M) example
    quoted in ``CausalSelfAttention.forward`` (12 heads, 768 channels).
    """

    # max sequence length
    block_size: int = 1024
    # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    vocab_size: int = 50257
    # number of layers
    n_layer: int = 12
    # number of heads
    n_head: int = 12
    # embedding dimension
    n_embd: int = 768

class GPT(nn.Module):
    """Language model: token + position embeddings, ``n_layer`` ``Block``s, final LayerNorm, LM head.

    The token-embedding matrix and the output projection share one parameter.

    Args:
        config: object exposing ``vocab_size``, ``block_size``, ``n_layer``
            and ``n_embd`` (plus whatever ``Block`` reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict({
            "wte": nn.Embedding(config.vocab_size, config.n_embd),
            "wpe": nn.Embedding(config.block_size, config.n_embd),
            "h": nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            "ln_f": nn.LayerNorm(config.n_embd),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # tie input and output embeddings: both names refer to the same Parameter
        self.transformer.wte.weight = self.lm_head.weight

        # run the custom initialiser over every sub-module
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise ``module`` in place; only ``nn.Linear`` and ``nn.Embedding`` are touched.

        Weights are drawn from N(0, 0.02^2). Linear layers carrying a
        ``NANOGPT_SCALE_INIT`` attribute use a std further multiplied by
        ``(2 * n_layer) ** -0.5``. Linear biases are set to zero.
        """
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            return
        if not isinstance(module, nn.Linear):
            return
        # the scale factor is only evaluated for flagged layers
        shrink = (2 * self.config.n_layer) ** -0.5 if hasattr(module, 'NANOGPT_SCALE_INIT') else 1.0
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 * shrink)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the mean cross-entropy.

        Args:
            idx: integer token ids of shape (B, T) with ``T <= config.block_size``.
            targets: optional token ids with the same number of elements as ``idx``.

        Returns:
            ``(logits, loss)``: logits has shape (B, T, vocab_size); loss is
            ``None`` when ``targets`` is ``None``.
        """
        _, seq_len = idx.size()
        assert seq_len <= self.config.block_size, f"Cannot forward sequence of length {seq_len}, block size is only {self.config.block_size}"
        # token embeddings are (B, T, n_embd); position embeddings are (T, n_embd) and broadcast over B
        positions = torch.arange(seq_len, dtype=torch.long, device=idx.device)
        hidden = self.transformer.wte(idx) + self.transformer.wpe(positions)
        for layer in self.transformer.h:
            hidden = layer(hidden)
        logits = self.lm_head(self.transformer.ln_f(hidden))  # (B, T, vocab_size)
        if targets is None:
            return logits, None
        # flatten batch and time so every position is one classification example
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


def get_most_likely_row(tokens, mask, logits):
    """Return the index of the row whose masked region has the lowest mean next-token loss.

    Args:
        tokens: integer token ids, one candidate sequence per row.
        mask: same layout as ``tokens``; entries equal to 1 mark the completion
            region that is scored, 0 marks positions that are ignored.
        logits: model outputs for ``tokens`` with the vocabulary on the last axis.

    Returns:
        Row index (Python int) with the smallest average loss over its masked positions.
    """
    row_count = tokens.size(0)
    vocab = logits.size(-1)
    # the logit at position t predicts the token at t + 1, so drop the last
    # logit and the first token to line predictions up with their targets
    predictions = logits[..., :-1, :].contiguous().view(-1, vocab)
    next_tokens = tokens[..., 1:].contiguous().view(-1)
    per_token_loss = F.cross_entropy(predictions, next_tokens, reduction='none')
    per_token_loss = per_token_loss.view(row_count, -1)
    # the mask is shifted the same way so it stays aligned with the targets
    completion_mask = mask[..., 1:].contiguous()
    # mean over masked positions only: masked sum divided by the number of 1s
    mean_loss = (per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)
    # the lowest loss marks the completion the model finds most likely
    return mean_loss.argmin().item()


In [4]:
from tqdm import tqdm

# Pick the execution device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

# Load the checkpoint
checkpoint_path = "log/model_10000.pt"  # Adjust this path as needed
# The checkpoint holds a config object next to the weights (GPT(config) reads
# attributes from it), so it cannot be restored by the restricted
# weights_only unpickler. State weights_only=False explicitly instead of
# relying on the library default, which differs between PyTorch releases.
# SECURITY: full unpickling can execute arbitrary code -- only load checkpoint
# files that come from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)

# Create the model from the stored config
config = checkpoint['config']
model = GPT(config)

# Remove the "_orig_mod." prefix from the state dict keys
# (presumably added when the training run wrapped the model with torch.compile -- verify)
state_dict = checkpoint['model']
new_state_dict = {k.replace('_orig_mod.', ''): v for k, v in state_dict.items()}

# Load the modified state dict
model.load_state_dict(new_state_dict)

model.to(device)
model.eval()

# Evaluate on HellaSwag
num_correct_norm = 0
num_total = 0

# Gradients are never needed here, so disable autograd once for the whole pass.
with torch.no_grad():
    for example in tqdm(iterate_examples("val")):
        _, tokens, mask, label = render_example(example)
        tokens = tokens.to(device)
        mask = mask.to(device)

        logits, _ = model(tokens)
        pred_norm = get_most_likely_row(tokens, mask, logits)

        num_total += 1
        num_correct_norm += int(pred_norm == label)

acc_norm = num_correct_norm / num_total
print(f"HellaSwag accuracy: {num_correct_norm}/{num_total}={acc_norm:.4f}")

Using device: mps


  checkpoint = torch.load(checkpoint_path, map_location='cpu')
10042it [17:20,  9.65it/s]

HellaSwag accuracy: 2903/10042=0.2891





In [20]:
# Checkpoint step identifiers to evaluate; each maps to log/model_<name>.pt.
# Relies on `device` having been set by an earlier cell.
names = ['10000', '15000']
for name in names:
    checkpoint_path = f"log/model_{name}.pt"

    print(f"Evaluating {checkpoint_path}")

    # Load the checkpoint.
    # The checkpoint holds a config object next to the weights (GPT(config)
    # reads attributes from it), so it cannot be restored by the restricted
    # weights_only unpickler. State weights_only=False explicitly instead of
    # relying on the library default, which differs between PyTorch releases.
    # SECURITY: full unpickling can execute arbitrary code -- only load
    # checkpoint files that come from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)

    # Create the model from the stored config
    config = checkpoint['config']
    model = GPT(config)

    # Remove the "_orig_mod." prefix from the state dict keys
    # (presumably added when the training run wrapped the model with torch.compile -- verify)
    state_dict = checkpoint['model']
    new_state_dict = {k.replace('_orig_mod.', ''): v for k, v in state_dict.items()}

    # Load the modified state dict
    model.load_state_dict(new_state_dict)

    model.to(device)
    model.eval()

    # Evaluate on HellaSwag
    num_correct_norm = 0
    num_total = 0

    # Gradients are never needed here, so disable autograd once for the whole pass.
    with torch.no_grad():
        for example in tqdm(iterate_examples("val")):
            _, tokens, mask, label = render_example(example)
            tokens = tokens.to(device)
            mask = mask.to(device)

            logits, _ = model(tokens)
            pred_norm = get_most_likely_row(tokens, mask, logits)

            num_total += 1
            num_correct_norm += int(pred_norm == label)

    acc_norm = num_correct_norm / num_total
    print(f"HellaSwag accuracy: {num_correct_norm}/{num_total}={acc_norm:.4f}")

Evaluating log/model_05000.pt


  checkpoint = torch.load(checkpoint_path, map_location='cpu')
10042it [21:24,  7.82it/s]


HellaSwag accuracy: 2751/10042=0.2739
Evaluating log/model_10000.pt


1166it [01:18, 16.35it/s]

In [19]:
import tiktoken
# Sample continuations of a fixed prompt from the currently loaded `model`.
# Relies on `model` and `device` having been set by an earlier cell.
enc = tiktoken.get_encoding("gpt2")

model.eval()
num_return_sequences = 2
max_length = 128
tokens = enc.encode("Hello, I'm a student who loves playing badminton,")
tokens = torch.tensor(tokens, dtype=torch.long)
# replicate the prompt so that B = num_return_sequences rows are sampled in parallel
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
xgen = tokens.to(device)
# dedicated generator so sampling is reproducible and independent of the global RNG
sample_rng = torch.Generator(device=device)
sample_rng.manual_seed(42 + 1)
# Gradients are never needed while sampling, so disable autograd once for the whole loop.
with torch.no_grad():
    while xgen.size(1) < max_length:
        # forward the model to get the logits
        logits, _ = model(xgen) # (B, T, vocab_size)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs and topk_indices are both (B, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1, generator=sample_rng) # (B, 1)
        # map the position within the top-k back to a vocabulary id
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        xgen = torch.cat((xgen, xcol), dim=1)
# print the generated text
for i in range(num_return_sequences):
    tokens = xgen[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(f"sample {i}: {decoded}")

sample 0: Hello, I'm a student who loves playing badminton, and I see how it's also great! That's my way of saying I love baseball. If you don't want to go back and play it, I wouldn't say hello for you. That is a fun way of saying I like goodminton.
And with lots of other good lessons, I mean lessons about the badminton. The trick isn't in the way when i'm playing badminton. The problem here is if you have an "at least" good lesson that is going to show me the badminton lessons. If a lesson is
sample 1: Hello, I'm a student who loves playing badminton, but I love playing basketball! When I asked where I play, she answered "it's the game of basketball."
I'm from an elementary school. I play and play basketball! What do I think of that word? Why should we put that in the name of basketball?
In the past, I've been doing other things, including playing badminton. I was introduced to badminton and started my own business, which helped me to learn more about it and the game. I'm still very 