In [1]:
import transformers
from transformers import GPT2LMHeadModel
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()

for k, v in sd_hf.items():
    print(k, v.shape)

  from .autonotebook import tqdm as notebook_tqdm


transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [9]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768


class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size).view(1, 1, config.block_size, config.block_size)))
    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1,2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1,2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1,2)

        # att = torch.einsum("bhxc, bhyc -> bhxy", q, k)*(1.0/np.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:,:,:T, :T] == 0, float("-inf"))
        # att = F.softmax(att, dim=-1)
        # y = torch.einsum("bhax, bhxc -> bhac", att, v)
        
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y
    

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)
    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class GPT(nn.Module):
    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe= nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
            )
        )
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.lm_head.weight = self.transformer.wte.weight # weight sharing
        self.apply(self._init_weights)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, "NANOGPT_SCALE_INIT"):
                std *= (2*self.config.n_layer)**-0.5
            
            torch.nn.init.normal_(module.weight, mean=0.0, std=std) # We can modify that to take into account the size of the layers more precisely.
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        """
        input:
            idx: (B, T)
        """
        B, T = idx.size()
        assert T <= self.config.block_size
        pos = torch.arange(T, device=idx.device)
        pos_embed = self.transformer.wpe(pos) # shape (T, C)
        tok_embed = self.transformer.wte(idx) # shape (B, T, C)
        x = tok_embed + pos_embed # By broadcasting, we obtain shape (B, T, C). The same pos_embed tensor of shape (T, C) is added to each batch.

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # logits has shape (B, T, V): probability distribution over the vocabulary (V) for each token (T) in the sequence for each batch (B).
        if targets is not None:
            # Turn the dimensions (B, T, V) into (B*T, V) and (B, T) into (B*T) to compute the cross-entropy loss.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) # cross-entropy takes unnormalized logits and ground truth targets (not one-hot encoded, but just the indices of the correct tokens)
        else:
            loss = None
        return logits, loss


In [10]:
model = GPT(GPTConfig())
model.to("cuda:0")
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [12]:
import tiktoken
class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T
    
        with open('input.txt', 'r') as f:
            text = f.read()
        enc = tiktoken.get_encoding("gpt2")
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f"loaded {len(tokens)} tokens")
        print(f"number of epochs = {len(tokens)//(B*T)}")

        self.current_position = 0
    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position:self.current_position + B*T + 1]
        y = buf[1:].view(B, T)
        x = buf[:-1].view(B, T)
        self.current_position += B*T
        if self.current_position + (B*T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

In [13]:
train_loader = DataLoaderLite(B=4, T=1024)

loaded 338025 tokens
number of epochs = 82


In [14]:
model = torch.compile(model, mode="max-autotune")

In [15]:
import time
B = 8
T = 1024
torch.set_float32_matmul_precision("high")
train_loader = DataLoaderLite(B=B, T=T)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
for i in range(50):
    t0 = time.perf_counter()
    x, y = train_loader.next_batch()
    x, y = x.to("cuda:0"), y.to("cuda:0")
    optimizer.zero_grad()
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    torch.cuda.synchronize() # Necessary to measure time accurately. Otherwise, the time will be taken asynchronuously before the end of the training step on GPU.
    t1 = time.perf_counter()
    dt = (t1 - t0)
    tokens_per_second = (B*T)/dt
    print(f"tokens per second = {tokens_per_second:.0f}")

loaded 338025 tokens
number of epochs = 41


W1002 10:43:00.293000 138269147731136 torch/_inductor/select_algorithm.py:1469] [0/0] out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.
W1002 10:43:00.687000 138269147731136 torch/_inductor/select_algorithm.py:1469] [0/0] out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.
W1002 10:43:01.296000 138269147731136 torch/_inductor/select_algorithm.py:1469] [0/0] out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.
AUTOTUNE addmm(8192x3072, 8192x768, 768x3072)
  triton_mm_54 0.2345 ms 100.0%
  triton_mm_47 0.2447 ms 95.8%
  triton_mm_49 0.2476 ms 94.7%
  triton_mm_48 0.2488 ms 94.2%
  triton_mm_51 0.2499 ms 93.8%
  triton_mm_52 0.2521 ms 93.0%
  triton_mm_45 0.2540 ms 92.3%
  bias_addmm 0.2580 ms 90.9%
  triton_mm_44 0.2632 ms 89.1%
  triton_mm_55 0.2693 ms 87.1%
SingleProcess AUTOTUNE b

In [None]:
(sd_hf["transformer.wte.weight"]==sd_hf["lm_head.weight"]).all()

In [None]:
sd_hf["transformer.wte.weight"].shape

In [None]:
sd_hf["transformer.wte.weight"].data_ptr() == sd_hf["lm_head.weight"].data_ptr()

In [2]:
import tiktoken
import torch
enc = tiktoken.get_encoding("gpt2")
bill_tokens = enc.encode("Bill")
bill_tensors = torch.tensor(bill_tokens)
bill_encoding = sd_hf["transformer.wte.weight"][bill_tensors].squeeze(0)
bill_decoding = torch.einsum("c, tc -> t", bill_encoding, sd_hf["lm_head.weight"])
max_logit = bill_decoding.argmax().item()
# decode the token
enc.decode([max_logit])

'Bill'

In [42]:
sd_hf["transformer.wte.weight"].shape

torch.Size([50257, 768])

torch.Size([50257])

In [4]:
mat = sd_hf["transformer.wte.weight"].to("cuda:0")
mat = mat/mat.norm(dim=-1, keepdim=True)

In [5]:
prod = torch.matmul(mat, mat.T).half()

In [6]:
prod_minus_diag = prod - torch.eye(prod.size(0), device=prod.device).half()

In [9]:
mean_dot_product = prod_minus_diag.mean()*(prod.size(0)/(prod.size(0)-1))

In [10]:
mean_dot_product

tensor(0.2681, device='cuda:0', dtype=torch.float16)

In [21]:
prod_minus_diag.max(), prod_minus_diag.min() # Bizarre, pas de vecteurs avec un produit scalaire trop négatif.

(tensor(0.9976, device='cuda:0', dtype=torch.float16),
 tensor(-0.1228, device='cuda:0', dtype=torch.float16))

In [23]:
prod_minus_diag.shape

torch.Size([50257, 50257])

In [25]:
element = prod_minus_diag.argmax()

tensor(6231993, device='cuda:0')

Bizarre : les embeddings ne sont pas du tout aléatoires, puisque le produit scalaire moyen de deux vecteurs est > 0

on remarque que mat.sum est positif

In [14]:
mat.sum()

tensor(3346.8599, device='cuda:0')

Est-ce que cela peut être dû au hasard? la matrice est initialisée avec des poids $ \mathcal{N}(0, 1)$ d'après le site de PyTorch

In [17]:
# La variance de la somme des poids de la matrice devrait être
var_sum_theory = mat.numel()
std_theory = var_sum_theory**0.5
print(std_theory)
# on en déduit qu'il est possible que la somme de la matrice soit 3346

6212.67864934281


Mesurer la différence de performance entre différentes implémentations d'attention. Karpathy dit que l'implémentation avec les matrices QKV concaténées est plus efficace.

On peut faire des ablations avec les techniques récentes sur les LLMs (voir llama 3, etc... pour voir si on peut améliorer GPT2-small)