- This file contains the code for creating the encoder and decoder -- ViT and GPT2
- The weights are copied from the pretrained models

In [4]:
# GPT2 config
# the number of layers is 12
# the embedding size is 768

# to explore the keys and the values of the weights

from transformers import GPT2LMHeadModel

gpt2 = GPT2LMHeadModel.from_pretrained("gpt2") # 124M

# get the state dicts
model_state_dict = gpt2.state_dict()

# print them 
for k, v in model_state_dict.items():
    print(k,v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [8]:
import torch
import torch.nn as nn
from dataclasses import dataclass
import math
from torch.nn import functional as F

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257  # GPT-2 vocab size
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768

class MLP(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self, x):
        x = self.gelu(self.c_fc(x))
        x = self.c_proj(x)
        return x

class CasualSelfAttention(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()  # <-- FIX: Added super().__init__()
        assert config.n_embd % config.n_head == 0
        # Q, K, V projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # causal mask
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                     .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # attention (materializes the large (T,T) matrix for all queries and keys)
        attn = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        attn = attn.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        attn = F.softmax(attn, dim=-1)
        
        y = attn @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        
        # output projection
        y = self.c_proj(y)
        return y

class Block(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()  # <-- FIX: Added super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # --- REFINEMENT: Weight Tying ---
        self.transformer.wte.weight = self.lm_head.weight

    def forward(self, idx, targets=None):
        # <-- FIX: Added the entire forward method -->
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"

        # forward the token and pos embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb

        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        # if we are given some desired targets, also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        
        return logits, loss
    
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (B,T)) and complete
        the sequence B times, each of length max_new_tokens.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long, crop it
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [9]:
from transformers import GPT2LMHeadModel

def load_weights_from_hf(my_model: GPT, hf_model: GPT2LMHeadModel):
    """
    Loads weights from a Hugging Face GPT2LMHeadModel into our custom GPT model.
    """
    my_sd = my_model.state_dict()
    hf_sd = hf_model.state_dict()

    # The weights in Hugging Face's conv1d are transposed compared to nn.Linear
    # We need to transpose them back
    transposed_keys = ['c_attn.weight', 'c_proj.weight', 'c_fc.weight']

    print("Copying weights...")
    for key in my_sd:
        # The Hugging Face model has a "transformer." prefix for its layers,
        # and our lm_head is not inside the transformer block.
        if key.startswith("transformer."):
            hf_key = key
        else: # it's the lm_head
            hf_key = "transformer." + key

        # Special case for the lm_head, which is outside the 'transformer' block in HF model
        if 'lm_head.weight' in key:
            hf_key = 'lm_head.weight'
            
        if hf_key not in hf_sd:
            print(f"Skipping {key}, not found in Hugging Face model.")
            continue

        # Check if this weight needs to be transposed
        needs_transpose = any(tk in key for tk in transposed_keys)
        
        if needs_transpose:
            print(f"Copying and transposing: {key} <-- {hf_key}")
            my_sd[key].copy_(hf_sd[hf_key].T)
        else:
            print(f"Copying directly:      {key} <-- {hf_key}")
            my_sd[key].copy_(hf_sd[hf_key])
            
    # Load the modified state dict into our model
    my_model.load_state_dict(my_sd)
    print("Weight copy complete.")

In [10]:
# main_demo.py
from transformers import AutoTokenizer

if __name__ == '__main__':
    # --- 1. Setup Models and Tokenizer ---
    print("Setting up models and tokenizer...")
    
    # Instantiate our custom model
    config = GPTConfig()
    my_gpt = GPT(config)
    my_gpt.eval() # Set to evaluation mode

    # Load the official Hugging Face model and tokenizer
    hf_model_name = 'gpt2'
    hf_gpt = GPT2LMHeadModel.from_pretrained(hf_model_name)
    tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

    # --- 2. Copy Weights ---
    load_weights_from_hf(my_gpt, hf_gpt)

    # --- 3. Generate Tokens ---
    print("\n--- Generating Text with Your Custom GPT Model ---")
    prompt = "Hello, I am a language model,"
    
    # Encode the prompt into token IDs
    start_ids = tokenizer.encode(prompt, return_tensors='pt') # Get PyTorch tensors
    
    # Generate text
    print(f"Prompt: '{prompt}'")
    generated_ids = my_gpt.generate(
        idx=start_ids, 
        max_new_tokens=50, 
        top_k=50
    )

    # Decode the generated token IDs back to a string
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print("\nGenerated Text:")
    print("-" * 50)
    print(generated_text)
    print("-" * 50)

    # --- Verification (Optional): Generate with HF model to compare ---
    print("\n--- Generating Text with Hugging Face Model for Comparison ---")
    hf_generated_ids = hf_gpt.generate(
        start_ids,
        max_length=len(start_ids[0]) + 50,
        top_k=50,
        do_sample=True # Important to make it stochastic like ours
    )
    hf_generated_text = tokenizer.decode(hf_generated_ids[0], skip_special_tokens=True)
    print("\nGenerated Text (Hugging Face):")
    print("-" * 50)
    print(hf_generated_text)
    print("-" * 50)

Setting up models and tokenizer...
Copying weights...
Copying directly:      transformer.wte.weight <-- transformer.wte.weight
Copying directly:      transformer.wpe.weight <-- transformer.wpe.weight
Copying directly:      transformer.h.0.ln_1.weight <-- transformer.h.0.ln_1.weight
Copying directly:      transformer.h.0.ln_1.bias <-- transformer.h.0.ln_1.bias
Skipping transformer.h.0.attn.bias, not found in Hugging Face model.
Copying and transposing: transformer.h.0.attn.c_attn.weight <-- transformer.h.0.attn.c_attn.weight
Copying directly:      transformer.h.0.attn.c_attn.bias <-- transformer.h.0.attn.c_attn.bias
Copying and transposing: transformer.h.0.attn.c_proj.weight <-- transformer.h.0.attn.c_proj.weight
Copying directly:      transformer.h.0.attn.c_proj.bias <-- transformer.h.0.attn.c_proj.bias
Copying directly:      transformer.h.0.ln_2.weight <-- transformer.h.0.ln_2.weight
Copying directly:      transformer.h.0.ln_2.bias <-- transformer.h.0.ln_2.bias
Copying and transposing

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Text:
--------------------------------------------------
Hello, I am a language model, not a concept. The only thing I have to worry about in the future are your ideas and when to keep changing it will not work out. I am ready for you to work for it."

A few days later, on April 20,
--------------------------------------------------

--- Generating Text with Hugging Face Model for Comparison ---

Generated Text (Hugging Face):
--------------------------------------------------
Hello, I am a language model, I've been learning it for some time and I think that's where it all started and it's what's really exciting to me is my language and I'm happy for that. I'd love to share more and you see how I'll approach it
--------------------------------------------------
