# Loading BioGPT into TransformerLens

## Setup

In [1]:
import os



In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops

from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens import (
    HookedTransformer,
    HookedTransformerConfig,
)

In [6]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x79bd57ac7910>

## BioGPT

In [12]:
MODEL_PATH = "microsoft/biogpt"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

config = hf_model.config
config

BioGptConfig {
  "_name_or_path": "microsoft/biogpt",
  "activation_dropout": 0.0,
  "architectures": [
    "BioGptForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "layerdrop": 0.0,
  "max_position_embeddings": 1024,
  "model_type": "biogpt",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "scale_embedding": true,
  "transformers_version": "4.45.2",
  "use_cache": true,
  "vocab_size": 42384
}

In [13]:
state_dict_path = "BioGPT_state_dict.pth"
torch.save(hf_model.state_dict(), state_dict_path)

## Load into TransformerLens

In [14]:
head_dim = config.hidden_size // config.num_attention_heads

hooked_config = HookedTransformerConfig(
    n_layers=config.num_hidden_layers,
    d_model=config.hidden_size,
    d_head=head_dim,
    n_heads=config.num_attention_heads,
    d_mlp=config.intermediate_size,
    d_vocab=config.vocab_size,
    n_ctx=config.max_position_embeddings,
    act_fn=config.hidden_act,
)
model = HookedTransformer(hooked_config)

In [38]:
model.W_E.shape

torch.Size([42384, 1024])

In [33]:
def biogpt_to_transformer_lens_format(in_sd, n_layers, n_heads):
    out_sd = {}
    out_sd["pos_embed.W_pos"] = in_sd[f"biogpt.embed_positions.weight"]
    out_sd["embed.W_E"] = in_sd[f"biogpt.embed_tokens.weight"]

    out_sd["ln_final.w"] = in_sd[f"biogpt.layer_norm.weight"]
    out_sd["ln_final.b"] = in_sd[f"biogpt.layer_norm.bias"]
    out_sd["unembed.W_U"] = in_sd[f"output_projection.weight"].T

    for layer in range(n_layers):
        out_sd[f"blocks.{layer}.ln0.w"] = in_sd[f"biogpt.layers.{layer}.fc1.weight"]
        out_sd[f"blocks.{layer}.ln0.b"] = in_sd[f"biogpt.layers.{layer}.fc1.bias"]
        out_sd[f"blocks.{layer}.ln1.w"] = in_sd[f"biogpt.layers.{layer}.fc2.weight"]
        out_sd[f"blocks.{layer}.ln1.b"] = in_sd[f"biogpt.layers.{layer}.fc2.bias"]


        out_sd[f"blocks.{layer}.attn.W_Q"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.q_proj.weight"],
            "(n_heads d_head) d_model -> n_heads d_model d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.b_Q"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.q_proj.bias"],
            "(n_heads d_head) -> n_heads d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.W_K"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.k_proj.weight"],
            "(n_heads d_head) d_model -> n_heads d_model d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.b_K"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.k_proj.bias"],
            "(n_heads d_head) -> n_heads d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.W_V"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.v_proj.weight"],
            "(n_heads d_head) d_model -> n_heads d_model d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.b_V"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.v_proj.bias"],
            "(n_heads d_head) -> n_heads d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.W_O"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.out_proj.weight"],
            "(d_model n_heads) d_head -> n_heads d_model d_head",
            n_heads=n_heads,
        )
        out_sd[f"blocks.{layer}.attn.b_O"] = einops.rearrange(
            in_sd[f"biogpt.layers.{layer}.self_attn.out_proj.bias"],
            "(d_model n_heads) -> n_heads d_model",
            n_heads=n_heads,
        )

        out_sd[f"blocks.{layer}.mlp.b_in"] = in_sd[f"biogpt.layers.{layer}.fc1.bias"]
        out_sd[f"blocks.{layer}.mlp.W_in"] = in_sd[f"biogpt.layers.{layer}.fc1.weight"].T
        out_sd[f"blocks.{layer}.mlp.b_out"] = in_sd[f"biogpt.layers.{layer}.fc2.bias"]
        out_sd[f"blocks.{layer}.mlp.W_out"] = in_sd[f"biogpt.layers.{layer}.fc2.weight"].T

    return out_sd

In [39]:
state_dict = torch.load(state_dict_path, weights_only=False)

tl_dict = biogpt_to_transformer_lens_format(state_dict, config.num_hidden_layers, config.num_attention_heads)
model.load_and_process_state_dict(tl_dict)

RuntimeError: The size of tensor a (64) must match the size of tensor b (4096) at non-singleton dimension 3