In [1]:
from transformers import OPTForCausalLM, AutoModelForCausalLM, MixtralForCausalLM
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the 'facebook/opt-13b' checkpoint as a causal LM, requesting torch.float16 as the dtype.
model = OPTForCausalLM.from_pretrained('facebook/opt-13b', torch_dtype=torch.float16)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.30it/s]


In [6]:
# Show the configured hidden size; it matches the in/out feature width of the
# attention projections displayed further down (5120 for this checkpoint).
model.config.hidden_size

5120

In [7]:
# Keep a handle on the decoder so its layers can be indexed and sliced in the cells below.
dec = model.model.decoder

In [8]:
# Take the first decoder layer as a representative example to inspect.
one_layer = dec.layers[0]

In [9]:
# Display the layer's module tree: attention projections, activation, fc1/fc2 and layer norms.
one_layer

OPTDecoderLayer(
  (self_attn): OPTAttention(
    (k_proj): Linear(in_features=5120, out_features=5120, bias=True)
    (v_proj): Linear(in_features=5120, out_features=5120, bias=True)
    (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
    (out_proj): Linear(in_features=5120, out_features=5120, bias=True)
  )
  (activation_fn): ReLU()
  (self_attn_layer_norm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=5120, out_features=20480, bias=True)
  (fc2): Linear(in_features=20480, out_features=5120, bias=True)
  (final_layer_norm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
)

In [None]:
# Slice out the first five decoder layers; this slice is what gets exported as the sub-model below.
five_layers = dec.layers[:5]

In [None]:
import os

# Storage layout for the np.memmap export. Two layouts were considered:
#   1) a directory holding separate files, or
#   2) one single file.
# Layout 1 is the one used here.
model_dir = './_model_dir'

# Create the target directory up front; re-running the cell is harmless.
os.makedirs(model_dir, exist_ok=True)



In [None]:
# Open problem: parameters vs. buffers vs. state_dict — which view of the module
# should drive the export? The following cells compare them on this sub-model.
sub_model = five_layers


In [None]:
# True when the state_dict keys and the named parameters are exactly the same
# set of names, i.e. the state_dict holds nothing beyond the parameters.
set(sub_model.state_dict()) == {name for name, _ in sub_model.named_parameters()}

True

In [None]:
# List every (name, buffer) pair registered on the sub-model; an empty list
# means these layers carry parameters only.
list(sub_model.named_buffers())

[]

In [None]:
# Question: what buffers do LLM models actually register?
# Original working answer (kept for the record, NOT verified in this notebook):
#   1) norm layer mean/std, 2) embedding, 3) dropout mask
#   ~0.5GB in total, i.e. very small
# NOTE(review): the decoder layers inspected earlier contain LayerNorm modules yet
# report no buffers at all, so item 1 does not hold for these layers. Items 2 and 3
# and the size estimate should be checked against the names/shapes printed below
# — TODO confirm.
# Either way, it is OK to regard buffers as part of the model weights when saving.
for name, param in model.named_buffers():
    print(name, param.shape)

In [None]:
from accelerate.utils import named_module_tensors
# Iterate over the sub-model's tensors, buffers included, descending into nested submodules.
nmt = named_module_tensors(sub_model, include_buffers=True, recurse=True)

# Dry run of the nmt -> np.memmap export: only build and print the target path
# for each tensor name; nothing is written to disk in this cell.
for n, t in nmt:
    path = os.path.join(model_dir, n)
    print(path)

./_model_dir/0.self_attn.k_proj.weight
./_model_dir/0.self_attn.k_proj.bias
./_model_dir/0.self_attn.v_proj.weight
./_model_dir/0.self_attn.v_proj.bias
./_model_dir/0.self_attn.q_proj.weight
./_model_dir/0.self_attn.q_proj.bias
./_model_dir/0.self_attn.out_proj.weight
./_model_dir/0.self_attn.out_proj.bias
./_model_dir/0.self_attn_layer_norm.weight
./_model_dir/0.self_attn_layer_norm.bias
./_model_dir/0.fc1.weight
./_model_dir/0.fc1.bias
./_model_dir/0.fc2.weight
./_model_dir/0.fc2.bias
./_model_dir/0.final_layer_norm.weight
./_model_dir/0.final_layer_norm.bias
./_model_dir/1.self_attn.k_proj.weight
./_model_dir/1.self_attn.k_proj.bias
./_model_dir/1.self_attn.v_proj.weight
./_model_dir/1.self_attn.v_proj.bias
./_model_dir/1.self_attn.q_proj.weight
./_model_dir/1.self_attn.q_proj.bias
./_model_dir/1.self_attn.out_proj.weight
./_model_dir/1.self_attn.out_proj.bias
./_model_dir/1.self_attn_layer_norm.weight
./_model_dir/1.self_attn_layer_norm.bias
./_model_dir/1.fc1.weight
./_model_dir/1

In [None]:
import numpy as np
from numpy.lib.format import open_memmap

from accelerate.utils import named_module_tensors

# Export every tensor of the sub-model (parameters and buffers) to its own
# memory-mapped file under `model_dir`, named after the tensor
# (e.g. "0.self_attn.k_proj.weight"). The files are written in .npy format, so
# they can be re-opened later with `open_memmap(path, mode="r")`.
#
# The iterator is rebuilt here rather than reused from an earlier cell, where
# it has already been consumed.
nmt = named_module_tensors(sub_model, include_buffers=True, recurse=True)

for n, t in nmt:
    path = os.path.join(model_dir, n)
    # Drop autograd tracking and make sure the data lives on the CPU before
    # handing it to NumPy (`.cpu()` is a no-op for tensors already on the CPU).
    np_t = t.detach().cpu().numpy()
    # mode="w+" only creates the file and reserves space for the array;
    # the values have to be copied in explicitly.
    mm = open_memmap(path, mode="w+", shape=np_t.shape, dtype=np_t.dtype)
    mm[...] = np_t
    # Push the written data to disk and release the mapping right away so that
    # open maps do not pile up across the loop.
    mm.flush()
    del mm

In [None]:
# model
#   init:           state0 (empty) -> state1
#   layer by layer: state1 (partially loaded) -> state2 (fully loaded) -> state3 (partially loaded)
