In [1]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, OPTForCausalLM, MistralForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from accelerate.hooks import remove_hook_from_module
from accelerate.utils import named_module_tensors, find_tied_parameters


import numpy as np
from numpy.lib.format import open_memmap

import os
import sys
import json

from threading import Thread
from queue import Queue 

import functools 

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
checkpoint = 'facebook/opt-125m'
# checkpoint = 'facebook/opt-13B'
# checkpoint = 'mistralai/Mistral-7B-v0.1'

comp_device = 0
torch_dtype = torch.float16

In [3]:
"""
1. get model parameter & buffer names
2. find the transformer block module
3. get a device map
4. get offloaded weights np.memmap files
"""
config = AutoConfig.from_pretrained(checkpoint)
with init_empty_weights(): 
    e: OPTForCausalLM = AutoModelForCausalLM.from_config(config,)
# don't run e.tie_weights() or the tied weights will not be in the device map
# e.tie_weights()
    
def find_module_list(module: nn.Module):
    def _find_module_list(module: nn.Module, prefix=''):
        if isinstance(module, nn.ModuleList):
            yield module, prefix
        else:
            for name, child in module.named_children():
                yield from _find_module_list(child, prefix=prefix+'.'+name if prefix else name)
    
    g = _find_module_list(module)
    try:
        return next(iter(g))
    except:
        raise ValueError(f'{module.__class__.__name__} does not have a nn.ModuleList structure')

layers, layers_name = find_module_list(e)
# layers_name

res = {}
for n, t in named_module_tensors(e, recurse=True):
    if isinstance(t, nn.Parameter) and layers_name in n:
        res[n] = 'disk'
    else:
        res[n] = comp_device

weights_offload_folder = f'_weights_offload/{checkpoint}/{torch_dtype}'

# all parameters of the model will be offloaded as memory-mapped array in a given folder.
if not os.path.exists(weights_offload_folder):
    model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map=res, torch_dtype=torch_dtype, offload_folder=weights_offload_folder, use_safetensors=False) # use pytorch *.bin, as disk_offload have some bugs for safetensors
else:
    model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map=res, torch_dtype=torch_dtype, offload_folder=None) 

# remove accelerate disk_offload hooks
model = remove_hook_from_module(model, recurse=True) 





In [25]:
class DiskWeightsLoader:
    def __init__(self, weights_offload_folder) -> None:
        self.weights_offload_folder = weights_offload_folder

        with open(os.path.join(weights_offload_folder, "index.json"), "r") as f: 
            self.index = json.load(f)  

    def open_memmap(self, key):
        metadata = self.index[key]

        f_name = os.path.join(weights_offload_folder, key + '.dat')

        shape = tuple(metadata["shape"])
        if shape == ():
            # NumPy memory-mapped arrays can't have 0 dims so it was saved as 1d tensor
            shape = (1,)

        dtype = metadata["dtype"]
        if dtype == "bfloat16":
            # NumPy does not support bfloat16 so this was saved as a int16
            dtype = "int16"

        weight = np.memmap(f_name, dtype=dtype, shape=shape, mode="r")

        if len(metadata["shape"]) == 0:
            weight = weight[0]

        weight = torch.from_numpy(weight) # no data movement

        if metadata["dtype"] == "bfloat16":
            weight = weight.view(torch.bfloat16)

        return weight
    
dl = DiskWeightsLoader(weights_offload_folder)
dl.open_memmap("model.decoder.layers.0.fc1.bias")


tensor([-0.0139, -0.0025, -0.0152,  ..., -0.0056, -0.0118, -0.0037])

In [6]:

import torch
from accelerate.utils import honor_type
from typing import Mapping

def get_info(obj, debug=False):
    if isinstance(obj, (tuple, list)):
        ret = honor_type(obj, (get_info(o) for o in obj))
        if len(set(ret)) == 1 and len(ret) > 1:
            return f"{len(ret)} * {ret[0]}"
        else:
            return ret 
    elif isinstance(obj, Mapping):
        return type(obj)({k: get_info(v) for k, v in obj.items()})
    elif isinstance(obj, (torch.Tensor)):
        if debug:
            return f"{obj.__class__.__name__}(shape={tuple(obj.size())}, dtype={obj.dtype}, device={obj.device}, mem/elem/dtype={sys.getsizeof(obj.storage()) / obj.numel() / obj.element_size():.3f})"
        else:
            return f"{obj.__class__.__name__}(shape={tuple(obj.size())}, mem/elem/dtype={sys.getsizeof(obj.storage()) / obj.numel() / obj.element_size():.3f})"
    elif isinstance(obj, (int, bool, type(None))):
        return f"{obj}"
    else:
        return f"{obj.__class__.__name__}: {obj}"

from data_movement import Engine, Task

class Model:
    """
    1. override forward functions
    """
    def __init__(self, hf_model, comp_device=0, **kwargs) -> None:
        # self.checkpoint = kwargs.get('checkpoint')
        # self.torch_dtype = kwargs.get('torch_dtype')
        # self.config = AutoConfig.from_pretrained(self.checkpoint, torch_dtype=self.torch_dtype)
        # with init_empty_weights(): # while buffers are not empty
        #     self.hf_model = AutoModelForCausalLM.from_config(self.config, torch_dtype=self.torch_dtype)
        
        self.comp_device = comp_device

        self.dm_engine = Engine(self.comp_device)

        # init model 
        self.hf_model = hf_model#.to(comp_device)
        self.layers, self.layers_name = self.get_layers()


    def get_layers(self) -> tuple[nn.Module, str]:
        if isinstance(self.hf_model, (OPTForCausalLM, )):
            return self.hf_model.model.decoder.layers, 'model.decoder.layers'
        else:
            def find_module_list(module: nn.Module) -> tuple[nn.Module, str]:
                def _find_module_list(module: nn.Module, prefix=''):
                    if isinstance(module, nn.ModuleList):
                        yield module, prefix
                    else:
                        for name, child in module.named_children():
                            yield from _find_module_list(child, prefix=prefix+'.'+name if prefix else name)
                
                g = _find_module_list(module)
                try:
                    return next(iter(g))
                except:
                    raise ValueError(f'{module.__class__.__name__} does not have a nn.ModuleList structure')

            return find_module_list(self.hf_model)
    
    def override_layer_forward(self, i: int):
        layer = self.layers[i]
        old_forward = layer.forward

        @functools.wraps(old_forward)
        def new_forward(*args, **kwargs):
            print(f'\t{i = }, {get_info(args) = }, \n\t{i = }, {get_info(kwargs) = }')

            if isinstance(self.hf_model, (OPTForCausalLM, )):
                actv_recomp = args[0] # b,1,h / bzh
                kv_cache = kwargs.get('past_key_value') # b,n_kv_heads,s_cache,h_kv    x2
                attn_mask = kwargs.get('attention_mask') # b,1,1,s_all  (bsz, 1, tgt_len, src_len)

            # new to hf: args, kwargs
            args_for_old = args
            kwargs_for_old = kwargs

            # hf execution
            old_output = old_forward(*args_for_old, **kwargs_for_old) # h'=(b,z,h), kv=(b,n,s_all,h) x2
            
            # hf to new: output
            output = old_output
            print(f'\t{i = }, {get_info(output) = }\n')
            
            return output
        
        layer.forward = new_forward
        return layer

    def override_hf_model_forward(self):
        old_forward = self.hf_model.forward
        @functools.wraps(old_forward)
        def new_forward(*args, **kwargs):
            print(f'hf_model {get_info(args) = }, \nhf_model {get_info(kwargs) = }\n')

            # new to hf: args, kwargs
            args_for_old = args
            kwargs_for_old = kwargs

            # hf execution
            old_output = old_forward(*args_for_old, **kwargs_for_old) 

            # hf to new: output
            output = old_output 
            print(f'hf_model {get_info(output) = }\n')
            
            return output
        
        self.hf_model.forward = new_forward
        return self.hf_model

    def build(self):
        for i, _ in enumerate(self.layers):
            self.override_layer_forward(i)
        self.override_hf_model_forward()
        return self.hf_model 



In [7]:
num_prompts = 16
prompts = None
prompt_len = 50
comp_device = 0
gen_len = 20


# hf_model= OPTForCausalLM.from_pretrained(checkpoint)
model = Model(m, comp_device=comp_device).build()

# test
if True:
    if prompts is None:  # get default prompts
        prompts = [
            "for i in range(10): ",
            "Who are you? Are you conscious?",
            "Where is Deutschland?",
            "How is Huawei Mate 60 Pro?",
        ]
    prompts = (
        prompts * (num_prompts // len(prompts))
        + prompts[: (num_prompts % len(prompts))]
    )

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(checkpoint) # , padding_side="left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # eos padding

    # inputs
    inputs = tokenizer(
        prompts,
        padding="max_length",
        max_length=prompt_len,
        return_tensors="pt",
        # padding=True,
    ).to(comp_device)

    # generate
    generate_ids = model.generate(
        inputs.input_ids,
        max_new_tokens=gen_len,  # max_lengths
        
        num_beams=6, #
        num_beam_groups=2, #
        diversity_penalty=0.1, #
        # do_sample=True, #
    )

    # outputs
    output_texts = tokenizer.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_texts)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  return f"{obj.__class__.__name__}(shape={tuple(obj.size())}, mem/elem/dtype={sys.getsizeof(obj.storage()) / obj.numel() / obj.element_size():.3f})"


hf_model get_info(args) = (), 
hf_model get_info(kwargs) = {'input_ids': 'Tensor(shape=(96, 50), mem/elem/dtype=1.001)', 'past_key_values': 'None', 'use_cache': 'True', 'attention_mask': 'Tensor(shape=(96, 50), mem/elem/dtype=1.001)', 'return_dict': 'True', 'output_attentions': 'False', 'output_hidden_states': 'False'}

	i = 0, get_info(args) = ('Tensor(shape=(96, 50, 768), mem/elem/dtype=1.000)',), 
	i = 0, get_info(kwargs) = {'attention_mask': 'Tensor(shape=(96, 1, 50, 50), mem/elem/dtype=1.000)', 'layer_head_mask': 'None', 'past_key_value': 'None', 'output_attentions': 'False', 'use_cache': 'True'}


RuntimeError: Tensor on device meta is not on the expected device cuda:0!