In [1]:
import logging
logging.basicConfig(
    style='{',
    format='{asctime} [{filename}:{lineno} in {funcName}] {levelname} - {message}',
    handlers=[
        logging.FileHandler(".log", 'w'),
        logging.StreamHandler()
    ],
    level=logging.INFO
)
logging.info('Importing...')
import os
import numpy as np
import torch
from torch.nn import Module, ModuleList
from transformers import PreTrainedModel
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from accelerate import init_empty_weights
from accelerate.utils import find_tied_parameters, named_module_tensors, set_module_tensor_to_device

from policy import Policy
logging.info('Done!')

2023-09-24 09:20:19,889 [361951628.py:11 in <module>] INFO - Importing...
2023-09-24 09:20:24,840 [instantiator.py:21 in <module>] INFO - Created a temporary directory at /tmp/tmpqsiukz4b
2023-09-24 09:20:24,843 [instantiator.py:76 in _write] INFO - Writing /tmp/tmpqsiukz4b/_remote_module_non_scriptable.py
2023-09-24 09:20:26.859436: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-24 09:20:31,895 [361951628.py:22 in <module>] INFO - Done!


In [2]:
checkpoint = "facebook/opt-13b" # 1.3b 6.7b 13b 30b 66b 
offload_folder = 'offload/' + checkpoint.replace('/', '.')

# empty model
config = AutoConfig.from_pretrained(checkpoint)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
model.tie_weights()

In [3]:
class AttrDict(dict):
    __slots__ = () 
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__


policy = Policy(
    gpu_batch_size=16, 
    num_gpu_batches=8, 
    weights_gpu_percent=0.0, 
    weights_cpu_percent=0.3, 
    cache_gpu_percent=0.0, 
    cache_cpu_percent=0.2, 
    act_gpu_percent=0.0, 
    act_cpu_percent=0.5, 
    overlap=True, 
    pin_weight=True,
)

def get_layers_dict(lm_model: Module, prefix: str='') -> dict:
    # return a dict of {layer_name : layer_module ('meta')} with only leaf nodes & transformer layers
    layers_dict = {}
    for name, module in lm_model.named_children():
        # leaf nodes
        if len(list(module.named_children())) == 0:
            layers_dict[prefix+name] = module
        # ModuleList: transformer  
        elif isinstance(module, ModuleList):
            for block_name, block_module in module.named_children():
                layers_dict[prefix+name+'.'+block_name] = block_module
        else:
            layers_dict.update(get_layers_dict(module, prefix+name+'.'))
    return layers_dict

def named_module_tensors(module: Module, include_buffers: bool = True, recurse: bool = True):
    for named_parameter in module.named_parameters(recurse=recurse):
        yield named_parameter

    if include_buffers:
        for named_buffer in module.named_buffers(recurse=recurse):
            yield named_buffer

def get_device(cur_percent, percents, choices):
    # choose a device (gpu / cpu / disk) for a weight tensor by its percent of size
    percents = np.cumsum(percents)
    assert np.abs(percents[-1] - 1.0) < 1e-5, f'{percents}'

    for i in range(len(percents)):
        if cur_percent < percents[i]:
            return choices[i]
    return choices[-1]

def get_policy_weight_map(model: PreTrainedModel, policy: Policy):
    """{module_name: device}"""
    assert model.device == torch.device('meta'), 'model is not on device meta.'
    
    # to ensure the tied params are allocated to the same device in the weight_map
    model.tie_weights()
    tied_params = find_tied_parameters(model)

    # layers to be scheduled
    layers_dict = get_layers_dict(model)

    # device assignment for each tensor in the model
    weight_assign_dict = {}
    devices = ['cuda', 'cpu', 'disk']
    percents_target = np.array([
        policy.weights_gpu_percent, 
        policy.weights_cpu_percent, 
        policy.weights_disk_percent
    ])
    
    # model size (parameters + buffers), here we do not repeatly sum the tied paramters 
    size_total = sum(np.prod(tensor.shape) for _, tensor in named_module_tensors(model))
    size_done, size_todo = 0, size_total
    percents_done, percents_todo = 0 * percents_target, percents_target  

    for layer_name, layer_module in layers_dict.items():
        # current layer
        tensor_sizes = [np.prod(tensor.shape) for _, tensor in named_module_tensors(layer_module)]
        tensor_sizes_cumsum = np.cumsum(tensor_sizes)

        device_allo_size_dict = {device: 0 for device in devices} # to balance the percents
        for i, (tensor_name, tensor) in enumerate(named_module_tensors(layer_module)):
            abs_tensor_name = layer_name + '.' + tensor_name

            def find_processed_tied(abs_tensor_name, tied_params, weight_assign_dict):
                # find the processed parameter (in weight_assign_dict) of the tied parameters.
                for tp in tied_params:
                    if abs_tensor_name in tp:
                        for p in tp:
                            if p in weight_assign_dict:
                                return p, tuple(tp)
                return None
            
            processed_tied = find_processed_tied(abs_tensor_name, tied_params, weight_assign_dict) 
            if processed_tied: # this tensor is tied and processed.
                p, tp = processed_tied
                weight_assign_dict[abs_tensor_name] = {
                    # 'shape':  tensor.shape,
                    'assigned_device': weight_assign_dict[p]['assigned_device'],
                    'tied': tp
                }
            else:
                mid_percent = (tensor_sizes_cumsum[i] - tensor_sizes[i] / 2) / tensor_sizes_cumsum[-1] # tensor mid size percent 
                device = get_device(mid_percent, percents_todo, devices)
                weight_assign_dict[abs_tensor_name] = {
                    'shape':  tensor.shape,
                    'assigned_device': device
                }
                
                device_allo_size_dict[device] += tensor_sizes[i]

        # update percents_todo
        size_layer = sum(device_allo_size_dict.values())
        if size_layer > 0:
            device_allo_percents = np.array([device_allo_size_dict[device] * 1. for device in devices]) / size_layer
            percents_done = (percents_done * size_done + device_allo_percents * size_layer) / (size_done + size_layer)      
        size_done += size_layer
        size_todo -= size_layer
        if size_todo > 0:
            percents_todo = (size_total * percents_target - size_done * percents_done) / size_todo 
        
        logging.info(f'{layer_name}, {percents_done}, size_todo: {size_todo}')


    device_map = {k:v['assigned_device'] for k, v in weight_assign_dict.items()}
    logging.info('device_map is prepared!')

    mem_g = sum([np.prod(v['shape']) for _, v in weight_assign_dict.items() if 'cuda' in v['assigned_device'] and 'shape' in v]) * 2 / (2 ** 30)
    mem_c = sum([np.prod(v['shape']) for _, v in weight_assign_dict.items() if v['assigned_device'] == 'cpu' and 'shape' in v]) * 2 / (2 ** 30)
    mem_d = sum([np.prod(v['shape']) for _, v in weight_assign_dict.items() if v['assigned_device'] == 'disk' and 'shape' in v]) * 2 / (2 ** 30)
    mem = mem_d + mem_c + mem_g
    logging.info(f'CausalLM {checkpoint} is to be loaded on: ' 
                 f'\nGPU Mem {mem_g:.2f} GiB ({mem_g / mem:.2%}), ' 
                 f'CPU Mem {mem_c:.2f} GiB ({mem_c / mem:.2%}), '
                 f'Disk Mem {mem_d:.2f} Gib ({mem_d / mem:.2%})')
    
    # prepare output
    output = {
        'model': model,
        'tied_params': tied_params,
        'layers_dict': layers_dict,
        'weight_assign_dict': weight_assign_dict,
        'device_map': device_map
    }
    output = AttrDict(output)
    return output

output = get_policy_weight_map(model, policy)
policy_device_map = output.device_map
flexgen_layers = output.layers_dict

2023-09-24 09:20:33,272 [1132556772.py:124 in get_policy_weight_map] INFO - model.decoder.embed_tokens, [0. 0. 1.], size_todo: 12596080640
2023-09-24 09:20:33,273 [1132556772.py:124 in get_policy_weight_map] INFO - model.decoder.embed_positions, [0. 0. 1.], size_todo: 12585584640
2023-09-24 09:20:33,275 [1132556772.py:124 in get_policy_weight_map] INFO - model.decoder.final_layer_norm, [0.00000000e+00 1.91116887e-05 9.99980888e-01], size_todo: 12585574400
2023-09-24 09:20:33,277 [1132556772.py:124 in get_policy_weight_map] INFO - model.decoder.layers.0, [0.         0.18003639 0.81996361], size_todo: 12270935040
2023-09-24 09:20:33,278 [1132556772.py:124 in get_policy_weight_map] INFO - model.decoder.layers.1, [0.         0.23378988 0.76621012], size_todo: 11956295680
2023-09-24 09:20:33,280 [1132556772.py:124 in get_policy_weight_map] INFO - model.decoder.layers.2, [0.         0.25962997 0.74037003], size_todo: 11641656320
2023-09-24 09:20:33,281 [1132556772.py:124 in get_policy_weight

In [4]:
def check_disk(checkpoint, offload_folder):
    config = AutoConfig.from_pretrained(checkpoint)
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)
    model.tie_weights()
    tensor_names = [n for n, _ in named_module_tensors(model, include_buffers=True, recurse=True)]
    dat_file_names = [file[:-4] for file in os.listdir(offload_folder) if file.endswith('.dat')]
    # logging.info(set(tensor_names) - set(dat_file_names))
    return set(tensor_names) == set(dat_file_names)

if not check_disk(checkpoint, offload_folder):
    # download and process to .dat files
    disk_weight_map = {name:'disk' for name in policy_device_map}
    try:
        AutoModelForCausalLM.from_pretrained(
            checkpoint, 
            device_map=disk_weight_map, 
            offload_folder=offload_folder, 
            offload_state_dict=True
        )
    except:
        pass

if check_disk(checkpoint, offload_folder):
    logging.info(f'The whole model has been downloaded an processed to offload_folder: \'{offload_folder}\'')
else:
    err_msg = 'Mismatch between offload folder and model'
    logging.error(err_msg)
    raise RuntimeError(err_msg)

# get empty model
config = AutoConfig.from_pretrained(checkpoint)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
model.tie_weights()
model.eval()
logging.info(f'Got empty CausalLM: \'{checkpoint}\' on meta device.')

tied_params = find_tied_parameters(model)
print(tied_params)

2023-09-24 09:20:34,515 [43006092.py:25 in <module>] INFO - The whole model has been downloaded an processed to offload_folder: 'offload/facebook.opt-13b'
2023-09-24 09:20:34,846 [43006092.py:37 in <module>] INFO - Got empty CausalLM: 'facebook/opt-13b' on meta device.


[['lm_head.weight', 'model.decoder.embed_tokens.weight']]


In [5]:
def get_obj_from_name(lm_model, name):
    splits = name.split('.')
    module = lm_model
    for split in splits:
        if split == '': 
            continue 

        new_module = getattr(module, split)
        if new_module is None:
            raise ValueError(f"{module} has no attribute {split}.")
        module = new_module
    return module 

In [13]:
import json
from tqdm import tqdm 
import gc 

dat_files = [f for f in os.listdir(offload_folder) if f.endswith('.dat')]
with open(os.path.join(offload_folder, 'index.json'), 'r') as f:
    index = json.load(f) # {name: {dtype, shape}}

def get_tied_target(tensor_name):
    # if tensor_name is tied and without a .dat file, if it is not tied, return itself
    for group in tied_params:
        if tensor_name in group:
            for name in group:
                if name + '.dat' in dat_files:
                    return name 
    return tensor_name

def flexgen_load_module_tensor(model, tensor_name, device):
    old_tensor_name = tensor_name
    
    tensor_name = get_tied_target(tensor_name) 
    metadata = index[tensor_name]

    # copied from accelerate.utils.offload
    shape = tuple(metadata["shape"])
    if shape == ():
        # NumPy memory-mapped arrays can't have 0 dims so it was saved as 1d tensor
        shape = (1,)

    dtype = metadata["dtype"]
    if dtype == "bfloat16":
        # NumPy does not support bfloat16 so this was saved as a int16
        dtype = "int16"
    
    # load .dat file
    save_path = os.path.join(offload_folder, tensor_name + '.dat')

    # to device 
    np_memmap = np.memmap(save_path, dtype=dtype, shape=shape, mode='r') 
    tmp = torch.from_numpy(np_memmap).to(device) 
    set_module_tensor_to_device(model, old_tensor_name, device, tmp)

def flexgen_offload_module_tensor(model, tensor_name):
    tensor = get_obj_from_name(model, tensor_name)
    device = policy_device_map[tensor_name]
    device = device if device != 'disk' else 'meta' 
    if tensor.device != device:
        set_module_tensor_to_device(model, tensor_name, device, tensor) # gtoc, ctog

def policy_init(model, policy_device_map):
    for tensor_name, device in tqdm(policy_device_map.items(), desc='model init: loading by policy...'):
        if device != 'disk':
            flexgen_load_module_tensor(model, tensor_name, device) 

    logging.info('model has been loaded by policy.')        

policy_init(model, policy_device_map)

model init: loading by policy...: 100%|██████████| 645/645 [00:00<00:00, 12808.88it/s]
2023-09-24 09:28:53,018 [3257294438.py:55 in policy_init] INFO - model has been loaded by policy.


In [14]:
from accelerate.hooks import (
    ModelHook, 
    SequentialHook, 
    add_hook_to_module, 
    remove_hook_from_module
)
# TODO: add_zigzag_hook, remove_zigzag_hook

from accelerate.utils import (
    find_device,
    named_module_tensors,
    send_to_device,
    set_module_tensor_to_device,
)

# global buffers: {layer_name: value_holder}
weight_home = {}
weight_load_buf = {}

act_home = {}
act_load_buf = {}
act_store_buf = {}

kv_home = {} 
kv_load_buf = {}
kv_store_buf = {}

# TODO: cuda streams / cpu threads (?)


from typing import Optional, Union, Mapping
class LayerWeightHook(ModelHook):
    def __init__(
        self,
        model, 
        layer_name,
        compute_device,
    ):
        self.model = model
        self.layer_name = layer_name
        self.compute_device = compute_device
        
    def check_dat(self, dat_file):
        return os.path.isfile(dat_file)

    def init_hook(self, module):
        self.weight_names = [self.layer_name + '.' + name for name, _ in named_module_tensors(module, True, True)]
        
        self.actual_weight_names = [get_tied_target(w) for w in self.weight_names]
        dat_files = [os.path.join(offload_folder, w + '.dat') for w in self.actual_weight_names]
        assert all([self.check_dat(f) for f in dat_files]), f'dat file error, {dat_files}'
        
        return module 
    
    def pre_forward(self, module: Module, *args, **kwargs):
        print(f'pre {self.layer_name} forward')
        # load weights
        for w in self.weight_names:
            flexgen_load_module_tensor(model, w, self.compute_device)
        return args, kwargs
    def post_forward(self, module, output):
        print(f'post {self.layer_name} forward')
        # offload weights
        for w in self.weight_names:
            flexgen_offload_module_tensor(model, w)
        return output
    
    def detach_hook(self, module):
        return module 

class LayerActHook(ModelHook): pass 
class LayerKVCacheHook(ModelHook): pass 

def to_zigzag_forward(layer):
    pass 


# clear hooks 
remove_hook_from_module(model, recurse=True)

compute_device = 'cpu' 

for layer_name in flexgen_layers:
    layer_module = get_obj_from_name(model, layer_name)

    layer_weight_hook = LayerWeightHook(model=model, layer_name=layer_name, compute_device=compute_device)
    add_hook_to_module(layer_module, layer_weight_hook, append=True)
    # break

# generate test

prompts = [
    'Who are you? Are you conscious?',
    'Where is Deutschland?',
    'How is Huawei Mate 60 Pro?'
] * 4

prompt_len = 20

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(prompts, padding="max_length", max_length=prompt_len, return_tensors="pt")

# Generate
generate_ids = model.generate(
    inputs.input_ids, 
    max_length=30 + prompt_len,
    # num_beams=2,
    # num_beam_groups=2,
    # diversity_penalty=0.1,
    do_sample=True,
)

output_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for output_text in output_texts:
    print(output_text)
    print('-' * 10)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.




pre model.decoder.embed_tokens forward
post model.decoder.embed_tokens forward
pre model.decoder.embed_positions forward
post model.decoder.embed_positions forward
pre model.decoder.layers.0 forward
post model.decoder.layers.0 forward
pre model.decoder.layers.1 forward
post model.decoder.layers.1 forward
pre model.decoder.layers.2 forward
post model.decoder.layers.2 forward
pre model.decoder.layers.3 forward


In [None]:
layer_module._hf_hook

In [None]:
# load / offload
#     module object, .dat file path
#     layer: pre / post forward hook

In [None]:

def compute_activation_assignment(num_layers, offload_config: Policy):
    logging.debug(f"<compute_activation_assignment> enter")
    gpu_batch_limit = int(offload_config.num_gpu_batches * offload_config.act_gpu_percent)
    cpu_batch_limit = int(offload_config.num_gpu_batches * (offload_config.act_gpu_percent + offload_config.act_cpu_percent))
    logging.debug(f"<compute_activation_assignment> gpu_batch_limit: {gpu_batch_limit}, cpu_batch_limit: {cpu_batch_limit}")
    
    act_assign_dict = {}
    for l in range(num_layers):
        for i in range(offload_config.num_gpu_batches):
            act_key = f"layer.{l}_index.{i}"
            if i < gpu_batch_limit:
                device = 'cuda'
            elif i < cpu_batch_limit:
                device = 'cpu'
            else:
                device = 'disk'
            act_assign_dict[act_key]= {'assigned_device': device}
    return act_assign_dict


def compute_kv_cache_assignment(num_layers, offload_config: OffloadConfig):
    logging.debug(f"<compute_kv_cache_assignment> enter")
    gpu_batch_limit = int(offload_config.num_gpu_batches * offload_config.cache_gpu_percent)
    cpu_batch_limit = int(offload_config.num_gpu_batches * (offload_config.cache_gpu_percent + offload_config.cache_cpu_percent))
    logging.debug(f"<compute_kv_cache_assignment> gpu_batch_limit: {gpu_batch_limit}, cpu_batch_limit: {cpu_batch_limit}")
    
    act_assign_dict = {}
    for l in range(num_layers):
        for i in range(offload_config.num_gpu_batches):
            key_cache_key = f"key_layer.{l}_index.{i}"
            value_cache_key = f"key_layer.{l}_index.{i}"
            if i < gpu_batch_limit:
                device = 'cuda'
            elif i < cpu_batch_limit:
                device = 'cpu'
            else:
                device = 'disk'
            act_assign_dict[key_cache_key] = {'assigned_device': device}
            act_assign_dict[value_cache_key] = {'assigned_device': device}
    return act_assign_dict
