In [1]:
import logging
logging.basicConfig(
    style='{',
    format='{asctime} [{filename}:{lineno} in {funcName}] {levelname} - {message}',
    handlers=[
        logging.FileHandler(".log", 'w'),
        logging.StreamHandler()
    ],
    level=logging.INFO
)
logging.info('Importing...')
from dataclasses import dataclass
import numpy as np
import torch
from torch.nn import Module, ModuleList
from transformers import PreTrainedModel
from transformers import AutoModelForCausalLM, AutoConfig
from accelerate import load_checkpoint_and_dispatch, init_empty_weights
logging.info('Done!')

checkpoint = "facebook/opt-6.7b" # 6.7b 13b 30b 66b 

logging.info(f'Initializing CausalLM: \'{checkpoint}\'')
config = AutoConfig.from_pretrained(checkpoint)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

from accelerate.utils import (
    check_tied_parameters_on_same_device,
    find_tied_parameters,
    get_balanced_memory,
    get_max_memory,
    load_offloaded_weights,
    offload_weight,
    save_offload_index,
    set_module_tensor_to_device,
)
    
model.tie_weights()
tied_params = find_tied_parameters(model)
# model.base_model_prefix

2023-09-19 13:49:58,020 [4101423154.py:11 in <module>] INFO - Importing...
2023-09-19 13:50:07,036 [instantiator.py:21 in <module>] INFO - Created a temporary directory at /tmp/tmphgcqvz6l
2023-09-19 13:50:07,040 [instantiator.py:76 in _write] INFO - Writing /tmp/tmphgcqvz6l/_remote_module_non_scriptable.py
2023-09-19 13:50:10.576196: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-19 13:50:19,622 [4101423154.py:19 in <module>] INFO - Done!
2023-09-19 13:50:19,624 [4101423154.py:23 in <module>] INFO - Initializing CausalLM: 'facebook/opt-6.7b'


In [2]:
class AttrDict(dict):
    __slots__ = () 
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

@dataclass(frozen=True)
class Policy:
    gpu_batch_size: int
    num_gpu_batches: int

    # percent of weights/cache/activations on GPU/CPU/Disk %
    weights_gpu_percent: float
    weights_cpu_percent: float
    cache_gpu_percent: float
    cache_cpu_percent: float
    act_gpu_percent: float
    act_cpu_percent: float

    # Whether to overlap the I/O and compute
    overlap: bool

    # Whether to use pinned memory for weights on CPU
    pin_weight: bool

    @property
    def weights_disk_percent(self):
        return 1.0 - self.weights_gpu_percent - self.weights_cpu_percent

    @property
    def cache_disk_percent(self):
        return 1.0 - self.cache_gpu_percent - self.cache_cpu_percent

    @property
    def act_disk_percent(self):
        return 1.0 - self.act_gpu_percent - self.act_cpu_percent

policy = Policy(
    gpu_batch_size=8, 
    num_gpu_batches=8, 
    weights_gpu_percent=0.0, 
    weights_cpu_percent=0.3, 
    cache_gpu_percent=0.0, 
    cache_cpu_percent=0.2, 
    act_gpu_percent=0.0, 
    act_cpu_percent=0.5, 
    overlap=True, 
    pin_weight=True,
)

def get_policy_weight_map(lm_model: PreTrainedModel, policy: Policy):
    """{module_name: device}"""
    assert lm_model.device == torch.device('meta')

    def get_layers_dict(lm_model: Module, prefix: str='') -> dict:
        # {layer_name : layer_module ('meta')}
        layers_dict = {}
        for name, module in lm_model.named_children():
            if len(list(module.named_children())) == 0:
                layers_dict[prefix+name] = module
            # Assume only transformer blocks are stored in ModuleList
            elif isinstance(module, ModuleList):
                for block_name, block_module in module.named_children():
                    layers_dict[prefix+name+'.'+block_name] = block_module
            else:
                layers_dict.update(get_layers_dict(module, prefix+name+'.'))
        return layers_dict
    layers_dict = get_layers_dict(lm_model)

    
    def get_choice(cur_percent, percents, choices):
        percents = np.cumsum(percents)
        assert np.abs(percents[-1] - 1.0) < 1e-5, f'{percents}'

        for i in range(len(percents)):
            if cur_percent < percents[i]:
                return choices[i]
        return choices[-1]
    weight_assign_dict = {}

    choices = ['cuda', 'cpu', 'disk']
    percents_target = [policy.weights_gpu_percent, policy.weights_cpu_percent, policy.weights_disk_percent]
    percents_target = np.array(percents_target)
    
    size_total = sum([
        sum([
            np.prod(para.shape) for _, para in layer_module.named_parameters()
        ]) for _, layer_module in layers_dict.items()
    ])
    size_past, size_future = 0, size_total
    percents_past, percents_future = 0 * percents_target, percents_target  

    for layer_name, layer_module in layers_dict.items():
        # current layer
        param_sizes = [np.prod(para.shape) for _, para in layer_module.named_parameters()]
        param_sizes_cumsum = np.cumsum(param_sizes)
        size_layer = param_sizes_cumsum[-1]

        size_layer_devices = {device: 0 for device in choices}
        for i, (param_name, param) in enumerate(layer_module.named_parameters()):
            param_mid = (param_sizes_cumsum[i] - param_sizes[i] / 2) / param_sizes_cumsum[-1]
            device = get_choice(param_mid, percents_future, choices)

            weight_assign_dict[layer_name+'.'+param_name] = {
                'shape':  param.shape,
                'assigned_device': device
            }
            size_layer_devices[device] += param_sizes[i]

        percents_layer = np.array([size_layer_devices[device] * 1. for device in choices]) / size_layer
        
        # update past & future
        percents_past = (percents_past * size_past + percents_layer * size_layer) / (size_past + size_layer)      
        size_past += param_sizes_cumsum[-1]
        size_future -= param_sizes_cumsum[-1]
        percents_future = (size_total * percents_target - size_past * percents_past) / size_future if size_future > 0 else 0
        
        # logging.info(percents_past)


    mem_g = sum([np.prod(v['shape']) for k, v in weight_assign_dict.items() if 'cuda' in v['assigned_device']]) * 2 / (2 ** 30)
    mem_c = sum([np.prod(v['shape']) for k, v in weight_assign_dict.items() if v['assigned_device'] == 'cpu']) * 2 / (2 ** 30)
    mem_d = sum([np.prod(v['shape']) for k, v in weight_assign_dict.items() if v['assigned_device'] == 'disk']) * 2 / (2 ** 30)
    mem = mem_d + mem_c + mem_g
    logging.info(f'Loading weights of CausalLM\n {checkpoint}: ' 
                 f'GPU Mem {mem_g:.2f} GiB ({mem_g / mem:.2%}), ' 
                 f'CPU Mem {mem_c:.2f} GiB ({mem_c / mem:.2%}), '
                 f'Disk Mem {mem_d:.2f} Gib ({mem_d / mem:.2%})')

    device_map = {k:v['assigned_device'] for k, v in weight_assign_dict.items()}

    # prepare output
    output = {
        'model': model,
        'layers_dict': layers_dict,
        'weight_assign_dict': weight_assign_dict,
        'device_map': device_map
    }
    output = AttrDict(output)
    return output

output = get_policy_weight_map(model, policy)

2023-09-19 13:50:20,681 [4062821495.py:124 in get_policy_weight_map] INFO - Loading weights of CausalLM
 facebook/opt-6.7b: GPU Mem 0.00 GiB (0.00%), CPU Mem 3.72 GiB (29.09%), Disk Mem 9.07 Gib (70.91%)


In [3]:
device_map = output.device_map
device_map

# check_tied_parameters_on_same_device(tied_params, device_map)
# for tie_param in tied_params:
#     dev = set([device_map[p] for p in tie_param]).pop()
#     for p in tie_param:
#         device_map[p] = dev
    

{'model.decoder.embed_tokens.weight': 'disk',
 'model.decoder.embed_positions.weight': 'disk',
 'model.decoder.final_layer_norm.weight': 'cpu',
 'model.decoder.final_layer_norm.bias': 'disk',
 'model.decoder.layers.0.self_attn.k_proj.weight': 'cpu',
 'model.decoder.layers.0.self_attn.k_proj.bias': 'cpu',
 'model.decoder.layers.0.self_attn.v_proj.weight': 'cpu',
 'model.decoder.layers.0.self_attn.v_proj.bias': 'cpu',
 'model.decoder.layers.0.self_attn.q_proj.weight': 'cpu',
 'model.decoder.layers.0.self_attn.q_proj.bias': 'cpu',
 'model.decoder.layers.0.self_attn.out_proj.weight': 'cpu',
 'model.decoder.layers.0.self_attn.out_proj.bias': 'disk',
 'model.decoder.layers.0.self_attn_layer_norm.weight': 'disk',
 'model.decoder.layers.0.self_attn_layer_norm.bias': 'disk',
 'model.decoder.layers.0.fc1.weight': 'disk',
 'model.decoder.layers.0.fc1.bias': 'disk',
 'model.decoder.layers.0.fc2.weight': 'disk',
 'model.decoder.layers.0.fc2.bias': 'disk',
 'model.decoder.layers.0.final_layer_norm.w

In [4]:



from huggingface_hub import snapshot_download
weights_location = snapshot_download(checkpoint, allow_patterns=["*.bin", 'pytorch_model.bin.index.json'])
model.model = load_checkpoint_and_dispatch(
    output.model.model, # should be base model? e.g. output.model.model
    weights_location,
    device_map=device_map, 
    offload_folder='offload/' + checkpoint.replace('/', '.'), 
    offload_state_dict=True
)

# model = AutoModelForCausalLM.from_pretrained(
#     checkpoint, 
#     device_map=device_map, 
#     offload_folder='offload/' + checkpoint.replace('/', '.'), 
#     offload_state_dict=True
# )

logging.info(f'Model initialized!')

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

ValueError: decoder.embed_tokens.weight doesn't have any device set.

In [None]:
model.lm_head.weight

Parameter containing:
tensor(..., device='meta', size=(50272, 4096), requires_grad=True)

In [None]:
output.layers_dict.keys()

dict_keys(['model.decoder.embed_tokens', 'model.decoder.embed_positions', 'model.decoder.final_layer_norm', 'model.decoder.layers.0', 'model.decoder.layers.1', 'model.decoder.layers.2', 'model.decoder.layers.3', 'model.decoder.layers.4', 'model.decoder.layers.5', 'model.decoder.layers.6', 'model.decoder.layers.7', 'model.decoder.layers.8', 'model.decoder.layers.9', 'model.decoder.layers.10', 'model.decoder.layers.11', 'model.decoder.layers.12', 'model.decoder.layers.13', 'model.decoder.layers.14', 'model.decoder.layers.15', 'model.decoder.layers.16', 'model.decoder.layers.17', 'model.decoder.layers.18', 'model.decoder.layers.19', 'model.decoder.layers.20', 'model.decoder.layers.21', 'model.decoder.layers.22', 'model.decoder.layers.23', 'model.decoder.layers.24', 'model.decoder.layers.25', 'model.decoder.layers.26', 'model.decoder.layers.27', 'model.decoder.layers.28', 'model.decoder.layers.29', 'model.decoder.layers.30', 'model.decoder.layers.31', 'lm_head'])

In [None]:
# load / offload
#     module object, .dat file path
#     layer: pre / post forward hook

In [None]:

def compute_activation_assignment(num_layers, offload_config: Policy):
    logging.debug(f"<compute_activation_assignment> enter")
    gpu_batch_limit = int(offload_config.num_gpu_batches * offload_config.act_gpu_percent)
    cpu_batch_limit = int(offload_config.num_gpu_batches * (offload_config.act_gpu_percent + offload_config.act_cpu_percent))
    logging.debug(f"<compute_activation_assignment> gpu_batch_limit: {gpu_batch_limit}, cpu_batch_limit: {cpu_batch_limit}")
    
    act_assign_dict = {}
    for l in range(num_layers):
        for i in range(offload_config.num_gpu_batches):
            act_key = f"layer.{l}_index.{i}"
            if i < gpu_batch_limit:
                device = 'cuda'
            elif i < cpu_batch_limit:
                device = 'cpu'
            else:
                device = 'disk'
            act_assign_dict[act_key]= {'assigned_device': device}
    return act_assign_dict


def compute_kv_cache_assignment(num_layers, offload_config: OffloadConfig):
    logging.debug(f"<compute_kv_cache_assignment> enter")
    gpu_batch_limit = int(offload_config.num_gpu_batches * offload_config.cache_gpu_percent)
    cpu_batch_limit = int(offload_config.num_gpu_batches * (offload_config.cache_gpu_percent + offload_config.cache_cpu_percent))
    logging.debug(f"<compute_kv_cache_assignment> gpu_batch_limit: {gpu_batch_limit}, cpu_batch_limit: {cpu_batch_limit}")
    
    act_assign_dict = {}
    for l in range(num_layers):
        for i in range(offload_config.num_gpu_batches):
            key_cache_key = f"key_layer.{l}_index.{i}"
            value_cache_key = f"key_layer.{l}_index.{i}"
            if i < gpu_batch_limit:
                device = 'cuda'
            elif i < cpu_batch_limit:
                device = 'cpu'
            else:
                device = 'disk'
            act_assign_dict[key_cache_key] = {'assigned_device': device}
            act_assign_dict[value_cache_key] = {'assigned_device': device}
    return act_assign_dict


NameError: name 'OffloadConfig' is not defined