In [1]:
import numpy as np
import torch
from torch.nn import Module 
import functools 

from flexgen_utils import logging, Policy, get_module_from_name

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

from flexgen_init import policy_init

2023-10-04 08:07:36,847 [instantiator.py:21 in <module>] INFO - Created a temporary directory at /tmp/tmpzmr40fn3
2023-10-04 08:07:36,848 [instantiator.py:76 in _write] INFO - Writing /tmp/tmpzmr40fn3/_remote_module_non_scriptable.py


In [2]:
checkpoint = "facebook/opt-125m"

policy = Policy(
    gpu_batch_size=8, 
    num_gpu_batches=4, 
    weights_gpu_percent=0.0, 
    weights_cpu_percent=0.3, 
    cache_gpu_percent=0.0, 
    cache_cpu_percent=0.2, 
    act_gpu_percent=0.0, 
    act_cpu_percent=0.5, 
    overlap=True, 
    pin_weight=True,
)

# for test
gbs = policy.gpu_batch_size
ngb = policy.num_gpu_batches
num_prompts = ngb * gbs 

# model init
output = policy_init(checkpoint, policy)

model = output.model
weight_map = output.weight_map
layer_names = output.layer_names
index = output.index
dat_files = output.dat_files
tied_params = output.tied_params
offload_folder = output.offload_folder

2023-10-04 08:07:37,247 [connectionpool.py:1003 in _new_conn] DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2023-10-04 08:07:37,304 [connectionpool.py:456 in _make_request] DEBUG - https://huggingface.co:443 "HEAD /facebook/opt-125m/resolve/main/config.json HTTP/1.1" 200 0
2023-10-04 08:07:37.945701: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-04 08:07:38,909 [tpu_cluster_resolver.py:32 in <module>] DEBUG - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
2023-10-04 08:07:39,079 [__init__.py:47 in <module>] DEBUG - Creating converter from 7 to 5
2023-10-04 08:07:39,080 [__init__.py:47 in <module>] DEBUG - Creating converter from 5 to 7
2023-10-04 08:

In [3]:
l1 = (1, 'b', ('c', ['d']))
l2 = (1, 'f', ('g', ['h']))
l3 = (1, 'j', ('k', ['l']))
ls = [l1, l2, l3]
def f(ls):
    # l0 = ls[0]
    ans = []
    for elem in zip(*ls):
        if isinstance(elem[0], str):
            ans.append(''.join(elem))
        elif isinstance(elem[0], tuple):
            ans.append(tuple(f(elem)))
        elif isinstance(elem[0], list):
            ans.append(f(elem))
        else: # int
            ans.append(elem[0])
        print(elem)
    return ans
f(ls)

(1, 1, 1)
('b', 'f', 'j')
('c', 'g', 'k')
('d', 'h', 'l')
(['d'], ['h'], ['l'])
(('c', ['d']), ('g', ['h']), ('k', ['l']))


[1, 'bfj', ('cgk', ['dhl'])]

In [4]:
import os 
from accelerate.utils import named_module_tensors 
from flexgen_utils import get_tied_target
from flexgen_utils import flexgen_load_module_tensor, flexgen_offload_module_tensor

def load_layer_weights(model, layer_name, compute_device, offload_folder, dat_files):
    logger.debug(f'load_layer_weights: {layer_name} to {compute_device}')
    layer_module = get_module_from_name(model, layer_name)
    weight_names = [layer_name + '.' + name for name, _ in named_module_tensors(layer_module, True, True)]
    layer_dat_files = [os.path.join(offload_folder, get_tied_target(w, tied_params, dat_files) + '.dat') for w in weight_names]
    assert all([os.path.isfile(f) for f in layer_dat_files]), f'dat file error, {dat_files}'
    
    for w in weight_names:
        flexgen_load_module_tensor(model, w, compute_device, index, offload_folder, tied_params)


def offload_layer_weights(model, layer_name, weight_map):
    logger.debug(f'offload_layer_weights: {layer_name}')
    layer_module = get_module_from_name(model, layer_name)
    weight_names = [layer_name + '.' + name for name, _ in named_module_tensors(layer_module, True, True)]
    for w in weight_names:
        flexgen_offload_module_tensor(model, w, weight_map) 

def get_size_info(obj): # recursive
    if isinstance(obj, tuple):
        return tuple(get_size_info(o) for o in obj)
    elif isinstance(obj, list):
        return list(get_size_info(o) for o in obj)
    elif isinstance(obj, dict):
        return {k:get_size_info(v) for k, v in obj.items()}
    elif isinstance(obj, torch.Tensor):
        return obj.size()
    else:
        return type(obj)

def get_kth_batch_inputs(inputs, k, gpu_batch_size): # for both args, kwargs
    if isinstance(inputs, tuple):
        return tuple(get_kth_batch_inputs(inp, k, gpu_batch_size) for inp in inputs)
    elif isinstance(inputs, dict):
        return {key:get_kth_batch_inputs(value, k, gpu_batch_size) for key, value in inputs.items()}
    elif isinstance(inputs, torch.Tensor):
        return inputs[k * gpu_batch_size:(k + 1) * gpu_batch_size]
    else: # int 
        return inputs

def concat_outputs(outputs: list): # concat K outputs to one output
    if isinstance(outputs[0], torch.Tensor):
        return torch.cat(outputs, dim=0)
    elif isinstance(outputs[0], tuple):
        ans = []
        for elem in zip(*outputs):
            if isinstance(elem[0], torch.Tensor):
                ans.append(torch.cat(elem, dim=0))
            elif isinstance(elem[0], tuple):
                ans.append(tuple(concat_outputs(elem)))
            else: # all the same
                ans.append(elem[0])
        return tuple(ans) 
    else:
        raise NotImplementedError(f'outputs concat function of type {type(outputs[0])} is not implemented.')


def to_flexgen_forward(model, layer_names, j, compute_device, weight_map, offload_folder, ngb, gbs):
    # rewrite the j-th layer's forward
    
    layer_name = layer_names[j]
    next_layer_name = layer_names[(j + 1) % len(layer_names)]

    layer = get_module_from_name(model, layer_name)  
    if hasattr(layer, "_flexgen_old_forward"): # has been rewriten
        return layer 
    
    logger.debug(f'{layer_name} to flexgen forward')
    layer._flexgen_old_forward = old_forward = layer.forward 

    @functools.wraps(old_forward)
    def new_forward(*args, **kwargs):
        # pre fwd: load curr & next weights
        load_layer_weights(model, layer_name, compute_device, offload_folder, dat_files)
        load_layer_weights(model, next_layer_name, compute_device, offload_folder, dat_files)
        
        # loop forward pass of K minibatches
        with torch.no_grad():
            # output = old_forward(*args, **kwargs)
            # logger.debug(f'output: {get_size_info(output)}')

            # args_0 = get_kth_batch_inputs(args, 0, gbs)
            # kwargs_0 = get_kth_batch_inputs(kwargs, 0, gbs)
            # output_0 = old_forward(*args_0, **kwargs_0)
            # logger.debug(f'output0: {get_size_info(output_0)}')

            outputs = []
            for k in range(ngb):
                logger.debug(f'layer: {layer_name}, batch: {k}')
                # pre fwd: load curr & next inputs, store prev output
                args_k = get_kth_batch_inputs(args, k, gbs)
                kwargs_k = get_kth_batch_inputs(kwargs, k, gbs)

                output = old_forward(*args_k, **kwargs_k)
                outputs.append(output) 
                
                # post fwd: offload curr inputs

            logger.debug(f'before concat: {ngb} x {get_size_info(outputs[0])}')
            output = concat_outputs(outputs)
            logger.debug(f'after concat: {get_size_info(output)}')                

        # post fwd: free curr weights
        offload_layer_weights(model, layer_name, weight_map)
        return output

    layer.forward = new_forward
    return layer

def to_old_forward(model, layer_name):
    layer = get_module_from_name(model, layer_name) 

    if hasattr(layer, "_flexgen_old_forward"):
        layer.forward = layer._flexgen_old_forward
        delattr(layer, "_flexgen_old_forward")
        logger.debug(f'{layer_name} to old forward')
    return layer


layer_nums = len(layer_names)

for j in range(layer_nums):
    to_old_forward(model, layer_names[j])
    
# rewrite layers' forward
for j in range(layer_nums):
    compute_device = 'cpu'
    to_flexgen_forward(model, layer_names, j, compute_device, weight_map, offload_folder, ngb, gbs)
    


2023-10-04 08:07:40,536 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.embed_tokens to flexgen forward
2023-10-04 08:07:40,537 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.embed_positions to flexgen forward
2023-10-04 08:07:40,538 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.final_layer_norm to flexgen forward
2023-10-04 08:07:40,539 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.layers.0 to flexgen forward
2023-10-04 08:07:40,540 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.layers.1 to flexgen forward
2023-10-04 08:07:40,541 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.layers.2 to flexgen forward
2023-10-04 08:07:40,542 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.layers.3 to flexgen forward
2023-10-04 08:07:40,543 [3008224673.py:73 in to_flexgen_forward] DEBUG - model.decoder.layers.4 to flexgen forward
2023-10-04 08:07:40,544 [3008224673.py:73 in to_flexgen_forwa

In [5]:
# generate test
from transformers import AutoTokenizer

prompts = [
    'Who are you? Are you conscious?',
    'Where is Deutschland?',
    'How is Huawei Mate 60 Pro?'
] 
prompts = prompts * (gbs * ngb // len(prompts)) + prompts[:(gbs * ngb % len(prompts))]

prompt_len = 10

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(prompts, padding="max_length", max_length=prompt_len, return_tensors="pt")

# Generate
generate_ids = model.generate(
    inputs.input_ids, 
    max_length=30 + prompt_len,
    # num_beams=2,
    # num_beam_groups=2,
    # diversity_penalty=0.1,
    do_sample=True,
)

output_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for output_text in output_texts:
    logging.info(output_text)
    logging.info('-' * 10)

2023-10-04 08:07:40,601 [connectionpool.py:456 in _make_request] DEBUG - https://huggingface.co:443 "HEAD /facebook/opt-125m/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
2023-10-04 08:07:40,786 [3008224673.py:7 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.embed_tokens to cpu
2023-10-04 08:07:40,788 [3008224673.py:7 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.embed_positions to cpu
2023-10-04 08:07:40,789 [3008224673.py:94 in new_forward] DEBUG - layer: model.decoder.embed_tokens, batch: 0
2023-10-04 08:07:40,792 [3008224673.py:94 in new_forward] DEBUG - layer: model.decoder.embed_tokens, batch: 1
2023-10-04 08:07:40,795 [3008224673.py:94 in new_forward] DEBUG - layer: model.decoder.embed_tokens, batch: 2
2023-10-04 08:07:40,798 [3008224673.py:94 in new_forward] DEBUG - 

2023-10-04 08:07:41,134 [3008224673.py:94 in new_forward] DEBUG - layer: model.decoder.layers.4, batch: 1
2023-10-04 08:07:41,144 [3008224673.py:94 in new_forward] DEBUG - layer: model.decoder.layers.4, batch: 2
2023-10-04 08:07:41,152 [3008224673.py:94 in new_forward] DEBUG - layer: model.decoder.layers.4, batch: 3
2023-10-04 08:07:41,163 [3008224673.py:104 in new_forward] DEBUG - before concat: 4 x (torch.Size([8, 10, 768]), (torch.Size([8, 12, 10, 64]), torch.Size([8, 12, 10, 64])))
2023-10-04 08:07:41,166 [3008224673.py:106 in new_forward] DEBUG - after concat: (torch.Size([32, 10, 768]), (torch.Size([32, 12, 10, 64]), torch.Size([32, 12, 10, 64])))
2023-10-04 08:07:41,167 [3008224673.py:18 in offload_layer_weights] DEBUG - offload_layer_weights: model.decoder.layers.4
2023-10-04 08:07:41,171 [3008224673.py:7 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.layers.5 to cpu
2023-10-04 08:07:41,179 [3008224673.py:7 in load_layer_weights] DEBUG - load_layer_weights: mo