In [1]:
import numpy as np
import torch
from torch.nn import Module 
import functools 

from flexgen_utils import logging, Policy, get_module_from_name

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

from flexgen_init import policy_init

2023-10-04 07:08:01,020 [instantiator.py:21 in <module>] INFO - Created a temporary directory at /tmp/tmphh60a97l
2023-10-04 07:08:01,022 [instantiator.py:76 in _write] INFO - Writing /tmp/tmphh60a97l/_remote_module_non_scriptable.py


In [2]:
checkpoint = "facebook/opt-125m"

policy = Policy(
    gpu_batch_size=8, 
    num_gpu_batches=4, 
    weights_gpu_percent=0.0, 
    weights_cpu_percent=0.3, 
    cache_gpu_percent=0.0, 
    cache_cpu_percent=0.2, 
    act_gpu_percent=0.0, 
    act_cpu_percent=0.5, 
    overlap=True, 
    pin_weight=True,
)

# for test
gbs = policy.gpu_batch_size
ngb = policy.num_gpu_batches
num_prompts = ngb * gbs 

# model init
output = policy_init(checkpoint, policy)

model = output.model
weight_map = output.weight_map
layer_names = output.layer_names
index = output.index
dat_files = output.dat_files
tied_params = output.tied_params
offload_folder = output.offload_folder

2023-10-04 07:08:02,015 [connectionpool.py:1003 in _new_conn] DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2023-10-04 07:08:02,074 [connectionpool.py:456 in _make_request] DEBUG - https://huggingface.co:443 "HEAD /facebook/opt-125m/resolve/main/config.json HTTP/1.1" 200 0
2023-10-04 07:08:02.753724: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-04 07:08:03,729 [tpu_cluster_resolver.py:32 in <module>] DEBUG - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
2023-10-04 07:08:03,910 [__init__.py:47 in <module>] DEBUG - Creating converter from 7 to 5
2023-10-04 07:08:03,912 [__init__.py:47 in <module>] DEBUG - Creating converter from 5 to 7
2023-10-04 07:

In [18]:
import os 
from accelerate.utils import named_module_tensors 
from flexgen_utils import get_tied_target
from flexgen_utils import flexgen_load_module_tensor, flexgen_offload_module_tensor

def load_layer_weights(model, layer_name, compute_device, offload_folder, dat_files):
    logger.debug(f'load_layer_weights: {layer_name} to {compute_device}')
    layer_module = get_module_from_name(model, layer_name)
    weight_names = [layer_name + '.' + name for name, _ in named_module_tensors(layer_module, True, True)]
    layer_dat_files = [os.path.join(offload_folder, get_tied_target(w, tied_params, dat_files) + '.dat') for w in weight_names]
    assert all([os.path.isfile(f) for f in layer_dat_files]), f'dat file error, {dat_files}'
    
    for w in weight_names:
        flexgen_load_module_tensor(model, w, compute_device, index, offload_folder, tied_params)


def offload_layer_weights(model, layer_name, weight_map):
    logger.debug(f'offload_layer_weights: {layer_name}')
    layer_module = get_module_from_name(model, layer_name)
    weight_names = [layer_name + '.' + name for name, _ in named_module_tensors(layer_module, True, True)]
    for w in weight_names:
        flexgen_offload_module_tensor(model, w, weight_map) 

def get_info(obj): # recursive
    if isinstance(obj, tuple):
        return tuple(get_info(o) for o in obj)
    elif isinstance(obj, dict):
        return {k:get_info(v) for k, v in obj.items()}
    elif isinstance(obj, torch.Tensor):
        return obj.size()
    else:
        return type(obj)

def get_kth_batch_inputs(inputs, k, gpu_batch_size): # for both args, kwargs
    if isinstance(inputs, tuple):
        return tuple(get_kth_batch_inputs(inp, k, gpu_batch_size) for inp in inputs)
    elif isinstance(inputs, dict):
        return {k:get_kth_batch_inputs(v, k, gpu_batch_size) for k, v in inputs.items()}
    elif isinstance(inputs, torch.Tensor):
        return inputs[k * gpu_batch_size:(k + 1) * gpu_batch_size]
    else: # int 
        return inputs

def concat_outputs(outputs, ): # concat K outputs to one output
    ans = []
    for elem in zip(*outputs):
        if isinstance(elem[0], torch.Tensor):
            ans.append(torch.cat(elem, dim=0))
        elif isinstance(elem[0], tuple):
            ans.append(tuple(concat_outputs(elem)))
        else: # all the same
            ans.append(elem[0])
    return ans 

def to_flexgen_forward(model, layer_names, j, compute_device, weight_map, offload_folder):
    # rewrite the j-th layer's forward
    
    layer_name = layer_names[j]
    next_layer_name = layer_names[(j + 1) % len(layer_names)]

    layer = get_module_from_name(model, layer_name)  
    if hasattr(layer, "_flexgen_old_forward"): # has been rewriten
        return layer 
    
    logger.debug(f'{layer_name} to flexgen forward')
    layer._flexgen_old_forward = old_forward = layer.forward 

    @functools.wraps(old_forward)
    def new_forward(*args, **kwargs):
        # pre fwd: load curr & next weights
        load_layer_weights(model, layer_name, compute_device, offload_folder, dat_files)
        load_layer_weights(model, next_layer_name, compute_device, offload_folder, dat_files)
        
        # loop forward pass of K minibatches
        with torch.no_grad():
            outputs = []
            for k in range(ngb):
                logger.debug(f'layer: {layer_name}, {k}-th gpu batch')
                args_k = get_kth_batch_inputs(args, k, gbs)
                kwargs_k = get_kth_batch_inputs(kwargs, k, gbs)
                output = old_forward(*args_k, **kwargs_k)
                outputs.append(output) 
            
            output = concat_outputs(outputs)
                

        # post fwd: free curr weights
        offload_layer_weights(model, layer_name, weight_map)
        return output

    layer.forward = new_forward
    return layer

def to_old_forward(model, layer_name):
    layer = get_module_from_name(model, layer_name) 

    if hasattr(layer, "_flexgen_old_forward"):
        layer.forward = layer._flexgen_old_forward
        delattr(layer, "_flexgen_old_forward")
    logger.debug(f'{layer_name} to old forward')
    return layer


In [21]:

# rewrite layers' forward
layer_nums = len(layer_names)
for j in range(layer_nums):
    compute_device = 'cpu'
    to_flexgen_forward(model, layer_names, j, compute_device, weight_map, offload_folder)
    

2023-10-04 07:21:16,140 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.embed_tokens to flexgen version forward
2023-10-04 07:21:16,143 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.embed_positions to flexgen version forward
2023-10-04 07:21:16,144 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.final_layer_norm to flexgen version forward
2023-10-04 07:21:16,145 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.layers.0 to flexgen version forward
2023-10-04 07:21:16,147 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.layers.1 to flexgen version forward
2023-10-04 07:21:16,148 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.layers.2 to flexgen version forward
2023-10-04 07:21:16,150 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.layers.3 to flexgen version forward
2023-10-04 07:21:16,151 [3075400243.py:65 in to_flexgen_forward] DEBUG - model.decoder.layers.4 to flexgen version forwa

In [20]:
for j in range(layer_nums):
    to_old_forward(model, layer_names[j])
    

2023-10-04 07:21:13,227 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.embed_tokens to old forward
2023-10-04 07:21:13,229 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.embed_positions to old forward
2023-10-04 07:21:13,230 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.final_layer_norm to old forward
2023-10-04 07:21:13,231 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.layers.0 to old forward
2023-10-04 07:21:13,232 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.layers.1 to old forward
2023-10-04 07:21:13,233 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.layers.2 to old forward
2023-10-04 07:21:13,234 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.layers.3 to old forward
2023-10-04 07:21:13,235 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.layers.4 to old forward
2023-10-04 07:21:13,236 [3075400243.py:100 in to_old_forward] DEBUG - model.decoder.layers.5 to old forward
2023-10-0

In [22]:
# generate test
from transformers import AutoTokenizer

prompts = [
    'Who are you? Are you conscious?',
    'Where is Deutschland?',
    'How is Huawei Mate 60 Pro?'
] 
prompts = prompts * (gbs * ngb // len(prompts)) + prompts[:(gbs * ngb % len(prompts))]

prompt_len = 10

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(prompts, padding="max_length", max_length=prompt_len, return_tensors="pt")

# Generate
generate_ids = model.generate(
    inputs.input_ids, 
    max_length=30 + prompt_len,
    # num_beams=2,
    # num_beam_groups=2,
    # diversity_penalty=0.1,
    do_sample=True,
)

output_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for output_text in output_texts:
    logging.info(output_text)
    logging.info('-' * 10)

2023-10-04 07:21:32,146 [connectionpool.py:456 in _make_request] DEBUG - https://huggingface.co:443 "HEAD /facebook/opt-125m/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
2023-10-04 07:21:32,352 [3075400243.py:7 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.embed_tokens to cpu
2023-10-04 07:21:32,355 [3075400243.py:7 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.embed_positions to cpu
2023-10-04 07:21:32,356 [3075400243.py:78 in new_forward] DEBUG - layer: model.decoder.embed_tokens, 0-th gpu batch
2023-10-04 07:21:32,359 [3075400243.py:78 in new_forward] DEBUG - layer: model.decoder.embed_tokens, 1-th gpu batch
2023-10-04 07:21:32,363 [3075400243.py:78 in new_forward] DEBUG - layer: model.decoder.embed_tokens, 2-th gpu batch
2023-10-04 07:21:32,366 [3075400243.py:78 in new_forward] DEBUG - layer: model.decoder.embed_tokens, 3-th gpu batch
2023-10-04 07:21:32,370 [3075400243.py:18 in offload_layer_weights] DEBUG - offload_layer_weights: model.dec

AttributeError: 'list' object has no attribute 'dtype'