In [1]:
import numpy as np
import torch
from torch.nn import Module 
import functools 

from flexgen_utils import logging, Policy, get_module_from_name

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

from flexgen_init import policy_init

2023-10-04 09:29:58,047 [instantiator.py:21 in <module>] INFO - Created a temporary directory at /tmp/tmpisnd463d
2023-10-04 09:29:58,049 [instantiator.py:76 in _write] INFO - Writing /tmp/tmpisnd463d/_remote_module_non_scriptable.py


In [2]:
checkpoint = "facebook/opt-125m" # 125m 6.7b 13b 30b

policy = Policy(
    gpu_batch_size=2, 
    num_gpu_batches=4, 
    weights_gpu_percent=0.0, 
    weights_cpu_percent=0.3, 
    cache_gpu_percent=0.0, 
    cache_cpu_percent=0.2, 
    act_gpu_percent=0.0, 
    act_cpu_percent=0.5, 
    overlap=True, 
    pin_weight=True,
)

# for test
gbs = policy.gpu_batch_size
ngb = policy.num_gpu_batches
num_prompts = ngb * gbs 

# model init
output = policy_init(checkpoint, policy)

model = output.model
weight_map = output.weight_map
layer_names = output.layer_names
index = output.index
dat_files = output.dat_files
tied_params = output.tied_params
offload_folder = output.offload_folder

2023-10-04 09:29:58,493 [connectionpool.py:1003 in _new_conn] DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2023-10-04 09:29:58,561 [connectionpool.py:456 in _make_request] DEBUG - https://huggingface.co:443 "HEAD /facebook/opt-125m/resolve/main/config.json HTTP/1.1" 200 0
2023-10-04 09:29:59.266711: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-04 09:30:00,279 [tpu_cluster_resolver.py:32 in <module>] DEBUG - Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
2023-10-04 09:30:00,481 [__init__.py:47 in <module>] DEBUG - Creating converter from 7 to 5
2023-10-04 09:30:00,483 [__init__.py:47 in <module>] DEBUG - Creating converter from 5 to 7
2023-10-04 09:

In [3]:
import os 
from accelerate.utils import named_module_tensors 
from flexgen_utils import get_tied_target
from flexgen_utils import flexgen_load_module_tensor, flexgen_offload_module_tensor
from flexgen_minibatch import get_size_info, get_kth_batch_inputs, concat_outputs

def load_layer_weights(model, layer_name, compute_device, offload_folder, dat_files):
    logger.debug(f'load_layer_weights: {layer_name} to {compute_device}')
    layer_module = get_module_from_name(model, layer_name)
    weight_names = [layer_name + '.' + name for name, _ in named_module_tensors(layer_module, True, True)]
    layer_dat_files = [os.path.join(offload_folder, get_tied_target(w, tied_params, dat_files) + '.dat') for w in weight_names]
    assert all([os.path.isfile(f) for f in layer_dat_files]), f'dat file error, {dat_files}'
    
    for w in weight_names:
        flexgen_load_module_tensor(model, w, compute_device, index, offload_folder, tied_params)


def offload_layer_weights(model, layer_name, weight_map):
    logger.debug(f'offload_layer_weights: {layer_name}\n\n')
    layer_module = get_module_from_name(model, layer_name)
    weight_names = [layer_name + '.' + name for name, _ in named_module_tensors(layer_module, True, True)]
    for w in weight_names:
        flexgen_offload_module_tensor(model, w, weight_map) 


def to_flexgen_forward(model, layer_names, j, compute_device, weight_map, offload_folder, ngb, gbs):
    # rewrite the j-th layer's forward
    
    layer_name = layer_names[j]
    next_layer_name = layer_names[(j + 1) % len(layer_names)]

    layer = get_module_from_name(model, layer_name)  
    if hasattr(layer, "_flexgen_old_forward"): # has been rewriten
        return layer 
    
    logger.debug(f'{layer_name} to flexgen forward')
    layer._flexgen_old_forward = old_forward = layer.forward 

    @functools.wraps(old_forward)
    def new_forward(*args, **kwargs):
        # pre fwd: load curr & next weights, TODO: cuda stream
        load_layer_weights(model, layer_name, compute_device, offload_folder, dat_files)
        load_layer_weights(model, next_layer_name, compute_device, offload_folder, dat_files)
        
        # loop forward pass of K minibatches, TODO: cuda stream
        with torch.no_grad():
            logger.debug(f'args: {get_size_info(args)}')
            logger.debug(f'kwargs: {get_size_info(kwargs)}')
            # output = old_forward(*args, **kwargs)
            # logger.debug(f'output: {get_size_info(output)}')

            args_0 = get_kth_batch_inputs(args, 0, gbs)
            kwargs_0 = get_kth_batch_inputs(kwargs, 0, gbs)
            logger.debug(f'args_0: {get_size_info(args_0)}')
            logger.debug(f'kwargs_0: {get_size_info(kwargs_0)}')
            # output_0 = old_forward(*args_0, **kwargs_0)
            # logger.debug(f'output0: {get_size_info(output_0)}')

            outputs = []
            for k in range(ngb):
                logger.debug(f'layer: {layer_name}, batch: {k}')

                # 'pre' fwd: load curr & next inputs (activations, KV cache), store & offload prev 
                args_k = get_kth_batch_inputs(args, k, gbs)
                kwargs_k = get_kth_batch_inputs(kwargs, k, gbs)

                # the k-th fwd pass
                output = old_forward(*args_k, **kwargs_k)
                outputs.append(output) 
                
                # 'post' fwd: offload curr inputs

            logger.debug(f'outputs before concat: {ngb} x {get_size_info(outputs[0])}')
            output = concat_outputs(outputs)
            logger.debug(f'outputs after concat: {get_size_info(output)}')                

        # post fwd: free curr weights
        offload_layer_weights(model, layer_name, weight_map)
        return output

    layer.forward = new_forward
    return layer


def to_old_forward(model, layer_name):
    layer = get_module_from_name(model, layer_name) 

    if hasattr(layer, "_flexgen_old_forward"):
        layer.forward = layer._flexgen_old_forward
        delattr(layer, "_flexgen_old_forward")
        logger.debug(f'{layer_name} to old forward')
    return layer


layer_nums = len(layer_names)

for j in range(layer_nums):
    to_old_forward(model, layer_names[j])
    
# rewrite layers' forward
for j in range(layer_nums):
    compute_device = 'cpu'
    to_flexgen_forward(model, layer_names, j, compute_device, weight_map, offload_folder, ngb, gbs)
    


2023-10-04 09:30:01,919 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.embed_tokens to flexgen forward
2023-10-04 09:30:01,921 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.embed_positions to flexgen forward
2023-10-04 09:30:01,922 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.final_layer_norm to flexgen forward
2023-10-04 09:30:01,923 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.layers.0 to flexgen forward
2023-10-04 09:30:01,924 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.layers.1 to flexgen forward
2023-10-04 09:30:01,925 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.layers.2 to flexgen forward
2023-10-04 09:30:01,926 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.layers.3 to flexgen forward
2023-10-04 09:30:01,927 [2440752808.py:36 in to_flexgen_forward] DEBUG - model.decoder.layers.4 to flexgen forward
2023-10-04 09:30:01,928 [2440752808.py:36 in to_flexgen_forwa

In [4]:
# generate test
from transformers import AutoTokenizer

prompts = [
    'Who are you? Are you conscious?',
    'Where is Deutschland?',
    'How is Huawei Mate 60 Pro?'
] 
prompts = prompts * (gbs * ngb // len(prompts)) + prompts[:(gbs * ngb % len(prompts))]

prompt_len = 10

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(prompts, padding="max_length", max_length=prompt_len, return_tensors="pt")

# Generate
generate_ids = model.generate(
    inputs.input_ids, 
    max_length=30 + prompt_len,
    # num_beams=2, #
    # num_beam_groups=2, #
    # diversity_penalty=0.1, #
    do_sample=True, #
)

output_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for output_text in output_texts:
    logging.info(output_text)
    logging.info('-' * 10)

2023-10-04 09:30:01,984 [connectionpool.py:456 in _make_request] DEBUG - https://huggingface.co:443 "HEAD /facebook/opt-125m/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
2023-10-04 09:30:02,225 [2440752808.py:8 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.embed_tokens to cpu
2023-10-04 09:30:02,227 [2440752808.py:8 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.embed_positions to cpu
2023-10-04 09:30:02,228 [2440752808.py:47 in new_forward] DEBUG - args: (torch.Size([8, 10]),)
2023-10-04 09:30:02,229 [2440752808.py:48 in new_forward] DEBUG - kwargs: {}
2023-10-04 09:30:02,230 [2440752808.py:54 in new_forward] DEBUG - args_0: (torch.Size([2, 10]),)
2023-10-04 09:30:02,231 [2440752808.py:55 in new_forward] DEBUG - kwargs_0: {}
2023-10-04 09:30:02,232 [2440752808.py:61 in new

2023-10-04 09:30:03,562 [2440752808.py:8 in load_layer_weights] DEBUG - load_layer_weights: model.decoder.layers.7 to cpu
2023-10-04 09:30:03,570 [2440752808.py:47 in new_forward] DEBUG - args: (torch.Size([8, 1, 768]),)
2023-10-04 09:30:03,571 [2440752808.py:48 in new_forward] DEBUG - kwargs: {'attention_mask': torch.Size([8, 1, 1, 11]), 'layer_head_mask': None, 'past_key_value': (torch.Size([8, 12, 10, 64]), torch.Size([8, 12, 10, 64])), 'output_attentions': False, 'use_cache': True}
2023-10-04 09:30:03,572 [2440752808.py:54 in new_forward] DEBUG - args_0: (torch.Size([2, 1, 768]),)
2023-10-04 09:30:03,573 [2440752808.py:55 in new_forward] DEBUG - kwargs_0: {'attention_mask': torch.Size([2, 1, 1, 11]), 'layer_head_mask': None, 'past_key_value': (torch.Size([2, 12, 10, 64]), torch.Size([2, 12, 10, 64])), 'output_attentions': False, 'use_cache': True}
2023-10-04 09:30:03,573 [2440752808.py:61 in new_forward] DEBUG - layer: model.decoder.layers.6, batch: 0
2023-10-04 09:30:03,579 [24407