In [1]:
import torch
from transformers import AutoTokenizer, OPTForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import dataclasses
import os
import numpy as np
from tqdm import tqdm

In [4]:
@dataclasses.dataclass(frozen=True)
class Policy:
    gpu_batch_size: int
    num_gpu_batches: int

    # percent = a means a%
    w_gpu_percent: float
    w_cpu_percent: float
    cache_gpu_percent: float
    cache_cpu_percent: float
    act_gpu_percent: float
    act_cpu_percent: float

    # Whether to overlap the I/O and compute
    overlap: bool

    # Whether to separate attention and mlp as two layers
    # sep_layer: bool

    # Whether to use pinned memory for weights on CPU
    pin_weight: bool

    # Whether to compute attention on CPU
    # cpu_cache_compute: bool

    # Sparsity of attention weights
    # attn_sparsity: float

    # Compress weights with group-wise quantization
    # compress_weight: bool
    # comp_weight_config: CompressionConfig

    # Compress KV cache with group-wise quantization
    # compress_cache: bool
    # comp_cache_config: CompressionConfig

    @property
    def w_disk_percent(self):
        return 100 - self.w_gpu_percent - self.w_cpu_percent

    @property
    def cache_disk_percent(self):
        return 100 - self.cache_gpu_percent - self.cache_cpu_percent

    @property
    def act_disk_percent(self):
        return 100 - self.act_gpu_percent - self.act_cpu_percent



In [5]:
policy = Policy(
    gpu_batch_size=4, 
    num_gpu_batches=12, 
    w_gpu_percent=20, 
    w_cpu_percent=30, 
    cache_gpu_percent=20, 
    cache_cpu_percent=30, 
    act_gpu_percent=0, 
    act_cpu_percent=100, 
    overlap=True,
    pin_weight=True,
)


In [6]:
model = OPTForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

In [7]:
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

In [8]:
def model_pre_hook(module, args, kwargs):
    # print(args, kwargs)
    display = {}
    for k, v in kwargs.items():
        if isinstance(v, torch.Tensor):
            display[k] = v.shape 
        elif k == 'past_key_values' and v is not None:
            display[k] = f'{len(v)} * ' + str((v[0][0].shape, v[0][1].shape))
        else:
            display[k] = v
    print(display)

    return args, kwargs

model._forward_pre_hooks.clear()
model.register_forward_pre_hook(model_pre_hook, with_kwargs=True)
model._forward_pre_hooks

# def layer_pre_hook(module, args, kwargs):
#     display = {}
#     for k, v in kwargs.items():
#         if isinstance(v, torch.Tensor):
#             display[k] = v.shape 
#         elif k in ['past_key_value', 'layer_past'] and v is not None:
#             display[k] = str((v[0].shape, v[1].shape))
#         else:
#             display[k] = v
#     print(display)

#     return args, kwargs

# layer = model.model.decoder.layers[2]

# layer._forward_pre_hooks.clear()
# layer.register_forward_pre_hook(layer_pre_hook, with_kwargs=True)
# layer._forward_pre_hooks


OrderedDict([(0, <function __main__.model_pre_hook(module, args, kwargs)>)])

In [12]:
prompts = [
    'Who are you? Are you conscious?',
    'Where is Deutschland?',
    'How is Huawei Mate 60 Pro?'
] * 4

prompt_len = 20

inputs = tokenizer(prompts, padding="max_length", max_length=prompt_len, return_tensors="pt")


# Generate
generate_ids = model.generate(
    inputs.input_ids, 
    pad_token_id=tokenizer.eos_token_id,
    max_length=30 + prompt_len,
    # num_beams=2,
    # num_beam_groups=2,
    # diversity_penalty=0.1,
    # do_sample=True,
)

output_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for output_text in output_texts:
    print(output_text)
    print('-' * 10)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


{'input_ids': torch.Size([12, 20]), 'past_key_values': None, 'use_cache': True, 'position_ids': torch.Size([12, 20]), 'attention_mask': torch.Size([12, 20]), 'token_type_ids': None, 'return_dict': True, 'output_attentions': False, 'output_hidden_states': False}
{'input_ids': torch.Size([12, 1]), 'past_key_values': '20 * (torch.Size([12, 16, 20, 64]), torch.Size([12, 16, 20, 64]))', 'use_cache': True, 'position_ids': torch.Size([12, 1]), 'attention_mask': torch.Size([12, 21]), 'token_type_ids': None, 'return_dict': True, 'output_attentions': False, 'output_hidden_states': False}
{'input_ids': torch.Size([12, 1]), 'past_key_values': '20 * (torch.Size([12, 16, 21, 64]), torch.Size([12, 16, 21, 64]))', 'use_cache': True, 'position_ids': torch.Size([12, 1]), 'attention_mask': torch.Size([12, 22]), 'token_type_ids': None, 'return_dict': True, 'output_attentions': False, 'output_hidden_states': False}
{'input_ids': torch.Size([12, 1]), 'past_key_values': '20 * (torch.Size([12, 16, 22, 64]), t