In [1]:
import numpy as np
import torch
import random
import datasets
import transformers

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    LlamaTokenizerFast,
    Trainer,
    DataCollatorForSeq2Seq,
    TrainingArguments
)
    
def set_seed(seed):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
torch.randn((1, 2), device=)

tensor([[-0.6972,  0.1732]])

In [2]:
from optimum.onnxruntime import ORTModelForCausalLM

In [2]:
def get_wikitext2(tokenizer):
    testdata = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')       
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
    return testenc

In [4]:
# def get_wikitext2(nsamples, seed, seqlen, tokenizer):
#     traindata = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
#     testdata = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
        
#     trainenc = tokenizer("\n\n".join(traindata['text']))
#     testenc = tokenizer("\n\n".join(testdata['text']))

#     random.seed(seed)
#     trainloader = []
#     for _ in range(nsamples):
#         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
#         j = i + seqlen
#         inp = trainenc.input_ids[:, i:j]
#         tar = inp.clone()
#         tar[:, :-1] = -100
#         trainloader.append((inp, tar))
#     return trainloader, testenc

In [5]:
@torch.no_grad()
def llama_eval(model, testenc, dev):
    '''
    Evaluating LLAMA-2 models on the test set.
    '''
    
    print('Evaluating ...')
    testenc = testenc.input_ids
    nsamples = testenc.numel() // model.seqlen

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {'i': 0, 'attention_mask': None}

    class Catcher(torch.nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module
        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            cache['position_ids'] = kwargs['position_ids']
            raise ValueError
    layers[0] = Catcher(layers[0])
    for i in range(nsamples):
        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache['position_ids']

    for i in range(len(layers)):
        if i == 0:
            print('Layers: 0', end='', flush=True)
        else:
            print(f', {i}', end='', flush=True)
        layer = layers[i].to(dev)

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        inps, outs = outs, inps

    if model.model.norm is not None:
        model.model.norm = model.model.norm.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.norm is not None:
            hidden_states = model.model.norm(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[
            :, (i * model.seqlen):((i + 1) * model.seqlen)
        ][:, 1:]
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * model.seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
    

    model.config.use_cache = use_cache
    
    return ppl.item()

In [3]:
# model_path = "/home/onnx_model/llama_3bit_128fp_after_fp_train"
# model_path = "/home/onnx_model/llama_3bit_128fp_after_fp_train_fp16"
# model_path = "/home/LLM_compression/QUIK/weights/llama_3bit_128fp_after_fp_train_quant_weight/checkpoint-600"
model_path = "/home/LLaMA/huggingface/Llama-2-7b-hf"
# model_seqlen = 2048
# model_seqlen = 12

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map='auto'
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
model_path = '/home/onnx_model/llama_3bit_128fp_after_fp_train'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, use_fast=False)

data_nsamples = 128
data_seed = 11
model_seqlen = 2048

testloader = get_wikitext2(tokenizer)

In [11]:
dev = model.device
# testenc = testenc.input_ids
testenc = testloader
nsamples = testenc.input_ids.numel() // model_seqlen

In [12]:
nlls = []
batch = {'input_ids': None, 'attention_mask': None, 'position_ids': None}
for i in range(nsamples):
   batch['input_ids'] = testenc.input_ids[:, (i * model_seqlen):((i + 1) * model_seqlen)].to(dev)
   batch['attention_mask'] = testenc.attention_mask[:, (i * model_seqlen):((i + 1) * model_seqlen)].to(dev)
   batch['position_ids'] = torch.arange(model_seqlen).unsqueeze(0).to(dev)

   lm_logits = model(**batch)

   shift_logits = lm_logits.logits[:, :-1, :].contiguous()
   shift_labels = batch['input_ids'][:, 1:]
   
   loss_fct = torch.nn.CrossEntropyLoss()
   loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

   neg_log_likelihood = loss.float() * model_seqlen
   nlls.append(neg_log_likelihood)

OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacty of 23.64 GiB of which 18.50 MiB is free. Process 275869 has 23.62 GiB memory in use. Of the allocated memory 22.92 GiB is allocated by PyTorch, and 517.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model_seqlen))

In [None]:
ppl

In [14]:
# ppl

tensor(6.0623)

In [8]:
model = ORTModelForCausalLM.from_pretrained(
    model_path,
    use_cache=False,
    use_io_binding=False,
    cache_dir=None,
    device=device
)

In [72]:
model.__dict__

{'model': <onnxruntime.capi.onnxruntime_inference_collection.InferenceSession at 0x7fbe85883b80>,
 'config': LlamaConfig {
   "QuantizedLinear": {
     "is_quant_weight": true,
     "outlier_ids": {
       "0": {
         "mlp.down_proj": [
           10120,
           7430,
           3001,
           5006,
           295,
           7391,
           5084,
           4084,
           5461,
           2080,
           6851,
           4880,
           6706,
           5667,
           1704,
           2224,
           999,
           2177,
           6948,
           5393,
           8739,
           9381,
           10130,
           8217,
           7882,
           4555,
           7275,
           1522,
           4466,
           1355,
           10397,
           5034,
           7370,
           3359,
           949,
           4801,
           7940,
           3938,
           4124,
           870,
           7249,
           10206,
           4701,
           3766,
           

In [51]:
lm_logits = model(**batch)
lm_logits

CausalLMOutputWithPast(loss=None, logits=tensor([[[-4.7659,  0.5058,  4.2834,  ..., -1.5337, -3.2108,  2.5124],
         [ 0.6294,  6.2683, 13.7718,  ...,  1.8475,  7.7480, -0.8671],
         [-4.3826, -2.5058,  4.4802,  ..., -3.2572, -1.8184, -3.8962],
         ...,
         [-6.8862, -9.7180,  6.2615,  ..., -5.1282, -1.7487,  1.4506],
         [-3.8780, -6.7985,  8.7638,  ..., -0.2336,  1.3399, -1.6005],
         [-2.9872, -4.1416, 12.3725,  ..., -0.4952, -0.8791,  2.0194]]]), past_key_values=None, hidden_states=None, attentions=None)

In [53]:
shift_logits.view(-1, shift_logits.size(-1))

tensor([[-4.7659,  0.5058,  4.2834,  ..., -1.5337, -3.2108,  2.5124],
        [ 0.6294,  6.2683, 13.7718,  ...,  1.8475,  7.7480, -0.8671],
        [-4.3826, -2.5058,  4.4802,  ..., -3.2572, -1.8184, -3.8962],
        ...,
        [-4.2609, -6.9576,  8.2737,  ..., -2.5349, -1.4061,  1.4632],
        [-6.8862, -9.7180,  6.2615,  ..., -5.1282, -1.7487,  1.4506],
        [-3.8780, -6.7985,  8.7638,  ..., -0.2336,  1.3399, -1.6005]])

In [54]:
shift_labels.view(-1)

tensor([29871,    13,    13,  ...,   278, 14209,   297], device='cuda:0')

In [57]:
shift_labels

tensor([[29871,    13,    13,  ...,   278, 14209,   297]], device='cuda:0')

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward)

In [44]:
neg_log_likelihood

tensor(3039.9438)

In [None]:
lm_logits = model.lm_head(hidden_states)
shift_logits = lm_logits[:, :-1, :].contiguous()
shift_labels = testenc[
    :, (i * model.seqlen):((i + 1) * model.seqlen)
][:, 1:]
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
neg_log_likelihood = loss.float() * model.seqlen
nlls.append(neg_log_likelihood)

In [12]:
layers = model.model.layers

model.model.embed_tokens = model.model.embed_tokens
layers[0] = layers[0]

dtype = next(iter(model.parameters())).dtype
inps = torch.zeros(
    (nsamples, model_seqlen, model.config.hidden_size), dtype=dtype
)
cache = {'i': 0, 'attention_mask': None}

In [13]:
class Catcher(torch.nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module
    def forward(self, inp, **kwargs):
        inps[cache['i']] = inp
        cache['i'] += 1
        cache['attention_mask'] = kwargs['attention_mask']
        cache['position_ids'] = kwargs['position_ids']
        raise ValueError
layers[0] = Catcher(layers[0])

for i in range(nsamples):
    batch = testenc[:, (i * model_seqlen):((i + 1) * model_seqlen)].to(dev)
    try:
        model(batch)
    except ValueError:
        pass
layers[0] = layers[0].module

In [11]:
model.config.hidden_size

4096

In [18]:
attention_mask = cache['attention_mask']
position_ids = cache['position_ids']

KeyError: 'position_ids'

In [17]:
for j in range(nsamples):
    outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
layers[i] = layer.cpu()

NameError: name 'nsamples' is not defined

In [26]:
model_path = "/home/onnx_model/llama_3bit_128fp_after_fp_train"
    
model = ORTModelForCausalLM.from_pretrained(
    model_path,
    use_cache=False,
    use_io_binding=False,
    cache_dir=None,
    device='cuda'
)

In [30]:
torch.arange(5)

tensor([[0, 1, 2, 3, 4]])

In [32]:
batch

{'input_ids': tensor([[    1, 29871,    13,    13,   353,  4755,   350,  5059,   357,   353,
          29871,    13]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'position_ids': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]])}

In [33]:
model(**batch)

CausalLMOutputWithPast(loss=None, logits=tensor([[[ -4.7659,   0.5058,   4.2834,  ...,  -1.5337,  -3.2108,   2.5124],
         [  0.6294,   6.2683,  13.7718,  ...,   1.8475,   7.7480,  -0.8671],
         [ -4.3826,  -2.5058,   4.4802,  ...,  -3.2572,  -1.8184,  -3.8962],
         ...,
         [ -2.2191,  -3.5506,  14.8349,  ...,  -1.7047,   2.7506,  -2.7658],
         [ -0.7176,  -2.3732,  12.4163,  ...,   1.1010,   7.0395,  -1.6361],
         [-11.9019, -12.4825,   4.8030,  ...,  -7.4499,  -0.8307,  -2.5364]]]), past_key_values=None, hidden_states=None, attentions=None)

In [95]:
len(testloader.input_ids)

341469

In [93]:
for item in testloader:
    print(item)

input_ids
attention_mask


In [56]:
testenc = testloader

In [57]:
testenc

{'input_ids': tensor([[    1, 29871,    13,  ...,    13,    13,    13]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [26]:
testenc = testenc.input_ids
nsamples = testenc.numel() // model_seqlen

In [27]:
i = 0
batch = testenc[:, (i * model_seqlen):((i + 1) * model_seqlen)]

In [31]:
from transformers import pipeline

In [None]:
generator = pipeline(model="openai-community/gpt2")

In [None]:
oracle = pipeline(
    "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="google-bert/bert-base-cased"
)

In [38]:
testloader

{'input_ids': tensor([[    1, 29871,    13,  ...,    13,    13,    13]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [102]:
valdata = datasets.load_dataset(
    'allenai/c4', 'en', split='validation'
)

Downloading readme: 100%|██████████| 41.1k/41.1k [00:00<00:00, 19.3MB/s]
Downloading data: 100%|██████████| 319M/319M [00:33<00:00, 9.55MB/s] 
Downloading data: 100%|██████████| 318M/318M [00:21<00:00, 14.7MB/s] 
Downloading data: 100%|██████████| 320M/320M [00:24<00:00, 13.0MB/s] 
Downloading data: 100%|██████████| 319M/319M [00:21<00:00, 14.5MB/s] 
Downloading data: 100%|██████████| 319M/319M [00:23<00:00, 13.3MB/s] 
Downloading data: 100%|██████████| 318M/318M [00:24<00:00, 13.1MB/s] 
Downloading data: 100%|██████████| 318M/318M [00:27<00:00, 11.4MB/s] 
Downloading data: 100%|██████████| 318M/318M [00:23<00:00, 13.8MB/s] 
Downloading data: 100%|██████████| 318M/318M [00:32<00:00, 9.76MB/s] 
Downloading data: 100%|██████████| 318M/318M [00:26<00:00, 12.2MB/s] 
Downloading data:  67%|██████▋   | 214M/319M [00:15<00:06, 17.2MB/s] 

In [58]:
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")

In [59]:
inputs

{'input_ids': tensor([[    1,  1724,   626,   306,   773, 29973,     1,  5293,  6652,   309,
         13635, 29911,   411,  6732, 29940, 29990, 24875, 29991]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [73]:
!optimum-cli export onnx --help

usage: optimum-cli export onnx [-h] -m MODEL [--task TASK] [--opset OPSET]
                               [--device DEVICE] [--fp16]
                               [--dtype {fp32,fp16,bf16}]
                               [--optimize {O1,O2,O3,O4}] [--monolith]
                               [--no-post-process] [--variant VARIANT]
                               [--framework {pt,tf}] [--atol ATOL]
                               [--cache_dir CACHE_DIR] [--trust-remote-code]
                               [--pad_token_id PAD_TOKEN_ID]
                               [--library-name {transformers,diffusers,timm,sentence_transformers}]
                               [--model-kwargs MODEL_KWARGS] [--legacy]
                               [--no-dynamic-axes] [--no-constant-folding]
                               [--batch_size BATCH_SIZE]
                               [--sequence_length SEQUENCE_LENGTH]
                               [--num_choices NUM_CHOICES] [--width WIDTH]
                

In [None]:
optimum-cli export onnx --model /home/LLM_compression/QUIK/weights/llama_3bit_128fp_after_fp_train_quant_weight/checkpoint-600 --device 'cuda' --dtype fp16 --task 'default' /home/onnx_model/llama_3bit_128fp_after_fp_train_bf16

In [64]:
model.forward(**inputs)

ValueError: position_ids was not passed but is a required input for this ONNX model.

In [60]:
gen_tokens = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.9,
    max_length=4096,
    top_p=0.9
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [96]:
model

<optimum.onnxruntime.modeling_decoder.ORTModelForCausalLM at 0x7efa430f2d10>

In [61]:
gen_tokens

tensor([[    1,  1724,   626,   306,   773, 29973,     1,  5293,  6652,   309,
         13635, 29911,   411,  6732, 29940, 29990, 24875, 29991,     2]])

In [None]:
gen_tokens = model.generate(**inputs,do_sample=True,temperature=0.9, min_length=20,max_length=20)
tokenizer.batch_decode(gen_tokens)