## Задание 1. (10 баллов)

Дообучите языковую модель на датасете инструкций, используя LoRA. Проверьте, что дообученная модель отличается от изначальной - сгенерируйте продолжения для одних и тех же промптов и сравните результаты.

Вы можете взять за основу код семинара PEFT, изменив датасет цитат на датасет инструкций (можно просто скопировать из семинара про General_instruct_fine-tuning). 
Можно использовать alpaca_dataset, датасет Dolly 2 или переведенный датасет (или все вместе). 
Важно использовать модель с большим количеством параметров (относительно семинара по General instruct fine-tuning). 
Размер модели должен быть как минимум 3 млрд параметров.  
**Нужно использовать модель, которую мы не разбирали на семинаре (OPT-2.7b, OPT-6.7b). Найдите новую модель на huggingface hub.**



In [None]:
!  pip install hqq

In [None]:
! huggingface-cli login

С моей GPU дефолтная квантизация в bitsandbytes не поддерживается, так что вместо неё используется квантизация через библиотеку HQQ.

In [1]:
import gc
import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.models.hf.llama import LlamaHQQ
from datasets import load_dataset
import torch, time
import numpy as np
from tqdm import tqdm

from hqq.core.quantize import *
from hqq.core.peft import PeftUtils
from hqq.core.quantize import *

In [2]:
# Model and runtime settings shared by the cells below.
model_id = 'meta-llama/Llama-2-7b-chat-hf'  # Hugging Face Hub checkpoint id
compute_dtype = torch.float32               # dtype passed to loading/quantization
device = 'cuda:0'                           # target device for the quantized model

In [3]:
def predict_for_instruction(instruct, text, model, tokenizer=None):
    """Generate three continuations for an instruction prompt.

    Args:
        instruct: Prompt template; ``instruct.format(text)`` is what gets
            tokenized, so a ``{}`` placeholder (if any) is filled with ``text``.
        text: Value substituted into ``instruct``.
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
            When omitted, the notebook-level ``tokenizer`` global is used,
            which keeps older three-argument calls working.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences
        (``num_return_sequences=3`` are requested), special tokens skipped.
    """
    if tokenizer is None:
        # Backward-compatible fallback: the notebook rebinds this global in
        # several cells, so passing the tokenizer explicitly is preferable.
        tokenizer = globals()["tokenizer"]

    inputs = tokenizer([instruct.format(text)],
                       return_tensors="pt").to(model.device)

    with torch.no_grad():
        with torch.autocast("cuda"):
            # These generation parameters matter for output quality; see the
            # generate() documentation before changing them.
            output_sequences = model.generate(
                num_beams=5,
                max_length=1024,
                # no_repeat_ngram_size=3,
                repetition_penalty=3.0,
                length_penalty=0.01,
                do_sample=True,
                temperature=1.5,
                # top_k=15,
                # top_p=0.8,
                early_stopping=True,
                num_return_sequences=3,
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )

    summaries = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
    return summaries

In [4]:
def cleanup():
    """Release Python garbage and then the cached CUDA memory.

    The garbage collector runs first so that tensors it frees are already
    back in the caching allocator when ``empty_cache`` is called.
    """
    gc.collect()
    torch.cuda.empty_cache()


def eval_wikitext2(model, tokenizer, max_length=1024, stride=512, verbose=True):
    """Measure sliding-window perplexity on the WikiText-2 (raw) test split.

    The whole test split is joined into one string, tokenized once and moved
    to ``'cuda'``. Windows of at most ``max_length`` tokens are scored every
    ``stride`` tokens; only the last ``trg_len`` tokens of each window carry
    labels, the rest is context (label -100).

    Side effects: puts ``model`` in eval mode and overwrites ``pad_token``,
    ``padding_side`` and ``add_eos_token`` on ``tokenizer``.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``.loss`` attribute.
        tokenizer: Tokenizer used to encode the corpus.
        max_length: Maximum window length in tokens.
        stride: Step between consecutive windows.
        verbose: Show a progress bar and print the results.

    Returns:
        ``{'perplexity': ..., 'prediction_time': ...}`` where the time is the
        mean wall-clock seconds per window.
    """
    model.eval()
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.add_eos_token = False

    wikitext = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encodings = tokenizer('\n\n'.join(wikitext['text']), return_tensors='pt')
    encodings['input_ids'] = encodings['input_ids'].to('cuda')

    total_tokens = encodings['input_ids'].size(1)
    window_losses = []
    window_seconds = []

    for offset in tqdm(range(0, total_tokens, stride), disable=not verbose):
        window_start = max(offset + stride - max_length, 0)
        window_end = min(offset + stride, total_tokens)
        scored_len = window_end - offset

        window_ids = encodings['input_ids'][:, window_start:window_end]
        window_labels = window_ids.clone()
        # Everything before the scored tail is context only.
        window_labels[:, :-scored_len] = -100

        started = time.time()
        with torch.no_grad(), torch.autocast("cuda"):
            scaled_loss = model(window_ids, labels=window_labels).loss * scored_len
        torch.cuda.synchronize()
        finished = time.time()

        window_seconds.append(finished - started)
        window_losses.append(scaled_loss)

        del window_ids, window_labels

    # Normalised by the end of the last window, as in the original code.
    ppl = np.round(float(torch.exp(torch.stack(window_losses).sum() / window_end)), 4)
    pred_time = np.round(np.mean(window_seconds), 3)
    if verbose:
        print('perplexity', ppl)
        print('time', str(pred_time) + '  sec')

    del encodings
    cleanup()

    return {'perplexity': ppl, 'prediction_time': pred_time}

In [18]:
# Load the full-precision checkpoint and its tokenizer.
# NOTE(review): presumably the weights stay on the CPU until quantization — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    cache_dir="./models",
)

# Quantize to 2 bits with groups of 8 and place the result on `device`.
quant_config = BaseQuantizeConfig(nbits=2, group_size=8, offload_meta=True)
model.quantize_model(
    quant_config=quant_config,
    compute_dtype=compute_dtype,
    device=device,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 32/32 [00:00<00:00, 1067.07it/s]
100%|██████████| 32/32 [00:56<00:00,  1.77s/it]


In [19]:
save_dir = "./models/quantized/llama-2-7b-2bit_8-group"

#Save the quantized model
model.save_quantized(save_dir=save_dir)

In [5]:
save_dir = "models/quantized/llama-2-7b-2bit_8-group"

tokenizer = AutoTokenizer.from_pretrained(model_id) 
model = LlamaHQQ.from_quantized(save_dir_or_hub=save_dir,)

100%|██████████| 32/32 [00:00<00:00, 769.83it/s]
100%|██████████| 32/32 [00:01<00:00, 26.48it/s]


In [7]:
instruct = "Write a recipe, how would you do an omlette."
text = ""


In [None]:
instruct = "

### 2 bit -- group 32 model

In [17]:
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["Write a recipe, how would you do an omlette.\nYou don't have to write a recipe, but I will give you the freedom to write as much or as little detail as you like. Let me know if you need further clarification.",
 "Write a recipe, how would you do an omlette.\nYou don't have to write a recipe, but I will give you the freedom to write as much or as little detail as you like. Let me know if you want to add anything else.",
 "Write a recipe, how would you do an omlette.\nYou don't have to write a recipe, but I will give you the freedom to write as much or as little detail as you like. Let me know if you need further clarification. 🙂"]

In [18]:
# quantized model ppl
eval_wikitext2(model, tokenizer, verbose=True) 

100%|██████████| 667/667 [2:33:11<00:00, 13.78s/it]  

perplexity 21.7443
time 13.779  sec





{'perplexity': 21.7443, 'prediction_time': 13.779}

### 2 bit -- group 8 model

In [7]:
instruct = "Write a recipe, how would you do an omlette."
text = ""


In [26]:
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Write a recipe, how would you do an omlette.\nIf I were to write a recipe for an omelette, it would look something like this:\nIngredients:\n\n* 2 eggs\n* 1/4 cup diced vegetables (such as bell peppers, onions, and mushrooms)\n* 1 tablespoon butter or oil\n* Salt and pepper to taste\n\nInstructions:\n\n1. Preheat a non-stick pan over medium heat.\n2. In a bowl, beat the eggs and set aside.\n3. Add the diced vegetables to the pan and cook until they are tender, about 5 minutes.\n4. Pour the beaten eggs over the vegetables in the pan and cook until the eggs are set, about 2-3 minutes.\n5. Use a spatula to gently fold the edges of the omelette towards the center, allowing the uncooked egg to flow to the edges.\n6. Cook until the eggs are fully cooked and the omelette is golden brown, about 1-2 minutes more.\n7. Slide the omelette onto a plate and serve hot.\n\nNote: You can customize this recipe by adding different ingredients such as cheese, herbs, or spices to suit your taste.',
 'Wri

In [27]:
instruct = "Write a recipe, how would you do an omlette for the dark lord."
text = ""


In [28]:
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["Write a recipe, how would you do an omlette for the dark lord.\nBy Dark Lord of Caerbannog, at 12:09 PM, February 07, 2008\nHaha, I love it! Here's a recipe for an omelette fit for the Dark Lord of Caerbannog:\n\nIngredients:\n\n* 2 eggs\n* 1/4 cup of dark, evil cheese (such as Gouda or Cheddar)\n* 1/4 cup of minced dark mushrooms (for added darkness)\n* 1/4 cup of chopped dark herbs (such as thyme or rosemary)\n* 1/4 teaspoon of dark spices (such as cumin or coriander)\n* Salt and pepper to taste\n\nInstructions:\n\n1. Preheat your oven to 350 degrees Fahrenheit.\n2. In a bowl, whisk together the eggs and dark, evil cheese until well combined.\n3. Add the minced dark mushrooms, chopped dark herbs, and dark spices to the bowl and mix well.\n4. Heat a non-stick pan over medium heat and add a small amount of oil.\n5. Pour the egg mixture into the pan and cook until the edges start to set, about 2-3 minutes.\n6. Use a spatula to carefully lift the edges of the omelette and allow the unc

In [29]:
# в принципе, в обучающем датасете Ламы были и русскоязычные примеры, так что можно проверить её способности
instruct = "Напиши рецепт карри в китайском стиле с морепродуктами."
text = ""


In [30]:
# пока выглядит не очень -- в особенности с переходом на латиницу 
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Напиши рецепт карри в китайском стиле с морепродуктами.\n\nКарри в китайском стиле можно приготовить из различных морепродуктов, таких как кальмары, осья, креветки и т.д. Вот рецепт классического карри в китайском стиле с использованием морепродуктов:\n\nИнгредиенты:\n\n* 500 г морепродуктов (кальмары, осья, креветки и т.д.)\n* 1/2 стакана растительного масла\n* 1/4 стакана тоfu\n* 1/4 стакана риса\n* 1/4 стакана сушеного мяса (optional)\n* 1 tablespoon soy sauce\n* 1 tablespoon hoisin sauce\n* 1 tablespoon rice vinegar\n* 1 tablespoon honey\n* 1 teaspoon sesame oil\n* Salt and pepper to taste\n\nInstructions:\n\n1. Clean and cut the seafood into bite-sized pieces.\n2. Heat the vegetable oil in a wok or large skillet over medium-high heat.\n3. Add the seafood and cook until it is lightly browned and crispy.\n4. Remove the seafood from the wok and set aside.\n5. In a small bowl, mix together the soy sauce, hoisin sauce, rice vinegar, honey, and sesame oil.\n6. Add the mixture to the w

In [61]:
# quantized model ppl
eval_wikitext2(model, tokenizer, verbose=True) 

100%|██████████| 667/667 [3:02:14<00:00, 16.39s/it]  


perplexity 7.7676
time 16.384  sec


{'perplexity': 7.7676, 'prediction_time': 16.384}

In [62]:
cleanup()

Not that bad -- 7.76 is alright for our task, let's stick to it

### 2 bit, group 8 model + Lora

In [5]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters require gradients.

    Counts elements over ``model.named_parameters()`` and prints the
    trainable count, the total count and the trainable percentage.
    """
    counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(size for size, _ in counts)
    trainable = sum(size for size, needs_grad in counts if needs_grad)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share}")


In [6]:
cache_path = './models'

# The quantized weights in `save_dir` were produced from the chat checkpoint,
# so the tokenizer is loaded from that same checkpoint for consistency.
model_id  = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Restore the previously saved 2-bit / group-size-8 model.
save_dir = "models/quantized/llama-2-7b-2bit_8-group"
model = LlamaHQQ.from_quantized(save_dir_or_hub=save_dir)

100%|██████████| 32/32 [00:00<00:00, 785.41it/s]
100%|██████████| 32/32 [00:01<00:00, 25.73it/s]


In [7]:
train_dtype = torch.float32

# Two adapter presets: one for the attention projections, one for the MLP projections.
base_lora_params = dict(lora_type='default', r=10, lora_alpha=10, dropout=0.05, train_dtype=train_dtype)
mlp_lora_params = dict(lora_type='default', r=6, lora_alpha=8, dropout=0.15, train_dtype=train_dtype)

# Map every targeted linear layer to its preset (attention first, then MLP).
lora_params = {
    **{f'self_attn.{proj}': base_lora_params for proj in ('q_proj', 'k_proj', 'v_proj', 'o_proj')},
    **{f'mlp.{proj}': mlp_lora_params for proj in ('gate_proj', 'up_proj', 'down_proj')},
}

# Attach the LoRA adapters to the quantized model.
PeftUtils.add_lora(model, lora_params)

100%|██████████| 32/32 [00:00<00:00, 127.15it/s]


In [8]:
print_trainable_parameters(model)

trainable params: 19185664 || all params: 1900597248 || trainable%: 1.0094544764909603


In [None]:
from datasets import load_dataset
from tqdm import tqdm
import transformers
import numpy as np 
import random
from typing import Sequence
from dataclasses import dataclass
import logging
import wandb

logger = logging.getLogger(__name__)
logger.setLevel("INFO")

tokenizer.pad_token     = tokenizer.eos_token 
tokenizer.padding_side  = "right" 

max_tokens = 256 

In [10]:
wandb.login()

wandb.init(
    name="2bit lora instruct"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mxenomirant[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [16]:
# Prompt templates (Russian): one for examples that carry an extra context
# ("input") field and one for instruction-only examples.
PROMPT_DICT = dict(
    prompt_input="### Инструкция:\n{instruction}\n\n### Контекст:\n{input}\n\n### Ответ: ",
    prompt_no_input="### Инструкция:\n{instruction}\n\n### Ответ: ",
)

In [14]:
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> dict:
    """Tokenize each string on its own, truncated to the global ``max_tokens``.

    Returns a dict with ``input_ids``/``labels`` (the same list of 1-D id
    tensors) and ``input_ids_lens``/``labels_lens`` (the same list of counts
    of tokens that differ from ``tokenizer.pad_token_id``).
    """
    encoded = []
    for text in strings:
        encoded.append(
            tokenizer(text, return_tensors="pt", max_length=max_tokens, truncation=True)
        )

    # The tokenizer returns a batch of one; keep only its single row.
    ids_per_text = [item.input_ids[0] for item in encoded]
    lens_per_text = [
        item.input_ids.ne(tokenizer.pad_token_id).sum().item() for item in encoded
    ]

    return {
        "input_ids": ids_per_text,
        "labels": ids_per_text,
        "input_ids_lens": lens_per_text,
        "labels_lens": lens_per_text,
    }

In [15]:
IGNORE_INDEX = -100  # label value used to exclude positions from the loss


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> dict:
    """Tokenize prompt+answer pairs and mask the prompt part of the labels.

    Args:
        sources: Prompt strings.
        targets: Answer strings, paired with ``sources`` by position.
        tokenizer: Tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        Dict with ``input_ids`` (token ids of ``source + target``) and
        ``labels`` (independent copies of those ids whose first
        ``source_len`` positions are set to ``IGNORE_INDEX``).
    """
    # Concatenate each prompt with its answer.
    examples = [s + t for s, t in zip(sources, targets)]
    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
    input_ids = examples_tokenized["input_ids"]
    # Labels start as independent copies of the inputs. The ids are tensors
    # (see `_tokenize_fn`), so `.clone()` replaces `copy.deepcopy` and removes
    # the dependency on a `copy` module the notebook never imports.
    labels = [ids.clone() for ids in input_ids]
    # Positions that belong to the prompt are excluded from the loss.
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)



In [16]:
class SupervisedDataset(torch.utils.data.Dataset):
    """Instruction-tuning dataset that yields pre-tokenized ids and labels.

    Each record must provide ``instruction`` and ``output``; a non-empty
    ``input`` field selects the prompt template that includes a context.
    """

    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data):
        super().__init__()
        logger.warning("Loading data...")
        records = data

        logger.warning("Formatting inputs...")
        template_with_context = PROMPT_DICT["prompt_input"]
        template_plain = PROMPT_DICT["prompt_no_input"]

        def render_prompt(record):
            # Records without an "input" field (or with an empty one) use the plain template.
            if record.get("input", "") != "":
                return template_with_context.format_map(record)
            return template_plain.format_map(record)

        sources = [render_prompt(record) for record in records]
        # Every answer is terminated with the tokenizer's EOS token.
        targets = [f"{record['output']}{tokenizer.eos_token}" for record in records]

        logger.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> dict[str, torch.Tensor]:
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Right-pads ``input_ids`` with the tokenizer's pad id and ``labels`` with
    ``IGNORE_INDEX``. The attention mask is derived from the true sequence
    lengths rather than from the pad id: this notebook sets the pad token to
    the EOS token, so an id-based mask would also hide the real EOS that
    terminates every answer.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[dict]) -> dict[str, torch.Tensor]:
        """Build one padded batch.

        Args:
            instances: Items with 1-D ``input_ids`` and ``labels`` tensors.

        Returns:
            Dict with ``input_ids``, ``labels`` and a boolean
            ``attention_mask``, each shaped (batch, longest sequence).
        """
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        # Record the real lengths before padding makes them indistinguishable.
        seq_lens = [len(ids) for ids in input_ids]
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        # Padded label positions are ignored by the loss.
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        # True for real tokens (including a trailing EOS), False for padding.
        positions = torch.arange(input_ids.size(1), device=input_ids.device)
        lengths = torch.tensor(seq_lens, device=input_ids.device)
        attention_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )



In [17]:
# Wrap the model to avoid accelerate issues
class WrappedModel(torch.nn.Module):
    """Thin ``nn.Module`` wrapper that delegates to the wrapped model.

    The overrides keep the standard ``nn.Module`` signatures: ``train`` takes
    ``mode``, ``train``/``eval`` return ``self`` and ``parameters`` takes
    ``recurse``, so generic training code can call them the usual way.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        """Forward all arguments to the wrapped model's ``forward``."""
        return self.model.forward(*args, **kwargs)

    def train(self, mode: bool = True):
        """Set training mode on the wrapper and the wrapped model."""
        self.training = mode
        self.model.train(mode)
        return self

    def eval(self):
        """Set evaluation mode on the wrapper and the wrapped model."""
        self.training = False
        self.model.eval()
        return self

    def parameters(self, recurse: bool = True):
        """Return the wrapped model's parameters."""
        return self.model.parameters(recurse=recurse)

In [18]:
data = load_dataset("IlyaGusev/ru_turbo_alpaca")["train"]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [19]:
train_dataset = SupervisedDataset(tokenizer=tokenizer, data=data)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

Loading data...
Formatting inputs...
Tokenizing inputs... This may take some time...


In [20]:
from trl import SFTTrainer

# Optimisation hyper-parameters
grad_acc   = 2      # gradient accumulation steps
logging_st = 20     # log the loss every N steps
lr         = 3e-4
batch_size = 1
n_epochs   = 2

training_args = transformers.TrainingArguments(
    output_dir='./models/2bit llama instruct/',
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_acc,
    learning_rate=lr,
    num_train_epochs=n_epochs,
    remove_unused_columns=False,
    logging_strategy="steps",
    logging_steps=logging_st,
    optim="adafactor",
    max_grad_norm=2.0,
    save_steps=10000,
    lr_scheduler_type="cosine",
)

# The model is wrapped (see WrappedModel) before being handed to the trainer.
trainer = SFTTrainer(
    model=WrappedModel(model),
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=None,
    args=training_args,
    data_collator=data_collator,
    max_seq_length=max_tokens,
    packing=True
)

# No DataLoaderConfiguration object is built here: that name is never imported
# in this notebook and the resulting value was not passed to anything.


In [21]:
model.train()
with torch.autocast("cuda"):
    trainer.train()

Step,Training Loss
20,1.6864
40,1.7955
60,1.7853
80,1.6652
100,2.0222
120,1.6006
140,1.5562
160,1.7582
180,1.7106
200,1.6936


Exception in thread Thread-9:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/local/.local/lib/python3.10/site-packages/tensorboard/summary/writer/event_file_writer.py", line 233, in run
    self._record_writer.write(data)
  File "/home/local/.local/lib/python3.10/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write
    self._writer.write(header + header_crc + data + footer_crc)
  File "/home/local/.local/lib/python3.10/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 766, in write
    self.fs.append(self.filename, file_content, self.binary_mode)
  File "/home/local/.local/lib/python3.10/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 160, in append
    self._write(filename, file_content, "ab" if binary_mode else "a")
  File "/home/local/.local/lib/python3.10/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 164, in 

In [22]:
PeftUtils.save_lora_weights(model, filename="./models/lora/instruct_full", base_class=LlamaHQQ)

100%|██████████| 32/32 [00:00<00:00, 6580.59it/s]


In [23]:
model.eval()

#Convert lora weights to the same model dtype for faster inference
PeftUtils.cast_lora_weights(model, dtype=torch.half)


100%|██████████| 32/32 [00:00<00:00, 943.02it/s]


In [24]:
PeftUtils.save_lora_weights(model, filename="./models/lora/instruct_half", base_class=LlamaHQQ)

100%|██████████| 32/32 [00:00<00:00, 6909.89it/s]


In [25]:
cleanup()

### Eval

In [6]:
cache_path = './models'

# The quantized weights in `save_dir` were produced from the chat checkpoint,
# so the tokenizer is loaded from that same checkpoint for consistency.
model_id  = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Start evaluation from a fresh copy of the saved 2-bit / group-size-8 model.
save_dir = "models/quantized/llama-2-7b-2bit_8-group"
model = LlamaHQQ.from_quantized(save_dir_or_hub=save_dir)

100%|██████████| 32/32 [00:00<00:00, 756.25it/s]
100%|██████████| 32/32 [00:01<00:00, 26.33it/s]


In [7]:
PeftUtils.load_lora_weights(model, filename="./models/lora/instruct_full",)

100%|██████████| 32/32 [00:00<00:00, 123.58it/s]
100%|██████████| 32/32 [00:00<00:00, 768.95it/s]


In [23]:
instruct = "Write a recipe, how would you do an omlette."
text = ""


In [24]:
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Write a recipe, how would you do an omlette.\n',
 'Write a recipe, how would you do an omlette.\n',
 'Write a recipe, how would you do an omlette.\n']

In [25]:
instruct = "Write a recipe, how would you do an omlette for the dark lord."
text = ""


In [26]:
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Write a recipe, how would you do an omlette for the dark lord.\n',
 'Write a recipe, how would you do an omlette for the dark lord.\n',
 'Write a recipe, how would you do an omlette for the dark lord.\n']

In [8]:
# в принципе, в обучающем датасете Ламы были и русскоязычные примеры, так что можно проверить её способности
instruct = "Напиши рецепт карри в китайском стиле с морепродуктами."
text = ""


In [22]:
# пока выглядит не очень -- в особенности с переходом на латиницу 
predict_for_instruction(instruct, text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Напиши рецепт карри в китайском стиле с морепродуктами.\n\n\n\n\n',
 'Напиши рецепт карри в китайском стиле с морепродуктами.\n\n\n\n\n',
 'Напиши рецепт карри в китайском стиле с морепродуктами.\n\n\n\n\n']

In [21]:
# пока выглядит не очень -- в особенности с переходом на латиницу 
predict_for_instruction(PROMPT_DICT["prompt_no_input"].format(instruction=instruct), text, model)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['### Инструкция:\nНапиши рецепт карри в китайском стиле с морепродуктами.\n\n### Ответ: \n\n the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 '### Инструкция:\nНапиши рецепт карри в китайском стиле с морепродуктами.\n\n### Ответ: \n\n the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 '### Инструкция:\nНапиши рецепт карри в китайском стиле с морепродуктами.\n\n### Ответ: \n\n the\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']

In [None]:
# LORA model ppl
eval_wikitext2(model, tokenizer, verbose=True) 

### Да, получилось плоховато -- модель скорее развалилась, чем доучилась на новую задачу. С другой стороны, мест, где что-то могло пойти не так, тут довольно много. Во всяком случае, это было довольно интересно.

### Playing with weight matrix visualization (not connected to the task)

In [38]:
meta = model.model.layers[1].mlp.gate_proj.meta

In [45]:
unpack_1bit_u8(model.model.layers[1].mlp.gate_proj.W_q).reshape(meta["shape"])

tensor([[1, 0, 1,  ..., 0, 1, 1],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 0, 1,  ..., 1, 0, 0],
        ...,
        [0, 1, 0,  ..., 0, 1, 1],
        [1, 0, 0,  ..., 1, 1, 1],
        [1, 0, 1,  ..., 0, 0, 1]], device='cuda:0', dtype=torch.uint8)

In [63]:
def pack_2bit_u8(W_q: torch.Tensor) -> torch.Tensor:  # uint8 > uint8/4
    """Pack four 2-bit values into each uint8 element along dimension 0.

    The input is cast to uint8 and split into four equal chunks along the
    first dimension; chunk ``k`` occupies bits ``7-2k .. 6-2k`` of the result.

    Args:
        W_q: Tensor of 2-bit values whose first dimension is a multiple of 4.

    Returns:
        A uint8 tensor with a quarter as many rows as ``W_q``.

    Raises:
        ValueError: If the first dimension is not divisible by 4 (the chunks
            would have unequal sizes and could silently broadcast).
    """
    W_q = W_q.to(torch.uint8)
    if len(W_q) % 4 != 0:
        raise ValueError(
            f"pack_2bit_u8 expects a row count divisible by 4, got {len(W_q)}"
        )
    _step = len(W_q) // 4

    return (
        W_q[:_step] << 6
        | W_q[_step : 2 * _step] << 4
        | W_q[2 * _step : 3 * _step] << 2
        | W_q[3 * _step :]
    )

In [66]:
def pack_1bit_u8(W_q: Tensor) -> Tensor:
    """Pack eight 1-bit values into each uint8 element along dimension 0.

    The input is cast to uint8 and cut into eight consecutive chunks of
    ``int(len(W_q) / 8)`` rows; chunk ``k`` lands in bit ``7 - k`` of the result.
    """
    bits = W_q.to(uint8)
    chunk = int(len(bits) / 8)

    # The first chunk fills the most significant bit; the rest are OR-ed in below it.
    packed = bits[:chunk] << 7
    for k in range(1, 8):
        packed = packed | (bits[k * chunk : (k + 1) * chunk] << (7 - k))
    return packed

In [20]:
def unpack_1bit_u8(W_q: Tensor, dtype=uint8) -> Tensor:
    """Expand each packed element of a 2-D tensor into eight 1-bit values.

    Bit ``7 - k`` of every element is written to the ``k``-th block of
    ``W_q.shape[0]`` rows in the output, which therefore has eight times as
    many rows as the input and lives on the same device.
    """
    rows = W_q.shape[0]
    unpacked = torch.empty([8 * rows, W_q.shape[1]], dtype=dtype, device=W_q.device)

    for k in range(8):
        shift = 7 - k
        # Isolate a single bit, then move it down to the least significant position.
        unpacked[k * rows : (k + 1) * rows] = (W_q & (1 << shift)) >> shift

    return unpacked

In [68]:
list(model.model.layers[0].self_attn.q_proj.parameters())[0].shape

torch.Size([16, 262144])

In [None]:
def compute_perplexity_batched(model, tokenizer, predictions, encodings=None, batch_size=1, add_start_token=True, device='cuda', max_length=None):
    """Compute the mean per-sequence perplexity of ``predictions`` under ``model``.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning
            an object with ``.logits``.
        tokenizer: Tokenizer used to encode ``predictions``; may receive a pad
            token as a side effect (see first branch below).
        predictions: Texts to score; ignored when ``encodings`` is given.
        encodings: Optional pre-tokenized batch with ``input_ids`` and
            ``attention_mask``. It is used as-is and is not moved to
            ``device`` here.
        batch_size: Number of sequences scored per forward pass.
        add_start_token: Prepend the tokenizer's BOS id to every sequence.
        device: Device for the tokenized inputs and the BOS column.
        max_length: Optional truncation length (including the BOS slot when
            ``add_start_token`` is true).

    Returns:
        ``np.mean`` over the list of per-sequence perplexities.

    Raises:
        AssertionError: If a pad/BOS token is required but unavailable, or an
            input is too short to be scored.
    """
    # A pad token is only set up here when batching more than one sequence.
    if tokenizer.pad_token is None and batch_size > 1:
        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
        # check that the model already has at least one special token defined
        assert (len(existing_special_tokens) > 0), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
        # assign one of the special tokens to also be the pad token
        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

    if add_start_token and max_length:
        # leave room for <BOS> token to be added:
        assert (tokenizer.bos_token is not None), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
        max_tokenized_len = max_length - 1
    else:
        max_tokenized_len = max_length


    # Tokenize all texts in one padded batch unless the caller supplied encodings.
    if(encodings is None):
        encodings = tokenizer(
            predictions,
            add_special_tokens=False,
            padding=True,
            truncation=True if max_tokenized_len else False,
            max_length=max_tokenized_len,
            return_tensors="pt",
            return_attention_mask=True).to(device)

    encoded_texts = encodings["input_ids"]
    attn_masks    = encodings["attention_mask"]

    # check that each input is long enough:
    if add_start_token:
        assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
    else:
        assert torch.all(
            torch.ge(attn_masks.sum(1), 2)
        ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

    ppls = []
    # Per-token losses are kept (no reduction) so padding can be masked out below.
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

    for start_index in tqdm(range(0, len(encoded_texts), batch_size)):
        end_index     = min(start_index + batch_size, len(encoded_texts))
        encoded_batch = encoded_texts[start_index:end_index]
        attn_mask     = attn_masks[start_index:end_index]

        if add_start_token:
            # Prepend a BOS column and extend the mask with ones to cover it.
            bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
            encoded_batch     = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
            attn_mask         = torch.cat([torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1)

        labels = encoded_batch

        with torch.no_grad():
            out_logits = model(encoded_batch, attention_mask=attn_mask).logits

        # Shift by one so the logits at position t are scored against token t+1.
        shift_logits = out_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

        # exp(mean masked token loss) per sequence; masked positions contribute zero.
        perplexity_batch = torch.exp(
            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
            / shift_attention_mask_batch.sum(1))

        ppls += perplexity_batch.tolist()

    return np.mean(ppls)

# NOTE(review): `dataset_val` is not defined anywhere in the visible part of this notebook — confirm it is created before this cell is run.
print('perplexity', compute_perplexity_batched(model=model, tokenizer=tokenizer, predictions=[s['text'] for s in dataset_val], batch_size=1, max_length=max_tokens))