In [11]:
from argparse import ArgumentParser
import logging
import math
import os
import random
import shutil
from pathlib import Path

from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

import datasets
import torch
import torch.nn as nn
import torch.nn.functional as F

from functools import partial
from accelerate import Accelerator
from accelerate.checkpointing import save_accelerator_state
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# import sys 
# sys.path.append("/home/LLM_compression/transformers_modified/src")

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    LlamaTokenizerFast,
    Trainer,
    DataCollatorForSeq2Seq,
    TrainingArguments
)
from peft import (
    get_peft_model,
    TaskType,
    LoraConfig
)



# from quant_utils import get_fp_llama, make_layer_bits, prepare_llama_quant

IGNORE_INDEX = -100


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=False,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    token: str = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    max_memory: int = field(
        default=21,
        metadata={"help": "Free memory per gpu."}
    )
    lora_init: bool = field(
        default=False,
        metadata={"help": "True: Use zero and gaussian initialization; False: Load adapters from LoftQ in HF hub."},
    )
    rank: int = field(
        default=64,
        metadata={"help": "Rank of LoRA adapters. LoftQ does not require this config. Used for fp16 LoRA or QLoRA."},
    )
    lora_alpha: int = field(
        default=16,
        metadata={"help": "LoftQ does not require this config. Used for QLoRA."},
    )
    quant_noise_config: dict = field(
        default=None,
        metadata={"help": "Parameters to add noise"},
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom dataset defined on the Hub in their own modeling files. This option"
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    dataset_percentage: Optional[int] = field(
        default=100,
        metadata={
            "help": "The percentage of the dataset used for computation"
        },  
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )


def encode_with_prompt_completion_format(example, tokenizer, max_seq_length):
    '''
    Here we assume each example has 'prompt' and 'completion' fields.
    We concatenate prompt and completion and tokenize them together because otherwise prompt will be padded/trancated 
    and it doesn't make sense to follow directly with the completion.
    '''
    # if prompt doesn't end with space and completion doesn't start with space, add space
    if not example['prompt'].endswith((' ', '\n', '\t')) and not example['completion'].startswith((' ', '\n', '\t')):
        example_text = example['prompt'] + ' ' + example['completion']
    else:
        example_text = example['prompt'] + example['completion']
    example_text = example_text + tokenizer.eos_token
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()
    tokenized_prompt = tokenizer(example['prompt'], return_tensors='pt', max_length=max_seq_length, truncation=True)
    # mask the prompt part for avoiding loss
    labels[:, :tokenized_prompt.input_ids.shape[1]] = -100
    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

def encode_with_messages_format(example, tokenizer, max_seq_length):
    '''
    Here we assume each example has a 'messages' field Each message is a dict with 'role' and 'content' fields.
    We concatenate all messages with the roles as delimiters and tokenize them together.
    '''
    messages = example['messages']
    if len(messages) == 0:
        raise ValueError('messages field is empty.')
    
    def _concat_messages(messages):
        message_text = ""
        for message in messages:
            if message["role"] == "system":
                message_text += "<|system|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "user":
                message_text += "<|user|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "assistant":
                message_text += "<|assistant|>\n" + message["content"].strip() + tokenizer.eos_token + "\n"
            else:
                raise ValueError("Invalid role: {}".format(message["role"]))
        return message_text
        
    example_text = _concat_messages(messages).strip()
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()

    # mask the non-assistant part for avoiding loss
    for message_idx, message in enumerate(messages):
        if message["role"] != "assistant":
            if message_idx == 0:
                message_start_idx = 0
            else:
                message_start_idx = tokenizer(
                    _concat_messages(messages[:message_idx]), return_tensors='pt', max_length=max_seq_length, truncation=True
                ).input_ids.shape[1]
            if message_idx < len(messages) - 1 and messages[message_idx+1]["role"] == "assistant":
                # here we also ignore the role of the assistant
                messages_so_far = _concat_messages(messages[:message_idx+1]) + "<|assistant|>\n"
            else:
                messages_so_far = _concat_messages(messages[:message_idx+1])
            message_end_idx = tokenizer(
                messages_so_far,
                return_tensors='pt', 
                max_length=max_seq_length, 
                truncation=True
            ).input_ids.shape[1]
            labels[:, message_start_idx:message_end_idx] = -100
            
            if message_end_idx >= max_seq_length:
                break

    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

def load_hf_datasets(
    data_args
):
    # Load the dataset
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            streaming=data_args.streaming,
            trust_remote_code=data_args.trust_remote_code
        )

        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                streaming=data_args.streaming,
                trust_remote_code=data_args.trust_remote_code
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                streaming=data_args.streaming,
                trust_remote_code=data_args.trust_remote_code
            )
        
        if data_args.dataset_percentage < 100:
            dataset_frac = data_args.dataset_percentage/100
            dataset_parts = raw_datasets['train'].train_test_split(train_size=dataset_frac)
            raw_datasets['train'] = dataset_parts['train']
            dataset_parts = raw_datasets['validation'].train_test_split(test_size=dataset_frac)
            raw_datasets['validation'] = dataset_parts['test']

        return raw_datasets

def read_config(conf_path, func_name: str):
    if isinstance(conf_path, str):
        conf_path = Path(conf_path)

    source = conf_path.read_text()
    bytecode = compile(source, conf_path.as_posix(), "exec")
    namespace = {
        "__file__": conf_path.as_posix(),
    }
    exec(bytecode, namespace)
    return namespace[func_name]()  # type: ignore

In [1]:
2 ** 15

32768

In [9]:
8 * 4096

32768

0.00732421875

In [13]:
16/32

0.5

In [12]:
path_to_scale = "/home/LLaMA/huggingface/act_scales/tulu-2-7b-hf.pt"
act_scale = torch.load(path_to_scale)

path_to_scale = "/home/LLaMA/huggingface/act_scales/tulu-2-7b_activation_estimator.pt"
weight_scale = torch.load(path_to_scale)

In [13]:
act_scale

{'model.layers.0.self_attn.q_proj': tensor([0.1214, 0.0434, 0.0055,  ..., 0.0472, 0.0431, 0.0235]),
 'model.layers.0.self_attn.k_proj': tensor([0.1214, 0.0434, 0.0055,  ..., 0.0472, 0.0431, 0.0235]),
 'model.layers.0.self_attn.v_proj': tensor([0.1214, 0.0434, 0.0055,  ..., 0.0472, 0.0431, 0.0235]),
 'model.layers.0.self_attn.o_proj': tensor([0.0196, 0.0320, 0.0198,  ..., 0.0170, 0.0149, 0.0141]),
 'model.layers.0.mlp.gate_proj': tensor([0.1901, 0.1752, 0.1669,  ..., 0.1907, 0.1881, 0.1860]),
 'model.layers.0.mlp.up_proj': tensor([0.1901, 0.1752, 0.1669,  ..., 0.1907, 0.1881, 0.1860]),
 'model.layers.0.mlp.down_proj': tensor([0.1418, 0.0780, 0.2042,  ..., 0.2161, 0.0810, 0.4729]),
 'model.layers.1.self_attn.q_proj': tensor([0.3889, 0.3762, 0.3972,  ..., 0.2366, 0.3108, 0.2749]),
 'model.layers.1.self_attn.k_proj': tensor([0.3889, 0.3762, 0.3972,  ..., 0.2366, 0.3108, 0.2749]),
 'model.layers.1.self_attn.v_proj': tensor([0.3889, 0.3762, 0.3972,  ..., 0.2366, 0.3108, 0.2749]),
 'model.lay

In [14]:
weight_scale

{'model.layers.0.self_attn.k_proj': tensor([0.1216, 0.0435, 0.0055,  ..., 0.0471, 0.0430, 0.0234]),
 'model.layers.0.self_attn.v_proj': tensor([0.1216, 0.0435, 0.0055,  ..., 0.0471, 0.0430, 0.0234]),
 'model.layers.0.self_attn.q_proj': tensor([0.1216, 0.0435, 0.0055,  ..., 0.0471, 0.0430, 0.0234]),
 'model.layers.0.self_attn.o_proj': tensor([0.0194, 0.0315, 0.0198,  ..., 0.0155, 0.0145, 0.0131]),
 'model.layers.0.mlp.up_proj': tensor([0.1709, 0.1758, 0.1670,  ..., 0.1885, 0.1875, 0.1855]),
 'model.layers.0.mlp.gate_proj': tensor([0.1709, 0.1758, 0.1670,  ..., 0.1885, 0.1875, 0.1855]),
 'model.layers.0.mlp.down_proj': tensor([0.1416, 0.0781, 0.2041,  ..., 0.1982, 0.0811, 0.4746]),
 'model.layers.1.self_attn.k_proj': tensor([0.3867, 0.3770, 0.3984,  ..., 0.2246, 0.3105, 0.2754]),
 'model.layers.1.self_attn.v_proj': tensor([0.3867, 0.3770, 0.3984,  ..., 0.2246, 0.3105, 0.2754]),
 'model.layers.1.self_attn.q_proj': tensor([0.3867, 0.3770, 0.3984,  ..., 0.2246, 0.3105, 0.2754]),
 'model.lay

In [15]:
def get_layer_scale(scale, n_outliers):
    layer_scale = scale.sort()[1][-n_outliers:]
    # layer_scale = layer_scale.sort()[0]
    layer_scale = set(layer_scale.numpy())
    return layer_scale

In [16]:
layer_names = list(act_scale.keys())
n_outliers = 128
n_diff = {}

In [17]:
for name in layer_names:

    if name == 'lm_head':
        continue

    scale = act_scale[name]
    layer_act_scale = get_layer_scale(scale, n_outliers)

    scale = weight_scale[name]
    layer_weight_scale = get_layer_scale(scale, n_outliers)

    n_diff[name] = len(layer_act_scale.difference(layer_weight_scale))

In [18]:
sum([128 for t in act_scale.values()])

28800

In [19]:
sum(n_diff.values())

4765

In [10]:
import torch
x = torch.ones((4096, 4096))

In [4]:
x.t()

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [None]:
optimum-cli export onnx --task 'default' --model /home/Quantization/weights_study/weights/llama_quik3bit_transformers_sft_mixture_only_fp_quant_split_weight_v1/checkpoint-600 /home/onnx/llama7b_quik_3bit

In [None]:
optimum-cli export onnx --task 'default' --device 'cuda' --dtype 'fp16' --model /home/Quantization/weights_study/weights/llama_quik3bit_transformers_sft_mixture_only_fp_quant_split_weight_v1/checkpoint-600 /home/onnx/llama7b_quik_3bit

In [4]:
for name, param in model.named_parameters():
    param.requires_grad = Falsea

In [5]:
if config['QuantizedLinear']['replace']:
    outliers_config= config['outliers']
    outlier_ids = get_fp_llama(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num']
    )
    model.replace_Linear(
        outlier_ids=outlier_ids,
        training_mode=config['QuantizedLinear']['training_mode'] 
    )

In [6]:
if config['BitNoiseQuant']['add_quant_noise']:
    noise_config = config['BitNoiseQuant']
    outliers_config= config['outliers']
    outlier_ids, layer_bit = prepare_llama_quant(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num'], 
        **noise_config['layer_bits']
    )
    model.add_quant_bitnoise_to_weight( 
        layer_bit=layer_bit,
        compute_scale=noise_config['compute_scale'],
        learnable_scale=noise_config['learnable_scale'],
        quant_noise_predict=noise_config['predict']
    )

In [1]:
model.model.layers[0].self_attn.q_proj

NameError: name 'model' is not defined

In [3]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [5]:
model.model.layers[31].self_attn.q_proj.int_weight

AttributeError: 'Linear' object has no attribute 'int_weight'

In [9]:
model.model.layers[0].self_attn.q_proj.mask

device(type='cpu')

In [3]:
if config['QuantizedLinear']['replace']:
    outliers_config= config['outliers']
    outlier_ids = get_fp_llama(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num']
    )
    model.replace_Linear(
        outlier_ids=outlier_ids,
        training_mode=config['QuantizedLinear']['training_mode'] 
    )

In [5]:
if config['loading_quik_quant_weight']['load_weight']:
    path_to_params = config['loading_quik_quant_weight']['path_to_quant_params']
    learnable_scale = config['loading_quik_quant_weight']['learnable_scale']
    quant_params = torch.load(path_to_params)

    model.add_quant_weight(quant_params, learnable_scale)

In [7]:
model.save_pretrained("/home/Quantization/weights_study/weights/llama7b_mixed_weight")
tokenizer.save_pretrained("/home/Quantization/weights_study/weights/llama7b_mixed_weight")

('/home/Quantization/weights_study/weights/llama7b_mixed_weight/tokenizer_config.json',
 '/home/Quantization/weights_study/weights/llama7b_mixed_weight/special_tokens_map.json',
 '/home/Quantization/weights_study/weights/llama7b_mixed_weight/tokenizer.model',
 '/home/Quantization/weights_study/weights/llama7b_mixed_weight/added_tokens.json',
 '/home/Quantization/weights_study/weights/llama7b_mixed_weight/tokenizer.json')

In [8]:
model.state_dict()

OrderedDict([('model.embed_tokens.weight',
              tensor([[ 1.2517e-06, -1.7881e-06, -4.3511e-06,  ...,  8.9407e-07,
                       -6.5565e-06,  8.9407e-07],
                      [ 1.8616e-03, -3.3722e-03,  3.9864e-04,  ..., -8.3008e-03,
                        2.5787e-03, -3.9368e-03],
                      [ 1.0986e-02,  9.8877e-03, -5.0964e-03,  ...,  2.5177e-03,
                        7.7057e-04, -5.0049e-03],
                      ...,
                      [-1.3977e-02, -2.7313e-03, -1.9897e-02,  ..., -1.0437e-02,
                        9.5825e-03, -1.8005e-03],
                      [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
                       -1.6357e-02,  3.3875e-03],
                      [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
                       -1.2939e-02,  3.1948e-05]], dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.q_proj.int_weight',
              tensor([[-1., -2., -0.,  ...,  1.,  0., -1.],

In [9]:
model.model.layers[0].self_attn.q_proj.quantizer.alpha_scale

Parameter containing:
tensor([[0.0186],
        [0.0312],
        [0.0310],
        ...,
        [0.0388],
        [0.0449],
        [0.0275]], dtype=torch.bfloat16)

In [18]:
model.model.layers[0].self_attn.q_proj.fp_weight.shape

torch.Size([4096, 128])

In [8]:
alpha_trained = model.model.layers[0].self_attn.q_proj.quantizer.alpha_scale.data

In [None]:
optimum-cli export onnx --task text-generation --model /home/Quantization/weights_study/weights/llama7b_3bit_loaded ./onnx_model/

In [3]:
w = model.model.layers[0].self_attn.q_proj.weight.data
alpha = model.model.layers[0].self_attn.q_proj.quantizer.alpha_scale
bit = model.model.layers[0].self_attn.q_proj.quantizer.bit
mask = model.model.layers[0].self_attn.q_proj.quantizer.mask
inv_col_perm = model.model.layers[0].self_attn.q_proj.inv_col_perm
qmax = 2**(bit-1) - 1
scale = alpha / qmax

In [4]:
model.model.layers[0].self_attn.q_proj.is_quant_weight

False

In [4]:
int_weight = w[:, mask]
fp_weight = w[:, ~mask].detach()

In [5]:
w_dq = model.model.layers[0].self_attn.q_proj.quantizer(int_weight)

In [11]:
w_dq.shape

torch.Size([4096, 3968])

In [9]:
fp_weight.shape

torch.Size([4096, 128])

In [13]:
w_out = torch.hstack([w_dq, fp_weight])

In [14]:
w_out = w_out[:, inv_col_perm]

In [18]:
w_out[:, 3190]

tensor([-0.0349, -0.0059,  0.0396,  ..., -0.0160,  0.1279, -0.0388],
       dtype=torch.bfloat16)

In [9]:
w

tensor([[-1., -2., -0.,  ...,  1.,  0., -1.],
        [ 2., -0.,  0.,  ..., -1., -1.,  1.],
        [-1.,  1.,  0.,  ...,  1.,  2., -0.],
        ...,
        [-0.,  1., -0.,  ...,  1., -2.,  1.],
        [ 2.,  1.,  0.,  ..., -2., -1., -1.],
        [-1., -1.,  0.,  ...,  2.,  2., -1.]], dtype=torch.bfloat16)

In [6]:
w[:, 3190]

tensor([-0.0349, -0.0059,  0.0396,  ..., -0.0160,  0.1279, -0.0388],
       dtype=torch.bfloat16)

In [7]:
w[:, ~mask]

tensor([[ 0.0124,  0.0267,  0.0204,  ...,  0.0062,  0.0303,  0.0591],
        [-0.1455, -0.0396, -0.0713,  ..., -0.0376,  0.0079,  0.0317],
        [ 0.2500,  0.0452,  0.0439,  ...,  0.1216, -0.0444, -0.0610],
        ...,
        [-0.0344, -0.0242, -0.0698,  ...,  0.0124, -0.0294,  0.0439],
        [-0.0028,  0.0023, -0.0496,  ...,  0.0164, -0.0146, -0.0601],
        [-0.0106,  0.0120, -0.0114,  ...,  0.0282,  0.0366,  0.0120]],
       dtype=torch.bfloat16)

In [12]:
w[:, 3190]

tensor([ 0.0098, -0.0417,  0.0391,  ...,  0.0025, -0.0674,  0.0315],
       dtype=torch.bfloat16)

In [25]:
w[:, 3190]

tensor([ 0.0098, -0.0417,  0.0391,  ...,  0.0025, -0.0674,  0.0315],
       dtype=torch.bfloat16)

In [10]:
alpha / qmax * w

tensor([[-0.0062, -0.0124, -0.0000,  ...,  0.0062,  0.0000, -0.0062],
        [ 0.0209, -0.0000,  0.0000,  ..., -0.0104, -0.0104,  0.0104],
        [-0.0103,  0.0103,  0.0000,  ...,  0.0103,  0.0206, -0.0000],
        ...,
        [-0.0000,  0.0129, -0.0000,  ...,  0.0129, -0.0259,  0.0129],
        [ 0.0299,  0.0150,  0.0000,  ..., -0.0299, -0.0150, -0.0150],
        [-0.0092, -0.0092,  0.0000,  ...,  0.0183,  0.0183, -0.0092]],
       dtype=torch.bfloat16)

In [16]:
model.model.layers[0].self_attn.q_proj.weight[:, 3190]

tensor([ 0.0098, -0.0417,  0.0391,  ...,  0.0025, -0.0674,  0.0315],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [32]:
fp_indices = quant_params['model.layers.0.self_attn.q_proj']['fp_indices']

In [38]:
fp_indices

tensor([3190,  588, 2518, 2993, 1808, 2750, 4015, 2077, 2522, 2820, 2469, 2641,
        3601,  760, 2958, 3275, 1375, 1218, 3213, 1391, 2196,  447, 2637, 2350,
        1346,  575, 2580, 2314, 1791, 1553, 2050, 2092,  363,  642,  326, 3656,
        1571, 3306, 1159, 2158, 3842, 2235,  462, 3492, 3946, 1404, 3863,   94,
         613, 1261, 2298, 3135,  310, 2608,  289, 2789, 2403, 2028, 2147, 4076,
        1076,  490, 1512, 2593, 3431, 4031, 3209,  339, 2232, 3953,  959,  210,
         391, 2744,  125, 3222, 2944, 1456, 2866, 1544,  580, 2914, 3893, 2533,
        3797,  597, 2393, 1710, 3045, 2778, 1626, 2136, 3729, 2853, 3877, 3550,
        2317, 3933, 1813, 3920,  257,  102, 1996,  288,  577, 1788,  473, 2704,
        3915, 1995, 1415, 3238, 3078, 2130, 4071, 2927, 1622, 3164, 1411, 1110,
        3178, 2543,   22, 3443, 4030, 3964, 1744, 4051])

In [42]:
fp_indices[122]

tensor(22)

In [41]:
fp_indices.sort()[0]

tensor([  22,   94,  102,  125,  210,  257,  288,  289,  310,  326,  339,  363,
         391,  447,  462,  473,  490,  575,  577,  580,  588,  597,  613,  642,
         760,  959, 1076, 1110, 1159, 1218, 1261, 1346, 1375, 1391, 1404, 1411,
        1415, 1456, 1512, 1544, 1553, 1571, 1622, 1626, 1710, 1744, 1788, 1791,
        1808, 1813, 1995, 1996, 2028, 2050, 2077, 2092, 2130, 2136, 2147, 2158,
        2196, 2232, 2235, 2298, 2314, 2317, 2350, 2393, 2403, 2469, 2518, 2522,
        2533, 2543, 2580, 2593, 2608, 2637, 2641, 2704, 2744, 2750, 2778, 2789,
        2820, 2853, 2866, 2914, 2927, 2944, 2958, 2993, 3045, 3078, 3135, 3164,
        3178, 3190, 3209, 3213, 3222, 3238, 3275, 3306, 3431, 3443, 3492, 3550,
        3601, 3656, 3729, 3797, 3842, 3863, 3877, 3893, 3915, 3920, 3933, 3946,
        3953, 3964, 4015, 4030, 4031, 4051, 4071, 4076])

In [44]:
fp_indices.sort()[1]

tensor([122,  47, 101,  74,  71, 100, 103,  54,  52,  34,  67,  32,  72,  21,
         42, 106,  61,  25, 104,  80,   1,  85,  48,  33,  13,  70,  60, 119,
         38,  17,  49,  24,  16,  19,  45, 118, 110,  77,  62,  79,  29,  36,
        116,  90,  87, 126, 105,  28,   4,  98, 109, 102,  57,  30,   7,  31,
        113,  91,  58,  39,  20,  68,  41,  50,  27,  96,  23,  86,  56,  10,
          2,   8,  83, 121,  26,  63,  53,  22,  11, 107,  73,   5,  89,  55,
          9,  93,  78,  81, 115,  76,  14,   3,  88, 112,  51, 117, 120,   0,
         66,  18,  75, 111,  15,  37,  64, 123,  43,  95,  12,  35,  92,  84,
         40,  46,  94,  82, 108,  99,  97,  44,  69, 125,   6, 124,  65, 127,
        114,  59])

In [47]:
quant_params['model.layers.0.self_attn.q_proj']['fp_weight'][:, fp_indices.sort()[1]]

tensor([[ 0.0124,  0.0267,  0.0204,  ...,  0.0062,  0.0303,  0.0591],
        [-0.1458, -0.0396, -0.0711,  ..., -0.0377,  0.0079,  0.0318],
        [ 0.2510,  0.0451,  0.0439,  ...,  0.1216, -0.0443, -0.0609],
        ...,
        [-0.0345, -0.0242, -0.0696,  ...,  0.0124, -0.0294,  0.0439],
        [-0.0028,  0.0024, -0.0497,  ...,  0.0164, -0.0146, -0.0600],
        [-0.0106,  0.0120, -0.0113,  ...,  0.0282,  0.0365,  0.0120]],
       dtype=torch.float16)

In [8]:
l = nn.Linear(2, 3)
l.weight.device

device(type='cpu')

In [4]:
if config['QuantizedLinear']['replace']:
    outliers_config= config['outliers']
    outlier_ids = get_fp_llama(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num']
    )
    model.replace_Linear(
        outlier_ids=outlier_ids,
        training_mode=config['QuantizedLinear']['training_mode'] 
    )

AttributeError: 'Linear' object has no attribute 'device'

In [5]:
if config['loading_quik_quant_weight']['load_weight']:
    path_to_params = config['loading_quik_quant_weight']['path_to_quant_params']
    learnable_scale = config['loading_quik_quant_weight']['learnable_scale']
    quant_params = torch.load(path_to_params)

    model.add_quant_weight(quant_params, learnable_scale)

In [19]:
outlier_ids = quant_params['model.layers.0.self_attn.q_proj']['fp_indices']
fp_weight = quant_params['model.layers.0.self_attn.q_proj']['fp_weight']

In [25]:
fp_weight = fp_weight[:, outlier_ids.sort()[1]]

In [24]:
mask = model.model.layers[0].self_attn.q_proj.mask

In [26]:
w[:, ~mask] = fp_weight

In [27]:
w[:, 3190]

tensor([-0.0349, -0.0059,  0.0396,  ..., -0.0160,  0.1276, -0.0387],
       dtype=torch.float16)

In [29]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-1., -2., -0.,  ...,  1.,  0., -1.],
        [ 2., -0.,  0.,  ..., -1., -1.,  1.],
        [-1.,  1.,  0.,  ...,  1.,  2., -0.],
        ...,
        [-0.,  1., -0.,  ...,  1., -2.,  1.],
        [ 2.,  1.,  0.,  ..., -2., -1., -1.],
        [-1., -1.,  0.,  ...,  2.,  2., -1.]], dtype=torch.float16,
       requires_grad=True)

In [7]:
w = model.model.layers[0].self_attn.q_proj.weight.data
alpha = model.model.layers[0].self_attn.q_proj.quantizer.alpha_scale
bit = model.model.layers[0].self_attn.q_proj.quantizer.bit
qmax = 2**(bit-1) - 1

In [8]:
alpha / qmax * w

tensor([[-0.0062, -0.0124, -0.0000,  ...,  0.0062,  0.0000, -0.0062],
        [ 0.0209, -0.0000,  0.0000,  ..., -0.0104, -0.0104,  0.0104],
        [-0.0103,  0.0103,  0.0000,  ...,  0.0103,  0.0207, -0.0000],
        ...,
        [-0.0000,  0.0129, -0.0000,  ...,  0.0129, -0.0258,  0.0129],
        [ 0.0300,  0.0150,  0.0000,  ..., -0.0300, -0.0150, -0.0150],
        [-0.0092, -0.0092,  0.0000,  ...,  0.0183,  0.0183, -0.0092]])

In [9]:
w[:, 3190]

tensor([-0.0349, -0.0059,  0.0396,  ..., -0.0160,  0.1276, -0.0387],
       dtype=torch.float16)

In [3]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-1., -2., -0.,  ...,  1.,  0., -1.],
        [ 2., -0.,  0.,  ..., -1., -1.,  1.],
        [-1.,  1.,  0.,  ...,  1.,  2., -0.],
        ...,
        [-0.,  1., -0.,  ...,  1., -2.,  1.],
        [ 2.,  1.,  0.,  ..., -2., -1., -1.],
        [-1., -1.,  0.,  ...,  2.,  2., -1.]], dtype=torch.bfloat16,
       requires_grad=True)

In [None]:
/home/Quantization/weights_study/weights/llama7b_3bit_loaded

In [10]:
model.save_pretrained("/home/Quantization/weights_study/weights/llama7b_3bit_loaded")
tokenizer.save_pretrained("/home/Quantization/weights_study/weights/llama7b_3bit_loaded")

('/home/Quantization/weights_study/weights/llama7b_3bit_loaded/tokenizer_config.json',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/special_tokens_map.json',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/tokenizer.model',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/added_tokens.json',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/tokenizer.json')

('/home/Quantization/weights_study/weights/llama7b_3bit_loaded/tokenizer_config.json',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/special_tokens_map.json',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/tokenizer.model',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/added_tokens.json',
 '/home/Quantization/weights_study/weights/llama7b_3bit_loaded/tokenizer.json')

In [30]:
model.save_pretrained("/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale")
tokenizer.save_pretrained("/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale")

('/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale/tokenizer_config.json',
 '/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale/special_tokens_map.json',
 '/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale/tokenizer.model',
 '/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale/added_tokens.json',
 '/home/Quantization/weights_study/weights/llama7b_4bit_trained_scale/tokenizer.json')

In [None]:
if config['loading_quik_quant_weight']['load_weight']:
    path_to_params = config['loading_quik_quant_weight']['path_to_quant_params']
    learnable_scale = config['loading_quik_quant_weight']['learnable_scale']
    quant_params = torch.load(path_to_params)

    model.add_quant_weight(quant_params, learnable_scale)

In [3]:
noise_config = config['NoiseQuant']
outliers_config = config['outliers']
outlier_ids, layer_bit = prepare_llama_quant(
    outliers_config['path_to_act_scales'], 
    outliers_config['fp_features_num'], 
    **noise_config['layer_bits']
)

In [4]:
def get_fp_inds_for_quik(path_to_act_scales, fp_features_num):
    act_scales = torch.load(path_to_act_scales)
    fp_indices_in_lin_layers = {k: torch.sort(v)[1][-fp_features_num:] for k, v in act_scales.items()}
    return fp_indices_in_lin_layers

noise_config = config['NoiseQuant']
outliers_config = config['outliers']

fp_inds_in_lin_layers = get_fp_inds_for_quik(outliers_config['path_to_act_scales'], outliers_config['fp_features_num'])

In [5]:
outlier_fraction = 0.05 

modules_name_dict = {name: module for name, module in model.named_modules()}
for name, module in modules_name_dict.items():
    if isinstance(module, nn.Linear) and (name.find('lm_head') == -1):
        ind = name.rfind(".")
        if ind == -1:
            father = modules_name_dict[""]
        else:
            father = modules_name_dict[name[:ind]]
        print(name)
        fp_indices = fp_inds_in_lin_layers[name]

        weight = module.weight.data
        mask = torch.ones(weight.size(1), dtype=torch.bool)
        mask[fp_indices] = False

 
        with torch.no_grad(): 
            w = weight[:, mask] 
            w_flat = w.view(-1) 
            lower_threshold, upper_threshold = ( 
                torch.kthvalue( 
                    w_flat, 
                    int(w_flat.numel() * outlier_fraction / 2), 
                )[0], 
                torch.kthvalue( 
                    w_flat, 
                    int(w_flat.numel() * (1 - outlier_fraction / 2)), 
                )[0], 
            ) 
        
            outliers = (w < lower_threshold) | (w > upper_threshold) 
        
            outlier_mask = outliers.detach()
            w[outlier_mask] = 0

            module.weight.data[:, mask] = w

model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.self_attn.o_proj
model.layers.2.mlp.gate_proj
model.layers.2.mlp.up_proj
model.layers.2.mlp.down_proj
model.layers.3.self_attn.q_proj
model.layers.3.self_attn.k_proj
model.layers.3.self_attn.v_proj
model.layers.3.self_attn.o_proj
model.layers.3.mlp.gate_proj
model.layers.3.mlp.up_proj
model.layers.3.mlp.down_proj
model.layers.4.self_attn.q_proj
model.layers.4.self_attn.k_proj
model.layers.4.self_attn.v_proj
model.layers.4.self_attn.o_proj
model.layers.4.mlp.g

In [7]:
model.save_pretrained("/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight")

In [8]:
tokenizer.save_pretrained("/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight")

('/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight/tokenizer_config.json',
 '/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight/special_tokens_map.json',
 '/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight/tokenizer.model',
 '/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight/added_tokens.json',
 '/home/Quantization/weights_study/weights/llama7b_no_outliers_in_quant_weight/tokenizer.json')

In [21]:
weight[:, mask].max()

tensor(0.2715, dtype=torch.bfloat16)

In [22]:
w_quant = weight[:, mask]

In [31]:
module.weight.data[:, mask].max()

tensor(0.0203, dtype=torch.bfloat16)

In [3]:
quik_dict = torch.load("/home/LLM_compression/QUIK/weights/llama7b_3bit_128fp_quant_scales/quant_params.pt")

In [4]:
fp_indices = quik_dict['model.layers.25.mlp.up_proj']['fp_indices']

In [None]:
self.col_perm = act_scales.sort()[1]
self.inv_col_perm = torch.zeros_like(self.col_perm)
self.inv_col_perm[self.col_perm] = torch.arange(self.col_perm.numel())

In [17]:
fp_indices.sort()[0]

tensor([  16,   23,   93,   94,  149,  257,  264,  282,  310,  339,  363,  386,
         420,  436,  448,  462,  468,  470,  488,  490,  588,  597,  641,  788,
         888,  934,  972, 1214, 1215, 1331, 1335, 1345, 1360, 1363, 1379, 1404,
        1415, 1432, 1465, 1494, 1512, 1571, 1605, 1619, 1678, 1688, 1755, 1763,
        1793, 1825, 1839, 1845, 1916, 1946, 2010, 2016, 2033, 2036, 2050, 2056,
        2084, 2094, 2158, 2168, 2192, 2209, 2215, 2230, 2260, 2281, 2298, 2324,
        2350, 2358, 2360, 2369, 2389, 2393, 2459, 2465, 2469, 2533, 2573, 2611,
        2622, 2647, 2750, 2789, 2852, 2853, 2863, 2883, 2916, 2924, 2927, 2980,
        3002, 3038, 3061, 3173, 3178, 3202, 3208, 3215, 3241, 3391, 3444, 3471,
        3546, 3571, 3651, 3656, 3700, 3766, 3803, 3826, 3839, 3844, 3872, 3952,
        3971, 3997, 4030, 4051, 4053, 4071, 4074, 4076])

In [84]:
quik_dict['model.layers.25.mlp.up_proj']['quant_weight']

device(type='cpu')

In [85]:
weight = quik_dict['model.layers.25.mlp.up_proj']['quant_weight']
alpha_scale = quik_dict['model.layers.25.mlp.up_proj']['alpha'].to('cpu')
qmax = quik_dict['model.layers.25.mlp.up_proj']['maxq'].to('cpu')

In [86]:
w = alpha_scale / qmax * weight

In [92]:
weight.requires_grad_()

tensor([[-3., -1.,  2.,  ...,  1.,  1., -2.],
        [-2.,  0.,  1.,  ..., -1.,  1., -0.],
        [ 2., -0.,  2.,  ..., -1., -1., -0.],
        ...,
        [ 0.,  0.,  3.,  ...,  1., -3., -2.],
        [-2.,  1., -3.,  ..., -3., -2.,  0.],
        [-1.,  0.,  2.,  ...,  2.,  1.,  2.]], dtype=torch.float16,
       requires_grad=True)

In [21]:
weight

NameError: name 'weight' is not defined

In [65]:
weight = quik_dict['model.layers.25.mlp.up_proj']['quant_weight'].data.clone()
mask = torch.ones(weight.size(1), dtype=torch.bool)
mask[fp_indices] = False

In [66]:
col_ids = torch.arange(weight.size(1))
col_perm = torch.cat([col_ids[mask], col_ids[~mask]])
inv_col_perm = torch.zeros(col_perm.numel(), dtype=col_perm.dtype)
inv_col_perm[col_perm] = torch.arange(col_perm.numel())

In [69]:
weight = weight[:, col_perm].clone()

In [73]:
weight = weight[:, inv_col_perm].clone()

In [78]:
(weight != quik_dict['model.layers.25.mlp.up_proj']['quant_weight'].data).sum()

tensor(0)

In [64]:
inv_col_perm

tensor([   0,    1,    2,  ..., 3965, 3966, 3967])

In [9]:
col_perm = fp_indices.sort()[0]

In [24]:
num_cols

tensor(4096)

In [26]:
torch.zeros(4096)

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [27]:
num_cols = .shape[1]

inv_col_perm = torch.zeros(num_cols)
inv_col_perm[col_perm] = torch.arange(num_cols)

RuntimeError: shape mismatch: value tensor of shape [4096] cannot be broadcast to indexing result of shape [128]

In [31]:
col_perm

tensor([  16,   23,   93,   94,  149,  257,  264,  282,  310,  339,  363,  386,
         420,  436,  448,  462,  468,  470,  488,  490,  588,  597,  641,  788,
         888,  934,  972, 1214, 1215, 1331, 1335, 1345, 1360, 1363, 1379, 1404,
        1415, 1432, 1465, 1494, 1512, 1571, 1605, 1619, 1678, 1688, 1755, 1763,
        1793, 1825, 1839, 1845, 1916, 1946, 2010, 2016, 2033, 2036, 2050, 2056,
        2084, 2094, 2158, 2168, 2192, 2209, 2215, 2230, 2260, 2281, 2298, 2324,
        2350, 2358, 2360, 2369, 2389, 2393, 2459, 2465, 2469, 2533, 2573, 2611,
        2622, 2647, 2750, 2789, 2852, 2853, 2863, 2883, 2916, 2924, 2927, 2980,
        3002, 3038, 3061, 3173, 3178, 3202, 3208, 3215, 3241, 3391, 3444, 3471,
        3546, 3571, 3651, 3656, 3700, 3766, 3803, 3826, 3839, 3844, 3872, 3952,
        3971, 3997, 4030, 4051, 4053, 4071, 4074, 4076])

In [30]:
torch.arange(num_cols)

tensor([   0,    1,    2,  ..., 4093, 4094, 4095])

In [29]:
inv_col_perm.shape

torch.Size([4096])

In [22]:
torch.arange(num_cols)

tensor([   0,    1,    2,  ..., 4093, 4094, 4095])

In [None]:
fp_indices = quik_dict['model.layers.25.mlp.up_proj']['fp_indices']
fp_weight = quik_dict['model.layers.25.mlp.up_proj']['fp_weight']
bit = quik_dict['model.layers.25.mlp.up_proj']['bit']
alpha = quik_dict['model.layers.25.mlp.up_proj']['alpha']
maxq = 2 ** (bit -1) - 1 
scale = alpha / maxq

In [None]:
modules_name_dict = {name: module for name, module in model.named_modules()}
        for name, module in modules_name_dict.items():
            if isinstance(module, nn.Linear) and (name.find('lm_head') == -1):
                ind = name.rfind(".")
                if ind == -1:
                    father = modules_name_dict[""]
                else:
                    father = modules_name_dict[name[:ind]]
                print(name)
                fp_cols_inds = fp_inds_in_lin_layers[name]
                qlinear = LinearQuantNoise(
                    module.weight, module.bias, 
                    quant_bit=config['LinearQuantNoise']['quant_bit'], 
                    block_size=config['LinearQuantNoise']['block_size'], 
                    fp_cols_inds=fp_cols_inds, 
                    training_mode=config['LinearQuantNoise']['training_mode'], 
                    add_quant_noise=config['LinearQuantNoise']['add_quant_noise']
                )
                qlinear.get_quant_scales()
                setattr(father, name[ind + 1:], qlinear)

In [6]:
model

NameError: name 'model' is not defined

In [8]:
model.quantize_weight()

In [11]:
model.save_pretrained("/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales")

In [12]:
tokenizer.save_pretrained("/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales")

('/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales/tokenizer_config.json',
 '/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales/special_tokens_map.json',
 '/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales/tokenizer.model',
 '/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales/added_tokens.json',
 '/home/LLM_compression/QUIK/weights/llama7b_2w_16a_128fp_quant_with_trained_scales/tokenizer.json')

In [6]:
model.model.layers[0].self_attn.q_proj.weight[:, 3190]

tensor([-0.0354, -0.0039,  0.0415,  ..., -0.0312,  0.1289, -0.0405],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [10]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0117, -0.0233,  0.0000,  ...,  0.0000,  0.0000, -0.0117],
        [ 0.0156, -0.0156,  0.0000,  ..., -0.0156, -0.0156,  0.0000],
        [-0.0203,  0.0101,  0.0000,  ...,  0.0101,  0.0101, -0.0101],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0542,  0.0000],
        [ 0.0216,  0.0000,  0.0000,  ..., -0.0432, -0.0216, -0.0216],
        [-0.0356, -0.0178,  0.0000,  ...,  0.0178,  0.0178, -0.0178]],
       dtype=torch.bfloat16, requires_grad=True)

In [9]:
model.model.layers[0].self_attn.q_proj.weight[:, 3190]

tensor([-0.0354, -0.0039,  0.0415,  ..., -0.0312,  0.1289, -0.0405],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [4]:
model.model.layers[0].self_attn.q_proj.weight[:, 3190]

tensor([-0.0354, -0.0039,  0.0415,  ..., -0.0312,  0.1289, -0.0405],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [3]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       dtype=torch.bfloat16, requires_grad=True)

In [8]:
model.model.layers[0].self_attn.q_proj.weight

tensor([-0.0354, -0.0092,  0.0415,  ..., -0.0162,  0.1289, -0.0405],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [6]:
model.model.layers[0].self_attn.q_proj.quantizer.alpha.shape

torch.Size([4096, 1])

In [3]:
#Optimization
# from transformers.modeling_utils import unwrap_model

model_name  = model._get_name()
if model_name in ['LlamaForCausalLM']:
    pass

decoder_layer = model.model.layers[0]


layers = ['self_attn', 'mlp']
projectors = {
    'self_attn': ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    'mlp': ['up_proj', 'down_proj', 'gate_proj']
}
for layer_name in layers:
    # cur_layer = getattr(self, layer_name)
    cur_layer = getattr(decoder_layer, layer_name)
    for proj_name in projectors[layer_name]:
        cur_projection = getattr(cur_layer, proj_name)

        # if isinstance(cur_projection, QuantizedLinear):
        layer_weight_num = torch.tensor(cur_projection.quantizer.weight_shape).sum()
        break
    break

AttributeError: 'Linear' object has no attribute 'quantizer'

In [4]:
def lsq_forward(w, bit, alpha):    
    qmax = 2 ** (bit.detach() - 1) - 1
    # q = F.hardtanh(w / alpha, -1.0, 1.0) * qmax
    q = F.hardtanh(w / alpha, -1.0, 1.0)
    mask_q_pos = (q > 0)
    q = q * (2 ** (bit.detach() - 1)) - mask_q_pos * q
    out = (q.round() + (q - q.detach())) * (alpha / qmax)
    return out

In [45]:
cur_projection.quantizer.quant_cols_num

AttributeError: 'NoiseQuant' object has no attribute 'quant_cols_num'

In [48]:
bit = cur_projection.quantizer.bit
bit = torch.tensor(bit)
block_size = cur_projection.quantizer.block_size
mask = cur_projection.quantizer.mask
quant_cols_num = 31
w = cur_projection.weight

In [50]:
if mask is not None:
    w_re = w[:, mask]
else:
    w_re = w

if block_size > 0:
    out_features = w_re.shape[0]
    in_features = w_re.shape[1]
    # w_re = w_re.reshape((out_features * block_size, in_features // block_size))
       
    w_re = w_re.reshape((out_features * quant_cols_num, block_size))

In [6]:
# alpha0 = 0.01*torch.ones(out_features, in_features // block_size, dtype=w.dtype)

In [51]:
# alpha0 = nn.Parameter(0.01*torch.ones(out_features, in_features // block_size, dtype=w.dtype))
alpha = nn.Parameter(0.01*torch.ones(out_features * in_features // block_size, 1, dtype=w.dtype))

In [8]:
# alpha = torch.repeat_interleave(alpha0, block_size, dim=0)

In [52]:
# bit = nn.Parameter(torch.zeros(1))
# alpha = nn.Parameter(torch.tensor(0.01))

N_BIN = 256
# bit = 2 + torch.sigmoid(bit)*4
bit = 1.5 + torch.sigmoid(bit)

bit += (torch.rand_like(bit) - 0.5)
bit = bit.round() + (bit - bit.detach())

alpha = F.softplus(alpha, beta=10**(6), threshold=1) 
lsq = lsq_forward(w_re, bit.round(), alpha)

c1 = w_re >= alpha
c2 = w_re <= -alpha     
delta = alpha / (2**(bit - 1) - 1)

with torch.no_grad():                
    diff = (lsq - w_re) / delta #difference between dequantized and original weights after their scale
    sel = diff[torch.logical_not(torch.logical_or(c1, c2))] #take weights less than alpha

    hist = torch.histc(sel, bins=N_BIN, min=-0.5, max=0.5)    

    noise = torch.multinomial(hist, w_re.numel(), True) + torch.rand_like(w_re.view(-1))               
    noise = (noise / N_BIN - 0.5).view(w_re.shape)
    noise = noise.to(w_re.dtype)

w_rand = noise * delta
w_cliped = torch.where(c2, -alpha, w_re + w_rand)
w_cliped = torch.where(c1, alpha, w_cliped)

In [53]:
if mask is not None:
    w_out = torch.zeros(w.shape, dtype=w.dtype, device=w.device)
    w_out[:, mask] = w_cliped.reshape((out_features, in_features))
    w_out[:, ~mask] = w[:, ~mask]

In [146]:
w_out[:, 3190]

tensor([-0.0354, -0.0087,  0.0415,  ..., -0.0162,  0.1289, -0.0405],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [133]:
x1 = torch.ones(3, 3)
print(x1)
pad_value = 0
pad_func = nn.ConstantPad1d((0, 1), pad_value)

output_t = pad_func(x1)
output_t

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


tensor([[1., 1., 1., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 0.]])

In [121]:
w_cliped

torch.Size([524288, 31])

In [105]:
w_rand = w_rand.reshape((out_features, in_features))

In [88]:
w_rand_ext = torch.zeros(w.shape, dtype=w.dtype, device=w.device)
w_rand_ext[:, mask] = w_rand

In [96]:
w_cliped1 = torch.where(c2, 0, w )

RuntimeError: The size of tensor a (31) must match the size of tensor b (4096) at non-singleton dimension 1

In [109]:
w_re + w_rand

RuntimeError: The size of tensor a (31) must match the size of tensor b (3968) at non-singleton dimension 1

In [108]:
torch.where(c2, -alpha, w_re + w_rand)

RuntimeError: The size of tensor a (31) must match the size of tensor b (3968) at non-singleton dimension 1

In [98]:
c2.shape

torch.Size([524288, 31])

In [95]:
torch.where(mask, w_rand, 0.0)

RuntimeError: The size of tensor a (4096) must match the size of tensor b (3968) at non-singleton dimension 1

In [93]:
w_rand

tensor([[-0.0004, -0.0036,  0.0023,  ...,  0.0023, -0.0033,  0.0024],
        [ 0.0012,  0.0018, -0.0005,  ...,  0.0031, -0.0007,  0.0005],
        [ 0.0005,  0.0020, -0.0011,  ...,  0.0045, -0.0020, -0.0036],
        ...,
        [ 0.0005, -0.0047,  0.0018,  ...,  0.0013,  0.0009, -0.0010],
        [ 0.0024,  0.0008, -0.0007,  ..., -0.0048,  0.0020,  0.0005],
        [-0.0018, -0.0005, -0.0002,  ...,  0.0013, -0.0023, -0.0033]],
       dtype=torch.bfloat16, grad_fn=<ViewBackward0>)

In [92]:
w_rand_ext.requires_grad

True

In [168]:
torch.sigmoid(torch.tensor(0)) 

tensor(0.5000)

In [172]:
1.5 + torch.sigmoid(torch.tensor(-0.5)) 

tensor(1.8775)

In [152]:
bit

tensor([4.], grad_fn=<AddBackward0>)

In [150]:
torch.where(c2, -alpha, w + noise * delta)

tensor([[-5.7293e-03, -1.0000e-02, -2.6828e-03,  ...,  4.3774e-03,
          2.2817e-03, -3.7865e-03],
        [ 1.3864e-02, -3.9429e-03,  3.4691e-03,  ..., -9.9187e-03,
         -1.0000e-02,  7.2362e-03],
        [-1.0000e-02,  1.2224e-02,  2.3237e-05,  ...,  5.5835e-03,
          1.8989e-02, -2.9551e-03],
        ...,
        [ 1.1344e-03,  1.0702e-02,  2.2955e-04,  ...,  1.0329e-02,
         -1.0000e-02,  1.0352e-02],
        [ 2.5054e-02,  1.0310e-02,  3.3550e-03,  ..., -1.0000e-02,
         -1.0000e-02, -1.0000e-02],
        [-1.0000e-02, -6.1651e-03,  1.6134e-03,  ...,  1.8652e-02,
          1.6144e-02, -8.9261e-03]], grad_fn=<WhereBackward0>)

In [129]:
torch.where(c1, alpha, torch.where(c2, -alpha, data + noise * delta))

In [145]:
torch.rand_like(w.view(-1))

tensor([0.9492, 0.5703, 0.5078,  ..., 0.0938, 0.8867, 0.8672],
       dtype=torch.bfloat16)

In [144]:
torch.multinomial(hist, w.numel(), True) + torch.rand_like(w.view(-1))

tensor([240.0000, 170.0000,   7.1875,  ...,  91.0000, 108.0000, 165.0000],
       dtype=torch.bfloat16)

In [148]:
noise

tensor([[ 0.2617, -0.0547, -0.1953,  ..., -0.1270, -0.4590, -0.3906],
        [-0.4922,  0.2305, -0.0566,  ...,  0.4180, -0.1035,  0.2266],
        [ 0.2891, -0.4648, -0.2461,  ..., -0.2363,  0.0898, -0.4512],
        ...,
        [-0.4375, -0.2637,  0.0781,  ..., -0.4062, -0.3359, -0.1836],
        [-0.1797, -0.3203,  0.1484,  ..., -0.1094,  0.2070,  0.1836],
        [ 0.1133, -0.3320,  0.2383,  ...,  0.2578, -0.2090, -0.4805]],
       dtype=torch.bfloat16)

In [47]:
is_training = True
is_discretize = True
if not is_training or is_discretize :
    bit = bit.round() + (bit - bit.detach())

In [48]:
alpha = F.softplus(alpha)
alpha

tensor([1.3133], grad_fn=<SoftplusBackward0>)

In [4]:
model.model.layers[0].self_attn.q_proj

Linear(in_features=4096, out_features=4096, bias=False)

In [None]:
model.model.layers[0].self_attn.q_proj.quantizer.quant_scale.shape

In [3]:
if config['QuantizedLinear']['replace']:
    outliers_config= config['outliers']
    outlier_ids, layer_bit = prepare_llama_quant(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num']
    )

    model.replace_Linear(
        outlier_ids=outlier_ids,
        training_mode=config['QuantizedLinear']['training_mode'] 
    )

In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QuantizedLinear(in_features=4096, out_features=4096, bias=False)
          (k_proj): QuantizedLinear(in_features=4096, out_features=4096, bias=False)
          (v_proj): QuantizedLinear(in_features=4096, out_features=4096, bias=False)
          (o_proj): QuantizedLinear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): QuantizedLinear(in_features=4096, out_features=11008, bias=False)
          (up_proj): QuantizedLinear(in_features=4096, out_features=11008, bias=False)
          (down_proj): QuantizedLinear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm(

In [14]:
w_train = model.model.layers[0].self_attn.q_proj.weight.clone()

In [15]:
w_train

tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       dtype=torch.bfloat16, grad_fn=<CloneBackward0>)

In [5]:
w_orig = model.model.layers[0].self_attn.q_proj.weight.clone()
w_orig

tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<CloneBackward0>)

In [16]:
w_train[:, 3190]

tensor([-0.0354, -0.0085,  0.0415,  ..., -0.0162,  0.1289, -0.0405],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [6]:
w_orig[:, 3190]

tensor([-0.0354, -0.0087,  0.0415,  ..., -0.0162,  0.1289, -0.0405],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [3]:
model.quantize_weight()

In [6]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0110, -0.0220,  0.0000,  ...,  0.0000,  0.0000, -0.0089],
        [ 0.0165, -0.0165,  0.0000,  ..., -0.0153, -0.0153,  0.0000],
        [-0.0259,  0.0129,  0.0000,  ...,  0.0121,  0.0121, -0.0121],
        ...,
        [ 0.0000,  0.0206,  0.0000,  ...,  0.0000, -0.0510,  0.0000],
        [ 0.0236,  0.0000,  0.0000,  ..., -0.0369, -0.0369, -0.0184],
        [-0.0300, -0.0150,  0.0000,  ...,  0.0168,  0.0168, -0.0168]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [3]:
alpha_train = model.model.layers[0].self_attn.q_proj.quantizer.alpha
alpha_train

Parameter containing:
tensor([[0.0110],
        [0.0096],
        [0.0069],
        ...,
        [0.0167],
        [0.0138],
        [0.0168]], device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [17]:
alpha_train = model.model.layers[0].self_attn.q_proj.quantizer.alpha
alpha_train

Parameter containing:
tensor([[0.0110],
        [0.0096],
        [0.0069],
        ...,
        [0.0167],
        [0.0138],
        [0.0168]], dtype=torch.bfloat16, requires_grad=True)

In [10]:
alpha_orig = model.model.layers[0].self_attn.q_proj.quantizer.alpha
alpha_orig

Parameter containing:
tensor([[0.0109],
        [0.0095],
        [0.0070],
        ...,
        [0.0167],
        [0.0137],
        [0.0167]], device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [17]:
#after train
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [10]:
model.model.layers[0].self_attn.q_proj.weight[:, 3190]

tensor([-0.0354, -0.0087,  0.0415,  ..., -0.0162,  0.1289, -0.0405],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [18]:
#after train
model.model.layers[0].self_attn.q_proj.weight[:, 3190]

tensor([-0.0354, -0.0087,  0.0415,  ..., -0.0164,  0.1289, -0.0405],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [5]:
if config['NoiseQuant']['add_quant_noise']:
    noise_config = config['NoiseQuant']
    outliers_config= config['outliers']
    outlier_ids, layer_bit = prepare_llama_quant(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num'], 
        **noise_config['layer_bits']
    )
    model.add_quant_noise_to_weight( 
        layer_bit=layer_bit, 
        block_size=noise_config['block_size'],
        fp_cols_num=outliers_config['fp_features_num'],
        compute_scale=noise_config['compute_scale'], 
        quant_noise_predict=noise_config['predict']
    )

In [5]:
if config['BitNoiseQuant']['add_quant_noise']:
    noise_config = config['BitNoiseQuant']
    outliers_config= config['outliers']
    outlier_ids, layer_bit = prepare_llama_quant(
        outliers_config['path_to_act_scales'], 
        outliers_config['fp_features_num'], 
        **noise_config['layer_bits']
    )
    model.add_quant_bitnoise_to_weight( 
        layer_bit=layer_bit, 
        block_size=noise_config['block_size'],
        fp_cols_num=outliers_config['fp_features_num'],
        compute_scale=noise_config['compute_scale'], 
        quant_noise_predict=noise_config['predict']
    )

In [17]:
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [18]:
model.quantize_weight()

In [11]:
model.model.layers[0].self_attn.q_proj.quantizer.alpha

torch.Size([4096, 1])

In [12]:
model.save_pretrained('/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter')
tokenizer.save_pretrained('/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter')

('/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter/tokenizer_config.json',
 '/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter/special_tokens_map.json',
 '/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter/tokenizer.model',
 '/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter/added_tokens.json',
 '/home/Quantization/weights_study/weights/llama-2-7b-wrand-2bit-each-iter/tokenizer.json')

In [6]:
w = model.model.layers[0].self_attn.q_proj.weight

tensor([-0.0354, -0.0085,  0.0415,  ..., -0.0162,  0.1289, -0.0405],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [9]:
w.float()

tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)

In [7]:
model.model.layers[0].self_attn.q_proj.quantizer.quant_noise(w)

tensor([[-0.0042, -0.0109, -0.0019,  ...,  0.0014,  0.0023, -0.0038],
        [ 0.0149, -0.0063, -0.0031,  ..., -0.0135, -0.0069,  0.0061],
        [-0.0130,  0.0150,  0.0035,  ...,  0.0023,  0.0123, -0.0027],
        ...,
        [ 0.0064,  0.0075,  0.0090,  ...,  0.0037, -0.0255,  0.0149],
        [ 0.0237,  0.0002,  0.0055,  ..., -0.0184, -0.0078, -0.0099],
        [-0.0208, -0.0118,  0.0067,  ...,  0.0167,  0.0193, -0.0113]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<CopySlices>)

In [5]:
model.model.layers[0].self_attn.q_proj.quantizer.quant_scale

AttributeError: 'NoneType' object has no attribute 'quant_scale'

In [6]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QuantizedLinear(
            in_features=4096, out_features=4096, bias=False
            (quantizer): BitNoiseQuant()
          )
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): Ll

In [6]:
model.model.layers[0].self_attn.q_proj.quantizer.alpha

Parameter containing:
tensor([[0.0109],
        [0.0095],
        [0.0070],
        ...,
        [0.0167],
        [0.0137],
        [0.0167]], device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [6]:
model.model.layers[0].self_attn.q_proj.mask

tensor([True, True, True,  ..., True, True, True])

In [None]:
# if config['change_training_mode']:
#     outliers_config= config['outliers']
#     outlier_ids, _ = prepare_llama_quant(
#         outliers_config['path_to_act_scales'], 
#         outliers_config['fp_features_num']
#     )
#     training_mode = config['change_training_mode']
#     model.change_training_mode(outlier_ids, training_mode)

In [7]:
#Load and preprocessing dataset

# no default pad token for llama!
# here we add all special tokens again, because the default ones are not in the special_tokens_map
if isinstance(tokenizer, LlamaTokenizer) or isinstance(tokenizer, LlamaTokenizerFast):
    num_added_tokens = tokenizer.add_special_tokens({
        "bos_token": "<s>",
        "eos_token": "</s>",
        "unk_token": "<unk>",
        "pad_token": "<pad>",
    })
    assert num_added_tokens in [0, 1], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present."

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))

print(len(tokenizer), embedding_size)

raw_datasets = load_hf_datasets(data_args)

# Preprocessing the datasets.
if "prompt" in raw_datasets["train"].column_names and "completion" in raw_datasets["train"].column_names:
    encode_function = partial(
        encode_with_prompt_completion_format,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
    )
elif "messages" in raw_datasets["train"].column_names:
    encode_function = partial(
        encode_with_messages_format,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
    )

lm_datasets = raw_datasets.map(
    encode_function,
    batched=False,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=[name for name in raw_datasets["train"].column_names if name not in ["input_ids", "labels", "attention_mask"]],
    desc="Tokenizing and reformatting instruction data",
)

lm_datasets.set_format(type="pt")
lm_datasets = lm_datasets.filter(lambda example: (example['labels'] != -100).any())

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

32001 32000


Tokenizing and reformatting instruction data (num_proc=8): 100%|██████████| 30984/30984 [00:20<00:00, 1515.09 examples/s]
Tokenizing and reformatting instruction data (num_proc=8): 100%|██████████| 1631/1631 [00:01<00:00, 1217.31 examples/s]
Filter: 100%|██████████| 30984/30984 [00:02<00:00, 10556.06 examples/s]
Filter: 100%|██████████| 1631/1631 [00:00<00:00, 10992.61 examples/s]


In [8]:
for name, param in model.named_parameters():
    param.requires_grad = False

In [1]:
import torch
x = torch.ones((1, 2, 3))

In [7]:
x[:, :, torch.tensor([False, True, True])]

tensor([[[1., 1.],
         [1., 1.]]])

In [9]:
for name, param in model.named_parameters():
    name = name.replace('.weight', '')
    if name.find('model.layers.0.self_attn.q_proj') != -1:
    # if name.find('model.layers.0.mlp.up_proj') != -1:
        print(name)
        param.requires_grad_()
        

model.layers.0.self_attn.q_proj
model.layers.0.self_attn.q_proj.quantizer.alpha


In [15]:
name

'lm_head'

In [11]:
param

Parameter containing:
tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       device='cuda:0', dtype=torch.bfloat16)

In [10]:
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"trainable_params: {trainable_params}")

trainable_params: 16904192


In [11]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")
)

In [12]:
train_result = trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzhelninmax[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.7303
2,1.9616


Checkpoint destination directory /home/exp_results/output/instruct/llama7b_test_noise/checkpoint-1 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/exp_results/output/instruct/llama7b_test_noise/checkpoint-2 already exists and is non-empty.Saving will proceed but saved results may be invalid.
