In [1]:
import os
import time

os.environ["CUDA_VISIBLE_DEVICES"] = ''

SAVING_DIR='/home/data/taxonomy/'
os.environ["TRANSFORMERS_CACHE"] = SAVING_DIR + "hf_cache/"
os.environ["HF_HOME"] = SAVING_DIR + "hf_cache/"

import torch
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, AutoConfig, PretrainedConfig, AutoTokenizer
from peft import PeftModel 
import math
import re
from operator import attrgetter


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_fp_inds_for_quik(path_to_act_scales, fp_features_num):
    """Pick the indices with the largest scales for every entry of a saved scale dict.

    Args:
        path_to_act_scales: Path of a file readable by ``torch.load`` that holds a
            mapping from name to a tensor of scales (assumed 1-D -- TODO confirm).
        fp_features_num: How many of the largest-scale indices to keep per entry.
            Values below zero are treated as zero.

    Returns:
        Dict with the same keys as the loaded mapping. Each value is an index
        tensor ordered by ascending scale (the largest-scale index comes last);
        it is empty when ``fp_features_num`` is zero.
    """
    act_scales = torch.load(path_to_act_scales)
    keep = max(fp_features_num, 0)

    fp_indices_in_lin_layers = {}
    for name, scales in act_scales.items():
        order = torch.sort(scales)[1]
        # Slice from an explicit start: ``order[-0:]`` is ``order[0:]`` and would
        # return every index instead of none when zero features are requested.
        start = order.shape[0] - min(keep, order.shape[0])
        fp_indices_in_lin_layers[name] = order[start:]
    return fp_indices_in_lin_layers

def extract_pattern(s):
    """Split a ``model.layers.<N>.<rest>`` name into its layer number and remainder.

    Args:
        s: String expected to contain ``model.layers.<digits>.<something>``.

    Returns:
        Tuple ``(layer_number, rest_of_string)``; both elements are strings.

    Raises:
        ValueError: If ``s`` does not contain the expected pattern.
    """
    pattern = r'model\.layers\.(\d+)\.(.+)'

    match = re.search(pattern, s)
    if match is None:
        # Without this check the failure is an opaque AttributeError on ``None``.
        raise ValueError(
            f"Expected a name containing 'model.layers.<N>.<module>', got {s!r}"
        )

    layer_number = match.group(1)
    rest_of_string = match.group(2)
    return layer_number, rest_of_string

def get_fp_llama(path_to_act_scales, fp_features_num):
    """Group the selected feature indices by layer number and module name.

    Entries whose key does not contain ``'layer'`` are skipped.

    Args:
        path_to_act_scales: Forwarded to ``get_fp_inds_for_quik``.
        fp_features_num: Forwarded to ``get_fp_inds_for_quik``.

    Returns:
        Nested dict ``{layer_number (int): {module_name (str): list of indices}}``.
    """
    selected = get_fp_inds_for_quik(path_to_act_scales, fp_features_num)

    grouped = {}
    for name, indices in selected.items():
        if 'layer' in name:
            layer_str, module_name = extract_pattern(name)
            # Create the per-layer dict on first sight, then record this module.
            grouped.setdefault(int(layer_str), {})[module_name] = indices.tolist()

    return grouped


def make_layer_bits(outlier_ids, q=4, k=4, v=4, o=4, down=4, gate=4, up=4):
    """Build a per-layer table of bit widths for the seven projection modules.

    Args:
        outlier_ids: Mapping whose keys are the layer numbers to cover; the
            values are not read.
        q, k, v, o: Values stored for the ``self_attn`` q/k/v/o projections.
        down, gate, up: Values stored for the ``mlp`` down/gate/up projections.

    Returns:
        Dict ``{layer_num: {module_name: bits}}`` with one entry per key of
        ``outlier_ids``.
    """
    return {
        layer_num: {
            'self_attn.q_proj': q,
            'self_attn.k_proj': k,
            'self_attn.v_proj': v,
            'self_attn.o_proj': o,
            'mlp.gate_proj': gate,
            'mlp.up_proj': up,
            'mlp.down_proj': down,
        }
        for layer_num in outlier_ids.keys()
    }

def prepare_llama_ste(path_to_act_scales, fp_features_num, **kwargs):
    """Produce the outlier-index table together with its matching bit-width table.

    Args:
        path_to_act_scales: Forwarded to ``get_fp_llama``.
        fp_features_num: Forwarded to ``get_fp_llama``.
        **kwargs: Forwarded to ``make_layer_bits`` (per-module bit widths).

    Returns:
        Tuple ``(outlier_ids, layer_bits)``.
    """
    ids_by_layer = get_fp_llama(path_to_act_scales, fp_features_num)
    return ids_by_layer, make_layer_bits(ids_by_layer, **kwargs)

def round_pass(x):
    """Round ``x`` elementwise while keeping the gradient of the identity.

    The rounding offset is detached, so the forward value is ``x.round()``
    and the backward pass sees only the trailing ``+ x`` term.
    """
    rounding_offset = (x.round() - x).detach()
    return rounding_offset + x

def quantize(
    X: torch.Tensor = None,
    B: int = 16,
    ) -> torch.Tensor:
    """Fake-quantize ``X`` onto a symmetric ``B``-bit integer grid.

    One scale, ``(X.max() - X.min()) / (thd_pos - thd_neg)``, is used for the
    whole tensor. Values are divided by it, rounded with ``round_pass``,
    clipped to ``[thd_neg, thd_pos]`` and multiplied by the scale again.

    Args:
        X: Tensor to quantize.
        B: Bit width; the integer grid is ``[-(2**(B-1)) + 1, 2**(B-1) - 1]``.

    Returns:
        Tensor shaped like ``X``. An empty tensor, or one whose values are all
        equal, is returned unchanged.

    Raises:
        ValueError: If ``X`` is ``None``.
    """
    if X is None:
        raise ValueError("quantize() requires a tensor for X")
    if X.numel() == 0:
        return X

    thd_neg = -(2 ** (B - 1)) + 1
    thd_pos = 2 ** (B - 1) - 1

    value_range = X.max() - X.min()
    if value_range == 0:
        # A zero range gives scale == 0 and the division below would produce NaN.
        return X

    # Positive denominator, matching quantize_over_blocks. The grid is symmetric,
    # so this yields the same values as dividing by (thd_neg - thd_pos).
    scale = value_range / (thd_pos - thd_neg)
    X = round_pass(X / scale)
    X = torch.clip(X, thd_neg, thd_pos)
    return scale * X

@torch.jit.script
def quantize_over_blocks(
    X: torch.Tensor,
    B: int = 16,
    block_size: int = 4,  # Number of consecutive columns (dim 1) sharing one scale
) -> torch.Tensor:
    """Fake-quantize ``X`` in groups of ``block_size`` columns.

    Every column group gets its own scale,
    ``(block.max() - block.min()) / (thd_pos - thd_neg)``. Values are divided by
    it, rounded with ``round_pass``, clipped to ``[thd_neg, thd_pos]`` and
    rescaled. The last group may be narrower than ``block_size``.

    Args:
        X: Tensor indexed as ``X[:, start:end]``, so it needs at least two dims.
        B: Bit width; the integer grid is ``[-(2**(B-1)) + 1, 2**(B-1) - 1]``.
        block_size: Number of columns per group.

    Returns:
        New tensor shaped like ``X``. A group whose values are all equal is
        copied through unchanged.
    """
    # Number of columns to split into groups
    D = X.shape[1]

    # Quantization thresholds
    thd_neg = -(2 ** (B - 1)) + 1
    thd_pos = 2 ** (B - 1) - 1
    # Output buffer, filled group by group
    X_quantized = torch.zeros_like(X)

    # Ceil division so a narrower trailing group is still processed
    num_blocks = (D + block_size - 1) // block_size

    for i in range(num_blocks):
        start_idx = i * block_size
        end_idx = min((i + 1) * block_size, D)
        block = X[:, start_idx:end_idx]

        value_range = block.max() - block.min()
        # A zero range would give scale == 0 and turn the group into NaN, so use
        # a placeholder scale of 1 there and pass the group through below.
        is_flat = value_range == 0
        scale = torch.where(
            is_flat,
            torch.ones_like(value_range),
            value_range / (thd_pos - thd_neg),
        )
        quantized = scale * torch.clip(round_pass(block / scale), thd_neg, thd_pos)

        X_quantized[:, start_idx:end_idx] = torch.where(is_flat, block, quantized)

    return X_quantized

def quantize_over_blocks_quik(
    X: torch.Tensor,
    B: int = 16,
    block_size: int = 4,
) -> torch.Tensor:
    """Quantize ``X`` in groups of ``block_size`` columns using ``WeightQuantizer``.

    One quantizer is configured with ``bits=B`` and reused for every column
    group: ``find_params`` is called on the group, then ``quantize``.
    NOTE(review): ``WeightQuantizer`` is defined outside this block;
    ``find_params`` presumably fits the quantization parameters to the group --
    confirm against its implementation.

    Args:
        X: Tensor indexed as ``X[:, start:end]``.
        B: Bit width handed to ``WeightQuantizer.configure``.
        block_size: Number of columns per group; the last group may be narrower.

    Returns:
        New tensor shaped like ``X`` holding the quantizer's output per group.
    """
    n_cols = X.shape[1]

    quantizer = WeightQuantizer()
    quantizer.configure(bits=B)

    result = torch.zeros_like(X)

    # Ceil division so a narrower trailing group is still processed
    n_chunks = (n_cols + block_size - 1) // block_size
    for chunk in range(n_chunks):
        lo = chunk * block_size
        hi = min(lo + block_size, n_cols)
        cols = X[:, lo:hi]

        quantizer.find_params(cols)
        result[:, lo:hi] = quantizer.quantize(cols)

    return result

def quantize_with_outliers(X: torch.Tensor,
    B: int = 16,
    block_size: int = 4,
    idx: torch.Tensor = torch.tensor([])):
    """Quantize every column of ``X`` except the ones listed in ``idx``.

    Columns in ``idx`` are copied through untouched; the remaining columns are
    quantized with ``quantize_over_blocks_quik``. With no outlier indices the
    whole tensor goes through ``quantize_over_blocks`` instead.

    Args:
        X: 2-D tensor; columns are selected along dim 1.
        B: Bit width forwarded to the block quantizer.
        block_size: Block size forwarded to the block quantizer.
        idx: Column indices to leave unquantized (list or index tensor).
            ``None`` is treated like an empty selection.

    Returns:
        Tensor shaped like ``X`` with quantized and untouched columns in their
        original positions.
    """
    if idx is None or len(idx) == 0:
        print('Empty outlier idx')
        return quantize_over_blocks(X, B=B, block_size=block_size)

    # Allocate the mask on X's device: a CPU mask cannot be indexed with an
    # index tensor that lives on the GPU next to X.
    mask = torch.ones(X.size(1), dtype=torch.bool, device=X.device)
    mask[idx] = False

    # Split the tensor into the columns to quantize and the ones to keep
    X_quantize = X[:, mask]
    X_no_quantize = X[:, ~mask]

    X_quantized = quantize_over_blocks_quik(X_quantize, B=B, block_size=block_size)

    # Reassemble both parts in their original column positions
    X_result = torch.empty_like(X)
    X_result[:, mask] = X_quantized
    X_result[:, ~mask] = X_no_quantize

    return X_result


In [3]:
from QUIK.experiments.fake_quant.quant import WeightQuantizer

In [4]:
model = torch.load('/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj.pt', map_location='cpu')

In [23]:
init_w = base_model.model.layers[30].self_attn.q_proj.weight.data

In [5]:
outlier_ids, layer_bits = prepare_llama_ste('/home/LLM_Compression/QUIK/experiments/act_scales/Llama-2-7b-hf.pt', fp_features_num=128)

In [27]:
idx = outlier_ids[30]['self_attn.q_proj']
mask = torch.ones(init_w.size(1), dtype=torch.bool)
mask[idx] = False

# Split the tensor into quantize and no_quantize parts
X_quantize = init_w[:, mask]
X_no_quantize = init_w[:, ~mask]

In [28]:
X_no_quantize.size()

torch.Size([4096, 128])

In [29]:
len(torch.unique(X_quantize[:2,:]))

31

In [30]:
len(torch.unique(X_no_quantize[:,0]))

3355

In [80]:
w_copy = torch.clone(init_w)

In [83]:
quantized_with_outliers = quantize_with_outliers(w_copy, block_size=1, B=4, idx=outlier_ids[30]['self_attn.q_proj'])

In [84]:
quantized_with_outliers[torch.isclose(init_w, quantized_with_outliers).sum(dim=0) != 4096][0]

tensor([ 0.0000, -0.0131, -0.0110,  ..., -0.0000, -0.0106, -0.0000],
       dtype=torch.float16)

In [85]:
init_w[torch.isclose(init_w, quantized_with_outliers).sum(dim=0) != 4096][0]

tensor([ 0.0000, -0.0114, -0.0057,  ..., -0.0057, -0.0057, -0.0000],
       dtype=torch.float16)

In [58]:
(torch.isclose(init_w, quantized_with_outliers).sum(dim=0) == 4096).sum()

tensor(128)

In [59]:
(~torch.isclose(init_w, quantized_with_outliers)).sum() / (4096 * 4096)

tensor(0.8291)

In [2]:
checkpoint = '/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj'
peft_model_id = "/home/data/compression/clip_sm_cache/fine_tuning/lora/checkpoint-174"

# Read the Hugging Face access token from the environment instead of committing it.
# The token that used to be hardcoded here has been exposed and should be revoked.
token = os.environ.get("HF_TOKEN")


In [3]:


base_model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            use_auth_token=token)


Loading checkpoint shards: 100%|██████████| 3/3 [01:42<00:00, 34.27s/it]


In [10]:
base_model = base_model.to('cuda')

In [11]:
base_model.train()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

In [4]:
base_model.save_pretrained('./logs/test_save/')

In [17]:
layer_bits

{0: {'self_attn.q_proj': 4,
  'self_attn.k_proj': 4,
  'self_attn.v_proj': 4,
  'self_attn.o_proj': 4,
  'mlp.gate_proj': 4,
  'mlp.up_proj': 4,
  'mlp.down_proj': 4},
 1: {'self_attn.q_proj': 4,
  'self_attn.k_proj': 4,
  'self_attn.v_proj': 4,
  'self_attn.o_proj': 4,
  'mlp.gate_proj': 4,
  'mlp.up_proj': 4,
  'mlp.down_proj': 4},
 2: {'self_attn.q_proj': 4,
  'self_attn.k_proj': 4,
  'self_attn.v_proj': 4,
  'self_attn.o_proj': 4,
  'mlp.gate_proj': 4,
  'mlp.up_proj': 4,
  'mlp.down_proj': 4},
 3: {'self_attn.q_proj': 4,
  'self_attn.k_proj': 4,
  'self_attn.v_proj': 4,
  'self_attn.o_proj': 4,
  'mlp.gate_proj': 4,
  'mlp.up_proj': 4,
  'mlp.down_proj': 4},
 4: {'self_attn.q_proj': 4,
  'self_attn.k_proj': 4,
  'self_attn.v_proj': 4,
  'self_attn.o_proj': 4,
  'mlp.gate_proj': 4,
  'mlp.up_proj': 4,
  'mlp.down_proj': 4},
 5: {'self_attn.q_proj': 4,
  'self_attn.k_proj': 4,
  'self_attn.v_proj': 4,
  'self_attn.o_proj': 4,
  'mlp.gate_proj': 4,
  'mlp.up_proj': 4,
  'mlp.down_pro

In [6]:
model.enable_ste(outlier_ids=outlier_ids, layer_bit=layer_bits)

In [7]:
from transformers import Trainer, TrainingArguments

In [9]:
trainer = Trainer(model=model)

In [11]:
trainer.save_model()

In [34]:
base_model.save_pretrained('./logs/test_save/')

In [13]:
tokens = torch.tensor([[1, 2, 3]]).to('cuda')

In [14]:
out = base_model(tokens, labels=tokens)

In [15]:
out

CausalLMOutputWithPast(loss=tensor(10.5419, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[-4.6483, -2.5449,  1.6972,  ..., -3.2146, -4.0683, -2.8675],
         [-0.9074, 15.8066,  2.2049,  ...,  1.1661,  0.2635,  0.3063],
         [-9.6864, -4.7050,  0.4944,  ..., -2.1131, -5.0413, -3.2411]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.4536, -0.0150,  0.0482,  ...,  0.0504, -0.0268, -0.1293],
          [-0.7511,  0.3585, -0.3929,  ..., -0.2217,  0.1841, -0.0919],
          [ 0.1734, -0.1501, -0.2456,  ...,  0.7295, -0.2216,  1.1152]],

         [[ 1.3434,  1.0816, -0.4259,  ...,  0.4787, -0.3052,  0.4842],
          [-0.3260,  0.0481, -1.2391,  ..., -0.3518, -0.1048, -0.2449],
          [-0.2876,  0.1611, -0.0355,  ..., -0.5685,  0.8668, -0.5262]],

         [[ 0.0021, -0.2538, -0.4174,  ..., -0.1237,  0.4983,  0.7593],
          [-0.2172,  0.0547,  0.2124,  ...,  1.0920,  1.3323,  1.3300],
          [ 0.7287,  0.7032,  0.12

In [45]:
lin = torch.nn.Linear(3, 5)

In [12]:
for layer_name in outlier_ids[0].keys():
    print(attrgetter(layer_name)(base_model.model.layers[0]))

Linear(in_features=4096, out_features=4096, bias=False)
Linear(in_features=4096, out_features=4096, bias=False)
Linear(in_features=4096, out_features=4096, bias=False)
Linear(in_features=4096, out_features=4096, bias=False)
Linear(in_features=4096, out_features=11008, bias=False)
Linear(in_features=4096, out_features=11008, bias=False)
Linear(in_features=11008, out_features=4096, bias=False)


In [13]:
attrgetter(layer_name)(base_model.model.layers[0])

Linear(in_features=11008, out_features=4096, bias=False)

In [29]:
outlier_ids[0].keys()

dict_keys(['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj', 'mlp.gate_proj', 'mlp.up_proj', 'mlp.down_proj'])

In [47]:
def round_pass(x):
    """Straight-through rounding: forward value is ``x.round()``, gradient is identity.

    The difference between the rounded and raw value is detached, so only the
    plain ``x`` term takes part in backpropagation.
    """
    return x + (x.round() - x).detach()

def quantize(
    X: torch.Tensor = None,
    B: int = 16,
    ) -> torch.Tensor:
    """Fake-quantize ``X`` onto a symmetric ``B``-bit integer grid.

    One scale, ``(X.max() - X.min()) / (thd_pos - thd_neg)``, is used for the
    whole tensor. Values are divided by it, rounded with ``round_pass``,
    clipped to ``[thd_neg, thd_pos]`` and multiplied by the scale again.

    Args:
        X: Tensor to quantize.
        B: Bit width; the integer grid is ``[-(2**(B-1)) + 1, 2**(B-1) - 1]``.

    Returns:
        Tensor shaped like ``X``. An empty tensor, or one whose values are all
        equal, is returned unchanged.

    Raises:
        ValueError: If ``X`` is ``None``.
    """
    if X is None:
        raise ValueError("quantize() requires a tensor for X")
    if X.numel() == 0:
        return X

    thd_neg = -(2 ** (B - 1)) + 1
    thd_pos = 2 ** (B - 1) - 1

    value_range = X.max() - X.min()
    if value_range == 0:
        # A zero range gives scale == 0 and the division below would produce NaN.
        return X

    # Positive denominator, matching quantize_over_blocks. The grid is symmetric,
    # so this yields the same values as dividing by (thd_neg - thd_pos).
    scale = value_range / (thd_pos - thd_neg)
    X = round_pass(X / scale)
    X = torch.clip(X, thd_neg, thd_pos)
    return scale * X

@torch.jit.script
def quantize_over_blocks(
    X: torch.Tensor,
    B: int = 16,
    block_size: int = 4,  # Number of consecutive rows (dim 0) sharing one scale
) -> torch.Tensor:
    """Fake-quantize ``X`` in groups of ``block_size`` rows.

    Every row group gets its own scale,
    ``(block.max() - block.min()) / (thd_pos - thd_neg)``. Values are divided by
    it, rounded with ``round_pass``, clipped to ``[thd_neg, thd_pos]`` and
    rescaled. The last group may be shorter than ``block_size``.

    Args:
        X: Tensor sliced along its first dimension.
        B: Bit width; the integer grid is ``[-(2**(B-1)) + 1, 2**(B-1) - 1]``.
        block_size: Number of rows per group.

    Returns:
        New tensor shaped like ``X``. A group whose values are all equal is
        copied through unchanged.
    """
    # Number of rows to split into groups
    D = X.shape[0]

    # Quantization thresholds
    thd_neg = -(2 ** (B - 1)) + 1
    thd_pos = 2 ** (B - 1) - 1
    # Output buffer, filled group by group
    X_quantized = torch.zeros_like(X)

    # Ceil division so a shorter trailing group is still processed
    num_blocks = (D + block_size - 1) // block_size

    for i in range(num_blocks):
        start_idx = i * block_size
        end_idx = min((i + 1) * block_size, D)
        block = X[start_idx:end_idx]

        value_range = block.max() - block.min()
        # A zero range would give scale == 0 and turn the group into NaN, so use
        # a placeholder scale of 1 there and pass the group through below.
        is_flat = value_range == 0
        scale = torch.where(
            is_flat,
            torch.ones_like(value_range),
            value_range / (thd_pos - thd_neg),
        )
        quantized = scale * torch.clip(round_pass(block / scale), thd_neg, thd_pos)

        X_quantized[start_idx:end_idx] = torch.where(is_flat, block, quantized)

    return X_quantized

def quantize_with_outliers(X: torch.Tensor,
    B: int = 16,
    block_size: int = 4,
    idx: torch.Tensor = torch.tensor([])):
    """Quantize every column of ``X`` except the ones listed in ``idx``.

    Columns in ``idx`` are copied through untouched; the remaining columns are
    quantized with ``quantize_over_blocks``. With no outlier indices the whole
    tensor is quantized.

    Args:
        X: 2-D tensor; columns are selected along dim 1.
        B: Bit width forwarded to ``quantize_over_blocks``.
        block_size: Block size forwarded to ``quantize_over_blocks``.
        idx: Column indices to leave unquantized (list or index tensor).
            ``None`` is treated like an empty selection.

    Returns:
        Tensor shaped like ``X`` with quantized and untouched columns in their
        original positions.
    """
    if idx is None or len(idx) == 0:
        print('Empty outlier idx')
        return quantize_over_blocks(X, B=B, block_size=block_size)

    # Allocate the mask on X's device: a CPU mask cannot be indexed with an
    # index tensor that lives on the GPU next to X.
    mask = torch.ones(X.size(1), dtype=torch.bool, device=X.device)
    mask[idx] = False

    # Split the tensor into the columns to quantize and the ones to keep
    X_quantize = X[:, mask]
    X_no_quantize = X[:, ~mask]

    # Honour the caller's settings; these were previously hardcoded to
    # B=4, block_size=16, which silently ignored both arguments.
    X_quantized = quantize_over_blocks(X_quantize, B=B, block_size=block_size)

    # Reassemble both parts in their original column positions
    X_result = torch.empty_like(X)
    X_result[:, mask] = X_quantized
    X_result[:, ~mask] = X_no_quantize

    return X_result

In [50]:
idx = outlier_ids[0]['mlp.down_proj']


In [51]:
len(idx)

128

In [52]:
torch.manual_seed(42)

lin = torch.nn.Linear(11008, 4096).to('cuda')
x = torch.rand(128, 11008).to('cuda')

In [53]:
%%time
lin.weight.data = quantize_with_outliers(lin.weight.data, B=4, block_size=64, idx=idx)

CPU times: user 53 ms, sys: 1.39 ms, total: 54.3 ms
Wall time: 53.4 ms


In [163]:
out = lin(x)
out.sum().backward()

In [157]:
lin.weight.grad

tensor([[64.6666, 62.0538, 63.0199,  ..., 66.7285, 65.0555, 56.5876],
        [64.6666, 62.0538, 63.0199,  ..., 66.7285, 65.0555, 56.5876],
        [64.6666, 62.0538, 63.0199,  ..., 66.7285, 65.0555, 56.5876],
        ...,
        [64.6666, 62.0538, 63.0199,  ..., 66.7285, 65.0555, 56.5876],
        [64.6666, 62.0538, 63.0199,  ..., 66.7285, 65.0555, 56.5876],
        [64.6666, 62.0538, 63.0199,  ..., 66.7285, 65.0555, 56.5876]])

In [54]:
lin.weight.grad

tensor([[1.7605, 0.6532, 1.7900, 1.1379, 0.6800],
        [1.7605, 0.6532, 1.7900, 1.1379, 0.6800],
        [1.7605, 0.6532, 1.7900, 1.1379, 0.6800],
        [1.7605, 0.6532, 1.7900, 1.1379, 0.6800],
        [1.7605, 0.6532, 1.7900, 1.1379, 0.6800]])

In [7]:
checkpoint = 'meta-llama/Llama-2-7b-hf'

tok = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
tok.save_pretrained('/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj')

('/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj/tokenizer_config.json',
 '/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj/special_tokens_map.json',
 '/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj/tokenizer.model',
 '/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj/added_tokens.json',
 '/home/data/compression/quik_cache/llama7b_4w_16a_128fp_true_2b-k_proj/tokenizer.json')

In [7]:
tokens = torch.tensor([[0,1,2]])

In [6]:
base_model.config.__dict__['clip_softmax_gamma']

0

In [18]:
for layer in base_model.model.layers:
    layer.self_attn.enable_clip = False

In [9]:
out_noclip = base_model(tokens)

In [12]:
for layer in base_model.model.layers:
    layer.self_attn.enable_clip = True

In [13]:
out_clip = base_model(tokens)

In [17]:
(out_noclip['logits'] != out_clip['logits']).sum()

tensor(0)

In [22]:
for layer in base_model.model.layers:
    print(layer.self_attn.clip_softmax_eta != 1)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [3]:

@torch.no_grad()
def create_mask(weight, outlier_fraction):
    """Build a boolean mask that is False for the most extreme values of ``weight``.

    Two thresholds are taken from the sorted values with ``torch.kthvalue``: the
    element at rank ``numel * outlier_fraction / 2`` and the one at rank
    ``numel * (1 - outlier_fraction / 2)``. Entries strictly below the first or
    strictly above the second are treated as outliers.

    Args:
        weight: Tensor to inspect; it is not modified.
        outlier_fraction: Target share of entries to mark as outliers, split
            evenly between the low and the high tail.

    Returns:
        Bool tensor shaped like ``weight``: True for regular entries, False for
        outliers. All True when ``weight`` is empty.
    """
    # reshape (rather than view) also accepts non-contiguous weights; the clone
    # keeps the selection working on a private copy of the values.
    w_flat = weight.reshape(-1).clone()
    total = w_flat.numel()
    if total == 0:
        return torch.ones_like(weight, dtype=torch.bool)

    # kthvalue needs 1 <= k <= numel; a tiny or zero fraction would round to 0.
    k_low = min(max(int(total * outlier_fraction / 2), 1), total)
    k_high = min(max(int(total * (1 - outlier_fraction / 2)), 1), total)

    lower_threshold = torch.kthvalue(w_flat, k_low)[0]
    upper_threshold = torch.kthvalue(w_flat, k_high)[0]

    outliers = (weight < lower_threshold) | (weight > upper_threshold)

    return ~outliers


In [4]:
outlier_fraction = 0.05 
weight = base_model.model.layers[0].self_attn.q_proj.weight.data 

mask = create_mask(weight, outlier_fraction)

In [5]:
for name, param in base_model.named_parameters():
    if 'layers' in name:
        print(name)
        mask = create_mask(param.data, outlier_fraction)
        break

model.layers.0.self_attn.q_proj.weight


In [9]:
param.data *

tensor(0.0498)

In [6]:
param

Parameter containing:
tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [-0.0146,  0.0126,  0.0005,  ...,  0.0063,  0.0188, -0.0031],
        ...,
        [ 0.0013,  0.0109, -0.0003,  ...,  0.0098, -0.0298,  0.0097],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       requires_grad=True)

In [None]:

model = PeftModel.from_pretrained(base_model, peft_model_id) 
merged_model = model.merge_and_unload()

In [4]:
merged_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

In [5]:
merged_model.save_pretrained('../data/compression/clip_sm_cache/fine_tuning/lora/ckpt1_sm_gamma-2e-2', from_pt=True)

In [8]:
del base_model

In [7]:
del merged_model

In [9]:

model = LlamaForCausalLM.from_pretrained(
        '../data/compression/clip_sm_cache/fine_tuning/lora/ckpt1_sm_gamma-2e-2',
        device_map="cpu",
    )

Loading checkpoint shards: 100%|██████████| 6/6 [00:05<00:00,  1.04it/s]


In [10]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

: 

In [5]:
tokens

tensor([[1, 2]])

In [6]:
out = model(tokens)

In [7]:
out

CausalLMOutputWithPast(loss=None, logits=tensor([[[-12.9832,  -7.4134,  -0.4328,  ...,  -6.8297,  -8.0879,  -7.5863],
         [  1.7376,  23.8052,   1.6609,  ...,   0.8430,   1.0285,   0.7649]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.4510, -0.0164,  0.0498,  ...,  0.0514, -0.0235, -0.1267],
          [-0.7527,  0.3631, -0.3988,  ..., -0.2205,  0.1796, -0.0891]],

         [[ 1.3476,  1.0810, -0.4292,  ...,  0.4777, -0.3054,  0.4798],
          [-0.3353,  0.0538, -1.2545,  ..., -0.3505, -0.1085, -0.2424]],

         [[ 0.0030, -0.2562, -0.4154,  ..., -0.1226,  0.4979,  0.7620],
          [-0.2193,  0.0564,  0.2123,  ...,  1.0906,  1.3349,  1.3314]],

         ...,

         [[ 0.0239, -0.0053,  0.0390,  ...,  0.4867,  0.9620, -0.4022],
          [ 0.0319,  0.0271,  0.0163,  ...,  0.4254,  0.4127, -0.1228]],

         [[ 0.1723, -0.5391, -0.3137,  ..., -0.2754,  0.1376,  0.1413],
          [-0.0779,  0.0152, -0.2598,  ..., -0.5077, -0.0017, -0.0434]],

 

In [8]:
model.model.layers[0].self_attn.clip_softmax_eta

1

In [4]:
model.set_clipped_sm(gamma=-5, eta=0.5)

In [5]:
tokens = torch.tensor([[1, 2]])
out = model(tokens)
out

CausalLMOutputWithPast(loss=None, logits=tensor([[[-9.9210, -7.2828,  3.1770,  ..., -2.9533, -5.8889, -3.5037],
         [ 6.9529, 29.6448,  1.4662,  ...,  3.5922,  2.7581,  3.2546]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.4510, -0.0164,  0.0498,  ...,  0.0514, -0.0235, -0.1267],
          [-0.7527,  0.3631, -0.3988,  ..., -0.2205,  0.1796, -0.0891]],

         [[ 1.3476,  1.0810, -0.4292,  ...,  0.4777, -0.3054,  0.4798],
          [-0.3353,  0.0538, -1.2545,  ..., -0.3505, -0.1085, -0.2424]],

         [[ 0.0030, -0.2562, -0.4154,  ..., -0.1226,  0.4979,  0.7620],
          [-0.2193,  0.0564,  0.2123,  ...,  1.0906,  1.3349,  1.3314]],

         ...,

         [[ 0.0239, -0.0053,  0.0390,  ...,  0.4867,  0.9620, -0.4022],
          [ 0.0319,  0.0271,  0.0163,  ...,  0.4254,  0.4127, -0.1228]],

         [[ 0.1723, -0.5391, -0.3137,  ..., -0.2754,  0.1376,  0.1413],
          [-0.0779,  0.0152, -0.2598,  ..., -0.5077, -0.0017, -0.0434]],

         [[-0

In [4]:
config = AutoConfig.from_pretrained(checkpoint)

In [10]:
config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.37.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
-12 / 512

-0.0234375

In [6]:
config.clip_softmax_gamma = -12 / 512

config.clip_softmax_eta = 1.003

In [12]:
model.model.layers[0].self_attn.clip_softmax_gamma

0

In [7]:
config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "clip_softmax_eta": 1.003,
  "clip_softmax_gamma": -0.0234375,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.37.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [15]:
model_c = LlamaForCausalLM.from_pretrained(
        checkpoint,
        device_map="cpu",
        use_auth_token=token,
        config=config
    )



OSError: Incorrect path_or_model_id: 'LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "clip_softmax_eta": 1.003,
  "clip_softmax_gamma": -0.0234375,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.37.0",
  "use_cache": true,
  "vocab_size": 32000
}
'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [8]:
model_clipped = LlamaForCausalLM(config)

ValueError: Parameter config in `LlamaForCausalLM(config)` should be an instance of class `PretrainedConfig`. To create a model from a pretrained model use `model = LlamaForCausalLM.from_pretrained(PRETRAINED_MODEL_NAME)`