In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from baa import (
    QuantizedLinearLayerWithActivation,
    replace_linear_layer_with_activation,
    register_linear_layer_forward_hook,
    device_map,
    get_hidden_states_input,
    get_weights,
    add_custom_name_to_linear_layers,
    remove_all_hooks,
    chat_with_model,
    print_memory_usage,
    AccuracyBenchmark,
)
from baa.singletons import hidden_states, names
from datasets import load_dataset
import torch
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# model_name = "meta-llama/Llama-3.2-3B-Instruct"
model_name = "HuggingFaceTB/SmolLM-135M"
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [4]:
response = chat_with_model(model, tokenizer, "Hi there how are you?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Generated Output:

Output 1:
Hi there how are you?  I'm not sure if this is the right place to ask this
question, but I'm not sure if it's the right place to ask this question.  I'm
not sure if this is the right place to ask this question.  I'm not sure if this
is the right place to ask this question.  I'm not sure if this is the right
place to ask this question.  I'm not sure if this is the



In [5]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
benchmark = AccuracyBenchmark(model, tokenizer, dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
with torch.inference_mode():
    add_custom_name_to_linear_layers(model)
    # register_linear_layer_forward_hook(model, get_hidden_states_input)
    print("Original model accuracy:", benchmark.evaluate(sample_size=300))
    layers = []
    # add elemnt of names to string if element is not in string "mlp"
    exclude_list = []
    print("exclude_list:", exclude_list)
    print(
        f"hidden_states is empty: {not bool(hidden_states)}"
    )  # empty dicts resolve to False
    replace_linear_layer_with_activation(
        base_model=model,
        quantizer_class=QuantizedLinearLayerWithActivation,
        weight_bits=5,
        activation_bits=16,
        exclude_list=exclude_list,
        quantized=True,
    )
    remove_all_hooks(model)

    torch.cuda.empty_cache()

 91%|█████████ | 41/45 [00:29<00:02,  1.41it/s]


Original model accuracy: 0.47169398907103827
exclude_list: []
hidden_states is empty: True


In [11]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): QuantizedLinearLayerWithActivation()
          (k_proj): QuantizedLinearLayerWithActivation()
          (v_proj): QuantizedLinearLayerWithActivation()
          (o_proj): QuantizedLinearLayerWithActivation()
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): QuantizedLinearLayerWithActivation()
          (up_proj): QuantizedLinearLayerWithActivation()
          (down_proj): QuantizedLinearLayerWithActivation()
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): QuantizedLinearLayerWithActivation()
)

In [12]:
with torch.inference_mode():
    print("Quantized model accuracy:", benchmark.evaluate(sample_size=200))


 85%|████████▍ | 22/26 [00:16<00:02,  1.37it/s]

Quantized model accuracy: 0.44423047296256357





In [9]:
response = chat_with_model(model, tokenizer, "Hi there how are you?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Output:

Output 1:
Hi there how are you? I am 18 years old and I am 18 years old I am 18 years old
I am 18 years old I am 18 years old I am 18 years old I am 18 years old I am 18
years old I am 18 years old I am 18 years old I am 18 years old I am 18



In [10]:
print(model.model.layers[0].self_attn.k_proj.weight.max())
print(model.model.layers[0].self_attn.k_proj.weight.min())

tensor(15, device='mps:0', dtype=torch.int8)
tensor(-16, device='mps:0', dtype=torch.int8)
