In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from baa import (
    QuantizedLinearLayerWithActivation,
    replace_linear_layer_with_activation,
    register_linear_layer_forward_hook,
    device_map,
    get_hidden_states_input,
    add_custom_name_to_linear_layers,
    remove_all_hooks,
    chat_with_model,
    print_memory_usage,
    AccuracyBenchmark,
)
from baa.singletons import hidden_states
from datasets import load_dataset
import torch
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# The call's return value is what the cell displays as its output.
# NOTE(review): presumably this supplies the Hugging Face access token needed
# to download the model used below — confirm.
load_dotenv()

True

In [2]:
# Checkpoint under test; swap in the commented alternative for a smaller model.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# model_name = "HuggingFaceTB/SmolLM-135M"

# Tokenizer and model are loaded from the same checkpoint name; the model is
# placed according to the `device_map` value imported from `baa`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map)

In [3]:
# Qualitative check: generate from a fixed prompt with the freshly loaded model.
# NOTE(review): the warnings emitted below mention a missing attention mask /
# pad token id — presumably raised inside chat_with_model's generate call; confirm.
response = chat_with_model(model, tokenizer, "Hi there how are you?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Generated Output:

Output 1:
Hi there how are you? I am a new student in a university and I am having a lot
of questions about the university and its resources. I am trying to get a better
understanding of what I need to do to get started.  I would like to know about
the different types of student loans available to me and the process of applying
for them. I am a first-time student and I am not sure what I need to do to get
approved for a student loan.  I am in the United



In [4]:
# WikiText-2 (raw variant), test split only.
dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="test",
)

# The benchmark object is bound to the current model, tokenizer and dataset and
# is reused for both the pre- and post-quantization evaluations.
benchmark = AccuracyBenchmark(model, tokenizer, dataset)

In [5]:
# Evaluate the model, replace its linear layers, then evaluate again.
# NOTE(review): the replacement call's return value is unused and `model` itself
# is what gets evaluated afterwards, so re-running this cell without reloading
# the model would report the "Original" accuracy on already-replaced layers.
with torch.inference_mode():
    # Hook registration is currently disabled:
    # register_linear_layer_forward_hook(model, get_hidden_states_input)
    baseline_accuracy = benchmark.evaluate(sample_size=200)
    print("Original model accuracy:", baseline_accuracy)
    # print(hidden_states)

    add_custom_name_to_linear_layers(model)
    # NOTE(review): presumably empty because the hook above is commented out — confirm.
    print(f"hidden_states is empty: {not bool(hidden_states)}") # empty dicts resolve to False

    # Replacement settings: 8-bit weights, 4-bit activations, nothing excluded.
    quantization_options = dict(
        quantizer_class=QuantizedLinearLayerWithActivation,
        weight_bits=8,
        activation_bits=4,
        exclude_list=[],
        quantized=True,
    )
    replace_linear_layer_with_activation(base_model=model, **quantization_options)
    remove_all_hooks(model)

    # Release cached CUDA memory before the second evaluation pass.
    torch.cuda.empty_cache()
    quantized_accuracy = benchmark.evaluate(sample_size=200)
    print("Quantized model accuracy:", quantized_accuracy)


  0%|                                                | 0/24 [00:00<?, ?it/s]

 83%|████████████████████████████████▌      | 20/24 [00:51<00:10,  2.59s/it]


Original model accuracy: 0.5021065675340768
hidden_states is empty: True


 83%|████████████████████████████████▌      | 20/24 [00:52<00:10,  2.64s/it]

Quantized model accuracy: 0.5018587360594795





In [6]:
# Same fixed prompt as the earlier chat cell, run again after the layer
# replacement so the two generations can be compared by eye.
response = chat_with_model(model, tokenizer, "Hi there how are you?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Output:

Output 1:
Hi there how are you? I am a new student and I am excited to start this new
chapter of my life. I am eager to learn and make new friends. I am a bit nervous
about making new friends, but I know that it's an important part of this new
chapter.  I am a bit worried about not knowing anyone in the class, and I don't
want to feel like I'm the only one who is feeling lonely. But I also know that I
have to be strong



In [7]:
# Show the largest and smallest weight value of the first layer's key projection.
# NOTE(review): with weight_bits=8 a symmetric signed range would be [-127, 127];
# presumably that is what this cell is checking — confirm.
k_proj_weight = model.model.layers[0].self_attn.k_proj.weight
for extreme in (k_proj_weight.max(), k_proj_weight.min()):
    print(extreme)

tensor(127., device='cuda:0')
tensor(-127., device='cuda:0')
