<a href="https://colab.research.google.com/github/RoboMaroof/LLM-Applications-Building-Blocks/blob/main/03_Quantization/06_HQQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resources

https://towardsdatascience.com/the-ultimate-handbook-for-llm-quantization-88bb7cb0d9d7#056e

https://huggingface.co/docs/transformers/main/en/quantization/hqq

https://github.com/mobiusml/hqq

# Installs and Imports

In [None]:
!pip3 install -U transformers accelerate
!pip install hqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m


In [None]:
import gc
import time

import torch
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, HqqConfig

# Comparison functions

## Model size

In [None]:
def get_model_size(model, include_buffers=False):
    """Return the storage size of a model's tensors in MiB (bytes / 1024**2).

    Args:
        model: object exposing ``parameters()`` (and ``buffers()`` when
            ``include_buffers`` is true), e.g. a ``torch.nn.Module``.
        include_buffers: when True, tensors returned by ``model.buffers()``
            are added to the total. Defaults to False, which counts
            parameters only.

    Returns:
        float: sum of ``numel() * element_size()`` over the selected tensors,
        divided by 1024**2.
    """
    tensors = list(model.parameters())
    if include_buffers:
        # Buffers are not parameters, so they are only counted on request.
        tensors.extend(model.buffers())
    total_bytes = sum(t.numel() * t.element_size() for t in tensors)
    return total_bytes / 1024**2

## Inference test

In [None]:
def inference_test(model, tokenizer=None, prompt="Once upon a time", max_new_tokens=50):
    """Time a single ``generate`` call on ``model`` and print the decoded text.

    One untimed warm-up generation runs first, then one timed generation.
    When the model sits on a CUDA device the timing uses CUDA events;
    otherwise it falls back to ``time.perf_counter`` so the check also runs
    on CPU-only runtimes.

    Args:
        model: object exposing ``.device`` and
            ``.generate(input_ids, max_new_tokens=...)``.
        tokenizer: callable returning an object with ``.input_ids`` and
            offering ``.decode``. When None, the notebook-level ``tokenizer``
            global is used.
        prompt: text to generate from.
        max_new_tokens: number of new tokens requested in both the warm-up
            and the timed run.

    Returns:
        None. The generation time (ms) and generated text are printed.

    Raises:
        NameError: if ``tokenizer`` is None and no global ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Backward-compatible default: use the tokenizer loaded elsewhere in the notebook.
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "name 'tokenizer' is not defined; load it first or pass tokenizer=..."
            ) from None

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    on_cuda = str(model.device).startswith("cuda")

    print("\nInference speed test:")
    with torch.no_grad():
        # Warm-up run, excluded from the measurement
        _ = model.generate(input_ids, max_new_tokens=max_new_tokens)

        # Timed run
        if on_cuda:
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)

            start_event.record()
            output = model.generate(input_ids, max_new_tokens=max_new_tokens)
            end_event.record()

            # Both events must have completed before elapsed_time can be read.
            torch.cuda.synchronize()
            elapsed_ms = start_event.elapsed_time(end_event)
        else:
            # CUDA events cannot be used without a CUDA device; use wall-clock time.
            started = time.perf_counter()
            output = model.generate(input_ids, max_new_tokens=max_new_tokens)
            elapsed_ms = (time.perf_counter() - started) * 1000.0

        print(f"Generation time: {elapsed_ms:.2f} ms")

    print("\nGenerated text:")
    print(tokenizer.decode(output[0], skip_special_tokens=True))

# Model

In [None]:
# Hugging Face Hub model id; reused below for the tokenizer, the full-precision
# load and the HQQ-quantized load.
model_name = "google/gemma-2b-it"

## Load Tokenizer

In [None]:
# Tokenizer for `model_name`; the Hub access token is read from the Colab
# secret named HF_TOKEN.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
    token=userdata.get('HF_TOKEN'),
)

# Full precision model

## Load Model

In [None]:
# Baseline model: no dtype or quantization arguments are passed here, and
# device placement is delegated via device_map="auto".
full_precision_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    token=userdata.get('HF_TOKEN'),
    device_map="auto",
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Model size

In [None]:
# Baseline size. get_model_size divides bytes by 1024**2, so the figure
# printed with the "MB" label is in MiB.
print(f"Full precision model size: {get_model_size(full_precision_model):.2f} MB")

Full precision model size: 9560.29 MB


## Inference speed

In [None]:
# Baseline generation time and sample text for the unquantized model.
inference_test(full_precision_model)


Inference speed test:
Generation time: 2149.35 ms

Generated text:
Once upon a time, in a quaint village nestled amidst rolling hills, lived a young woman named Elara. With eyes as bright as the morning sun and a smile that could melt the iciest of winter days, Elara possessed a heart that was as warm as the


# Clear Memory

In [None]:
# Drop the baseline model before loading the quantized one. pop() with a
# default is used instead of `del` so re-running this cell does not raise
# NameError once the name is already gone.
globals().pop("full_precision_model", None)
# Collect the now-unreferenced objects, then ask PyTorch to release its cached
# CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# HQQ Model

## Quantization config

In [None]:
# AutoModelForCausalLM and HqqConfig are already imported in the imports cell
# at the top of the notebook, so they are not re-imported here.

# All linear layers will use the same quantization config
quant_config = HqqConfig(nbits=4, group_size=64, quant_zero=False, quant_scale=False, axis=1)

# Load and quantize. The Hub token is passed explicitly, matching the
# tokenizer and full-precision loads of the same model.
model_HQQ_int4 = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
    token=userdata.get('HF_TOKEN'),
)

## Model size

In [None]:
# Size of the HQQ model, measured the same way as the baseline.
# NOTE(review): get_model_size only sums model.parameters(); confirm that HQQ
# keeps all quantized weights and metadata as parameters, otherwise this
# figure undercounts.
print(f"Quantized model size: {get_model_size(model_HQQ_int4):.2f} MB")

## Inference speed

In [None]:
# Same speed/generation check as the baseline, now on the 4-bit HQQ model.
inference_test(model_HQQ_int4)

# Save quantized model locally