<a href="https://colab.research.google.com/github/RoboMaroof/LLM-Applications-Building-Blocks/blob/main/03_Quantization/03_AWQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resources

https://huggingface.co/docs/transformers/main/en/quantization/awq

https://towardsdatascience.com/the-ultimate-handbook-for-llm-quantization-88bb7cb0d9d7#d7f1

# Installs and Imports

In [1]:
!pip install autoawq transformers accelerate

Collecting autoawq
  Downloading autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl.metadata (18 kB)
Collecting datasets (from autoawq)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting zstandard (from autoawq)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting autoawq-kernels (from autoawq)
  Downloading autoawq_kernels-0.0.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1->autoawq)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1->autoawq)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1->autoawq)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting n

In [2]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc
from google.colab import userdata

# Comparison functions

## Model size

In [3]:
def get_model_size(model):
    return sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2

## Inference test

In [4]:
def inference_test(model):
    # Performance comparison (inference speed)
    input_text = "Once upon a time"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    print("\nInference speed test:")
    with torch.no_grad():
        # Warm-up run
        _ = model.generate(input_ids, max_new_tokens=50)

        # Timed run
        start_time = torch.cuda.Event(enable_timing=True)
        end_time = torch.cuda.Event(enable_timing=True)

        start_time.record()
        output = model.generate(input_ids, max_new_tokens=50)
        end_time.record()

        torch.cuda.synchronize()
        print(f"Generation time: {start_time.elapsed_time(end_time):.2f} ms")

    print("\nGenerated text:")
    print(tokenizer.decode(output[0], skip_special_tokens=True))

# Model

In [5]:
model_name = "google/gemma-2b-it"

## Load Tokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=userdata.get('HF_TOKEN')
)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

# Full precision model

## Load Model

In [7]:
full_precision_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    token=userdata.get('HF_TOKEN')
)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## Model size

In [8]:
print(f"Full precision model size: {get_model_size(full_precision_model):.2f} MB")

Full precision model size: 9560.29 MB


## Inference speed

In [9]:
inference_test(full_precision_model)


Inference speed test:
Generation time: 2152.39 ms

Generated text:
Once upon a time, in a quaint village nestled amidst rolling hills, lived a young woman named Elara. With eyes as bright as the morning sun and a smile that could melt the iciest of winter days, Elara possessed a heart that was as warm as the


# Clear Memory

In [10]:
del full_precision_model
gc.collect()
torch.cuda.empty_cache()

# AWQ Model

## Quantization config

In [11]:
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4
}

## Load Model

In [12]:
model_AWQ_int4 = AutoAWQForCausalLM.from_pretrained(
    model_name
)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

gemma-2b-it.gguf:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Quantize

In [13]:
model_AWQ_int4.quantize(tokenizer, quant_config=quant_config)

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214670 [00:00<?, ? examples/s]

AWQ: 100%|██████████| 18/18 [17:50<00:00, 59.45s/it]


## Model size

In [14]:
print(f"Quantized model size: {get_model_size(model_AWQ_int4):.2f} MB")

Quantized model size: 1000.14 MB


## Inference speed

In [16]:
import time
def inference_test_AWQ(model, num_runs=10):
    input_text = "Once upon a time"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to the selected device

    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Warm-up runs (to stabilize GPU performance)
    for _ in range(2):
        _ = model.generate(**inputs, max_length=50)

    # Measure inference speed
    start_time = time.time()
    for _ in range(num_runs):
        _ = model.generate(**inputs, max_length=50)
    total_time = time.time() - start_time

    avg_time_per_run = total_time / num_runs
    print(f"Average inference time: {avg_time_per_run:.4f} seconds per run over {num_runs} runs on {device}")

In [17]:
inference_test_AWQ(model_AWQ_int4)

Average inference time: 1.4997 seconds per run over 10 runs on cuda


# Save quantized model locally

In [25]:
from transformers import AutoConfig
import os

save_directory = "./awq_quantized_model"
os.makedirs(save_directory, exist_ok=True)

# Save the model
model_path = os.path.join(save_directory, "pytorch_model.bin")
torch.save(model_AWQ_int4.state_dict(), model_path)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

# Save the config
config = AutoConfig.from_pretrained(model_name)
config.save_pretrained(save_directory)

In [None]:
model = AutoModelForCausalLM.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Define the repository name (it should be unique and follow the format 'username/repo_name')
repo_name = "Maroof-Mohammed/gemma-2b-GPTQ"

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True, token=userdata.get('HF_TOKEN_WRITE'))
tokenizer.push_to_hub(repo_name, check_pr=True, token=userdata.get('HF_TOKEN_WRITE'))