<a href="https://colab.research.google.com/github/RoboMaroof/LLM-Applications-Building-Blocks/blob/main/03_Quantization/02_GPTQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resources

https://towardsdatascience.com/the-ultimate-handbook-for-llm-quantization-88bb7cb0d9d7#d2c7

# Installs and Imports

In [None]:
!pip install auto_gptq
!pip install optimum
!pip install -U accelerate bitsandbytes datasets peft transformers

Collecting auto_gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting datasets (from auto_gptq)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rouge (from auto_gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto_gptq)
  Downloading gekko-1.2.1-py3-none-any.whl.metadata (3.0 kB)
Collecting peft>=0.5.0 (from auto_gptq)
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->auto_gptq)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->auto_gptq)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->auto_gptq)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-ma

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    GPTQConfig,
)
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import torch
import gc
from google.colab import userdata



# Comparison functions

## Model size

In [None]:
def get_model_size(model, include_buffers=True):
    """Return the size of the tensors held by ``model`` in MiB (bytes / 1024**2).

    Args:
        model: Object exposing ``parameters()`` and ``buffers()`` iterators whose
            items provide ``numel()`` and ``element_size()`` (e.g. a
            ``torch.nn.Module``).
        include_buffers: When True (default) registered buffers are counted in
            addition to parameters. Quantized linear layers commonly keep their
            packed weights in buffers rather than parameters, so counting
            parameters alone can badly under-report a quantized model.
            Pass False to get the parameters-only figure.

    Returns:
        float: Total tensor size in MiB.
    """
    tensors = list(model.parameters())
    if include_buffers:
        tensors.extend(model.buffers())
    total_bytes = sum(t.numel() * t.element_size() for t in tensors)
    return total_bytes / 1024**2

## Inference test

In [None]:
def inference_test(model, prompt="Once upon a time", max_new_tokens=50):
    """Print how long one ``generate`` call takes and the text it produced.

    The prompt is tokenized with the notebook-level ``tokenizer``, which must
    already be defined when this function is called. One untimed warm-up
    generation is run first, followed by the timed generation.

    Args:
        model: Model exposing ``device`` and ``generate``.
        prompt: Text to continue.
        max_new_tokens: Number of new tokens requested from ``generate``.

    Returns:
        None. Results are printed.
    """
    import time  # local import: only needed for the non-CUDA timing path

    # Forward the full tokenizer output (ids plus attention mask) to generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # CUDA events only make sense when the model actually lives on a GPU;
    # otherwise fall back to a wall-clock timer instead of crashing.
    on_cuda = torch.cuda.is_available() and torch.device(model.device).type == "cuda"

    print("\nInference speed test:")
    with torch.no_grad():
        # Warm-up run so one-time initialisation is not included in the timing
        _ = model.generate(**inputs, max_new_tokens=max_new_tokens)

        if on_cuda:
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)

            start_event.record()
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)
            end_event.record()

            # elapsed_time is only valid once both events have completed
            torch.cuda.synchronize()
            elapsed_ms = start_event.elapsed_time(end_event)
        else:
            started = time.perf_counter()
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)
            elapsed_ms = (time.perf_counter() - started) * 1000.0

        print(f"Generation time: {elapsed_ms:.2f} ms")

    print("\nGenerated text:")
    print(tokenizer.decode(output[0], skip_special_tokens=True))

# Model

In [None]:
# Hugging Face Hub id of the base checkpoint; every from_pretrained call in
# this notebook loads from this id.
model_name = "google/gemma-2b-it"

## Load Tokenizer

In [None]:
# Load the tokenizer that belongs to `model_name`. The Hub access token is
# fetched from the Colab secret named HF_TOKEN instead of being hard-coded.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=userdata.get('HF_TOKEN')
)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

# Full precision model

## Load Model

In [None]:
# Load the unquantized baseline. No dtype or quantization_config is passed, so
# the library defaults apply; device_map="auto" leaves device placement to the
# library.
full_precision_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    token=userdata.get('HF_TOKEN')
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## Model size

In [None]:
# This cell measures the unquantized baseline, so the label must say so.
print(f"Full precision model size: {get_model_size(full_precision_model):.2f} MB")

Quantized model size: 9560.29 MB


## Inference speed

In [None]:
# Baseline latency and sample output from the unquantized model.
inference_test(full_precision_model)


Inference speed test:
Generation time: 38333.68 ms

Generated text:
Once upon a time, in a quaint village nestled amidst rolling hills, lived a young woman named Elara. With eyes as bright as the morning sun and a smile that could melt the iciest of winter days, Elara possessed a heart that was as warm as the


# Clear Memory

In [None]:
# Release the full-precision model before the quantized one is loaded.
# The guard keeps this cell re-runnable: a bare `del` raises NameError once the
# name has already been removed.
if "full_precision_model" in globals():
    del full_precision_model
gc.collect()  # collect the now-unreferenced model objects
torch.cuda.empty_cache()  # release PyTorch's cached, unused CUDA memory

# GPTQ Model

## Quantization config

In [None]:
# 4-bit GPTQ settings: the "c4" dataset is used for calibration and is
# tokenized with the notebook's tokenizer.
quant_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

## Load Model

In [None]:
# Passing the GPTQConfig as quantization_config makes from_pretrained quantize
# the checkpoint while loading it (see the "Quantizing model.layers blocks"
# progress output of this cell).
model_GPTQ_int4 = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quant_config,
    token=userdata.get('HF_TOKEN')
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading readme:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Quantizing model.layers blocks :   0%|          | 0/18 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]



## Model size

In [None]:
# Report the size of the GPTQ model using the same helper as for the baseline.
print(f"Quantized model size: {get_model_size(model_GPTQ_int4):.2f} MB")

Quantized model size: 1000.14 MB


## Inference speed

In [None]:
# Same prompt and token budget as the baseline run, now on the 4-bit model.
inference_test(model_GPTQ_int4)


Inference speed test:
Generation time: 12661.32 ms

Generated text:
Once upon a time, in a quaint village nestled amidst rolling hills, there lived a young woman named Elara. With eyes as bright as the morning dew and hair as dark as the night sky, Elara possessed a heart of gold and a spirit that soared beyond the


# Push quantized model to HuggingFace

In [None]:
# Upload the quantized weights and the tokenizer to the same Hub repository.
# The token is read from the Colab secret HF_TOKEN_WRITE (presumably a
# write-scoped token; confirm in the Colab secrets panel).
# `check_pr` is not a documented `push_to_hub` argument, so it is no longer
# passed; use `create_pr=True` if the upload should open a pull request.
model_GPTQ_int4.push_to_hub("Maroof-Mohammed/gemma-2b-GPTQ", token=userdata.get('HF_TOKEN_WRITE'))
tokenizer.push_to_hub("Maroof-Mohammed/gemma-2b-GPTQ", token=userdata.get('HF_TOKEN_WRITE'))

model.safetensors:   0%|          | 0.00/2.08G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Maroof-Mohammed/gemma-2b-GPTQ/commit/58ed817eb6a2dc7ccbf8301921dd7ce11b2fbe32', commit_message='Upload tokenizer', commit_description='', oid='58ed817eb6a2dc7ccbf8301921dd7ce11b2fbe32', pr_url=None, pr_revision=None, pr_num=None)

# Save quantized model locally

In [None]:
# Save the quantized model and its tokenizer side by side in ./model_GPTQ_int4.
# safe_serialization=True requests safetensors weight files.
model_GPTQ_int4.save_pretrained("./model_GPTQ_int4", safe_serialization=True)
tokenizer.save_pretrained("./model_GPTQ_int4")

('./model_GPTQ_int4/tokenizer_config.json',
 './model_GPTQ_int4/special_tokens_map.json',
 './model_GPTQ_int4/tokenizer.model',
 './model_GPTQ_int4/added_tokens.json',
 './model_GPTQ_int4/tokenizer.json')