In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AwqConfig

In [None]:
# Checkpoint to quantize and the directory name used for the quantized output
model_path = "meta-llama/Llama-2-7b-chat-hf"
quant_path = "Llama-2-7b-chat-hf-awq"

# Settings handed to `model.quantize`; a later cell re-reads these keys
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load the tokenizer and the model from `model_path`
# NOTE(review): trust_remote_code=True is kept as-is; confirm this checkpoint
# actually needs it before reusing this cell with other repositories.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoAWQForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True)

# Quantize the loaded model using the settings above
model.quantize(tokenizer, quant_config=quant_config)

In [None]:
# Map the AutoAWQ setting names onto the keyword names AwqConfig accepts,
# lower-casing the version string along the way
awq_kwargs = {
    "bits": quant_config["w_bit"],
    "group_size": quant_config["q_group_size"],
    "zero_point": quant_config["zero_point"],
    "version": quant_config["version"].lower(),
}
quantization_config = AwqConfig(**awq_kwargs).to_dict()

# Attach the resulting dict to the wrapped model's config so it is saved with it
model.model.config.quantization_config = quantization_config

# Write the quantized model and the tokenizer files to the same directory
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)