In [7]:
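# One-time setup: uncomment the lines below to install the dependencies.
# The second command installs the project in the parent directory in editable mode.
# !pip install transformers huggingface_hub
# !pip install -e ../.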

In [8]:
# Hugging Face Hub login
import huggingface_hub

# Read the access token from the local .env file, expected to hold a
# KEY=value entry. Only the first non-empty, non-comment line is used and it
# is split on the first "=" only, so additional lines in the file or "="
# characters inside the value cannot corrupt the token.
# NOTE: the output recorded for this cell reports that the HF_TOKEN environment
# variable is already set and stays the active token regardless of this login.
with open("../.env", "r") as f:
    env_entries = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith("#")
    ]

if not env_entries or "=" not in env_entries[0]:
    raise ValueError("../.env must contain a line of the form KEY=<token>")

# Drop surrounding whitespace and optional quotes around the value.
token = env_entries[0].split("=", 1)[1].strip().strip("\"'")
huggingface_hub.login(token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [9]:
# Load llama 3.1 8B model and quantize it with Auto-AdpQ
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_adpq import Auto_AdpQ, AutoAdpQConfig
import os
import glob
import gc
import torch

In [None]:
# Prefer a local copy of the Llama 3.1 8B weights when one exists; otherwise
# fall back to the Hub repository id so the weights are downloaded from HF.
save_path = "../../MasterThesis/experiments/weights/meta-llama/Meta-Llama-3.1-8B-weights"

# The local copy counts as present when the directory holds any *.safetensors file.
files = glob.glob(os.path.join(save_path, "*.safetensors"))
model_name = save_path if files else "meta-llama/Llama-3.1-8B"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",
    torch_dtype="auto",
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Auto-AdpQ quantization settings (everything is kept on the CPU).
# NOTE(review): the exact meaning of n_iters / alpha is defined by auto_adpq —
# confirm against the library documentation before tuning them.
adpq_config = AutoAdpQConfig(
    group_size=128,
    n_iters=100,
    alpha=0.08,
    device="cpu",
    q_bit=4,
    data_packing=True,
    symmetrical_quantization=True,
)

# Quantizer instance built from the configuration above.
adpq = Auto_AdpQ(config=adpq_config)

In [12]:
# Run Auto-AdpQ over the loaded model. The log recorded below this cell shows
# the layers being processed one at a time, with several minutes between
# consecutive entries.
quantized_model = adpq.quantize_model(model)

# Write the quantized result to a directory relative to the notebook.
output_dir = "quantized_meta_llama_3.1_8B_adpq"
adpq.save_pretrained(output_dir)

2025-11-29 13:36:31 - auto_adpq.module - INFO - Quantizing layer: model.layers.0.self_attn.q_proj
2025-11-29 13:47:33 - auto_adpq.module - INFO - Quantizing layer: model.layers.0.self_attn.k_proj
2025-11-29 13:49:59 - auto_adpq.module - INFO - Quantizing layer: model.layers.0.self_attn.v_proj
2025-11-29 13:53:08 - auto_adpq.module - INFO - Quantizing layer: model.layers.0.self_attn.o_proj
2025-11-29 14:04:26 - auto_adpq.module - INFO - Quantizing layer: model.layers.0.mlp.gate_proj


KeyboardInterrupt: 

In [None]:
# Release the original model so its memory can be reclaimed.
# The guard keeps this cell re-runnable: a second execution (or a run after an
# interrupted session) would otherwise fail with NameError on `del model`.
# NOTE(review): if quantize_model returns the same object it was given,
# `quantized_model` still references it and nothing is freed here — confirm.
try:
    del model
except NameError:
    pass  # `model` was already released or never loaded
gc.collect()
# The model load and the Auto-AdpQ config in this notebook both use the CPU, so
# the CUDA cache calls stay disabled; enable them only when running on a GPU.
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()