# WIP Replicating AdpQ

From [Ghaffari et al 2024]

In [2]:
#!pip install huggingface_hub transformers

In [3]:

import functools
import glob
import os

import torch
from huggingface_hub import snapshot_download
from IPython.display import Javascript, display
from safetensors import safe_open

In [4]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [5]:
# Boiler plate code


def notify_when_done(show_popup=True, message="✅ Task finished!"):
    """
    Decorator that notifies (via pop-up and/or sound) when a function finishes running.
    
    Args:
        show_popup (bool): Whether to show a pop-up message.
        play_sound (bool): Whether to play a notification sound.
        message (str): Message to show in the pop-up.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            error = False
            try:
                return func(*args, **kwargs)
            except Exception as e:
                display(Javascript(f'alert("❌ An error occurred: {str(e)}")'))
                error = True
                raise e
            finally:
                if show_popup and not error:
                    display(Javascript(f'alert("{message}")'))
        return wrapper
    return decorator

# Load Hugging Face token from token.txt if it exists
token_folder = glob.glob(".env")
if token_folder:
    with open(token_folder[0], "r") as f:
        token = f.read().strip()
    hf_token = token
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = token

@notify_when_done(show_popup=True, message="✅ Download complete!")
def download_model(list_of_models, output_dir="./"):
    for model_id in list_of_models:

        local_folder = os.path.join(output_dir, model_id)

        os.makedirs(local_folder, exist_ok=True)

        print(f"Downloading model files for {model_id} to {local_folder}...")

        # --- 2. Download the files ---
        snapshot_download(
            repo_id=model_id,
            local_dir=local_folder,
            ignore_patterns=["*.bin", "*.py", "*.md"],
        )

        print("\nDownload complete!")
        print(f"Model files are saved in: {os.path.abspath(local_folder)}")

## Download weights

In [6]:
baseline_llm = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
to_quantize_llm = "meta-llama/Llama-3.1-8B-Instruct"

#download_model([baseline_llm, to_quantize_llm], output_dir="./weights")

In [7]:
to_quantize_llm_path = "../MasterThesis/experiments/meta-llama/Meta-Llama-3.1-8B-Instruct-weights"
baseline_llm_path = "../MasterThesis/experiments/hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4-weights"

In [8]:
# download wikitext 2 dataset and create function to evaluate llms on it
from datasets import load_dataset

wikitext = load_dataset("wikitext", "wikitext-2-raw-v1")
def evaluate_llm(model, tokenizer, dataset, max_length=512):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for sample in dataset:
        inputs = tokenizer(sample['text'], return_tensors='pt', truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss
            total_loss += loss.item() * inputs['input_ids'].size(1)
            total_tokens += inputs['input_ids'].size(1)

    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    return perplexity.item()

AdpQ is based on the adaptive lasso algorithm with the following formula

$$
\arg \min_{\hat W} ||WX - \hat W X||_2^2 + \lambda \mathbb{D}_{KL} (f_W || f_{\hat W})
$$

In [9]:
# Hyper parameters for AdpQ
group_size = 64
alpha_outlier = 0.08

In [10]:
print(wikitext['test'])

Dataset({
    features: ['text'],
    num_rows: 4358
})


In [11]:
def load_weight_safetensors(model_path, weight, device='cpu'):
    weight_files = glob.glob(f"{model_path}/*.safetensors")

    for file_path in weight_files:
        with safe_open(file_path, framework="pt", device="cpu") as f:

            # Iterate over all the tensor names (keys) in the file
            for tensor_name in f.keys():
                print(tensor_name)
                # Load ONLY the current tensor into CPU RAM
                if weight in tensor_name:
                    tensor = f.get_tensor(tensor_name)
                    return tensor.float().numpy()
    raise ValueError(f"Weight {weight} not found in any safetensors file in {model_path}")

In [12]:
weight = load_weight_safetensors(to_quantize_llm_path, "model.layers.0.self_attn.q_proj.weight")

model.embed_tokens.weight
model.layers.0.input_layernorm.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.post_attention_layernorm.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.self_attn.q_proj.weight


In [13]:
weight.shape

(4096, 4096)

In [14]:
4096/group_size

64.0