# 07: Use Data in LLM Quantization

## 1. Imports

In [1]:
import gc
import copy
import random
# import os
# CACHE_PATH = '<path_to_cache>'
# os.environ["TORCH_HOME"] = CACHE_PATH
# os.environ["HF_HOME"] = CACHE_PATH
# os.environ["HUGGINGFACE_HUB_CACHE"] = CACHE_PATH
# os.environ["HUGGINGFACE_ASSETS_CACHE"] = CACHE_PATH
# os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH

import matplotlib.pyplot as plt
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# from huggingface_hub import login; login(token="<hf_token>")

import pruna_pro
from pruna_pro import SmashConfig
from pruna_pro import smash
from pruna.data.pruna_datamodule import PrunaDataModule
from pruna.data.utils import split_train_into_train_val_test
from pruna.evaluation.evaluation_agent import EvaluationAgent
from pruna.evaluation.metrics.metric_elapsed_time import ElapsedTimeMetric
from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
from pruna.evaluation.metrics.metric_energy import EnergyMetric
from pruna.evaluation.metrics.metric_memory import GPUMemoryMetric
from pruna.evaluation.metrics.metric_model_architecture import ModelArchitectureMetric
from pruna.evaluation.task import Task

## 2. Utils

The utility functions help to:
- Load a model from a list of (small) models. Feel free to try other models until you run out of GPU memory!
- Make plots.
- Iterate over evaluation and model configurations.

In [2]:
# Candidate checkpoints, grouped by model family. Exactly the uncommented
# entries are active; uncomment another line (and comment out the current one,
# or move it to the top) to run the notebook with a different model.
model_ids = [
    # --- OPT ---
    # "facebook/opt-125m",
    # "facebook/opt-350m",
    # "facebook/opt-1.3b",
    # "facebook/opt-2.7b",
    # --- Llama 3.2 ---
    # "meta-llama/Llama-3.2-1B",
    # "meta-llama/Llama-3.2-1B-Instruct",
    # "meta-llama/Llama-3.2-3B-Instruct",
    # --- Gemma 3 ---
    # "google/gemma-3-1b-it",
    # "google/gemma-3-4b-it",
    # --- DeepSeek / Phi ---
    # "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    # "microsoft/Phi-4-mini-instruct",
    # --- SmolLM ---
    # "HuggingFaceTB/SmolLM-135M",
    # "HuggingFaceTB/SmolLM-135M-instruct",
    # "HuggingFaceTB/SmolLM-360M",
    # "HuggingFaceTB/SmolLM-360M-Instruct",
    # "HuggingFaceTB/SmolLM-1.7B",
    # "HuggingFaceTB/SmolLM-1.7B-Instruct",
    # --- SmolLM2 ---
    # "HuggingFaceTB/SmolLM2-135M",
    # "HuggingFaceTB/SmolLM2-135M-Instruct",
    # "HuggingFaceTB/SmolLM2-360M",
    # "HuggingFaceTB/SmolLM2-360M-Instruct",
    # "HuggingFaceTB/SmolLM2-1.7B",
    # "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    # --- Pleias ---
    "PleIAs/Pleias-350m-Preview",
    # "PleIAs/Pleias-Pico",
    # "PleIAs/Pleias-1.2b-Preview",
    # "PleIAs/Pleias-Nano",
    # "PleIAs/Pleias-3b-Preview",
]

# The first active entry of the list is the checkpoint used in the rest of
# the notebook; its weights and its tokenizer are both loaded from that id.
model_id = model_ids[0]
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

## 3. Use data in LLM quantization

We recommend checking out the [Pruna documentation](https://pruna.readthedocs.io/en/latest/index.html) for access to AI efficiency functions.

### 3.1 Evaluate the base model quality

**Implementation task:**
- Evaluate the base model quality with the perplexity metric on the WikiText dataset.
- Repeat the experiment with other LLMs and/or datasets.

In [3]:
def smash_evaluate_perplexity(model, tokenizer, smash_config, dataset="WikiText"):
    """Exercise stub: the implementation goes between the two marker comments.

    As written, the function body contains no statements other than
    ``return results``. The code added between the markers must therefore
    assign a local variable named ``results``; otherwise calling this function
    raises ``NameError``.

    Args:
        model: Model object passed in by the caller.
            NOTE(review): presumably the causal LM loaded earlier in the
            notebook -- confirm.
        tokenizer: Tokenizer passed in by the caller.
            NOTE(review): presumably the tokenizer matching ``model`` -- confirm.
        smash_config: Configuration object passed in by the caller.
            NOTE(review): presumably a ``SmashConfig`` describing the
            compression to apply before evaluation -- confirm.
        dataset: Dataset identifier; defaults to ``"WikiText"``.

    Returns:
        Whatever the completed body stores in ``results``.
        NOTE(review): judging by the function name, presumably the perplexity
        evaluation results -- confirm once implemented.
    """
    ### To Complete ###
    ### End of To Complete ###
    
    return results

In [6]:
### To Complete ###
### End of To Complete ###

INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), max_seq_len=None)...
INFO - Using

{'perplexity_y_gt': 34.41499710083008}


### 3.2 Quantize LLM without data

**Implementation task:**
- Quantize the LLM with Quanto and evaluate its quality with the perplexity metric on the WikiText dataset. Quanto performs naive linear quantization without using any data.

**Questions:**
- Is there a performance drop after quantization?

In [7]:
### To Complete ###
### End of To Complete ###

INFO - Verifying Pruna token.
INFO - You have used 145 hours this month.
INFO - Starting quantizer quanto...
ERROR - Calibration requires a tokenizer and dataloader. Skipping calibration.
INFO - quantizer quanto was applied successfully.
INFO - You have used 145 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normaliz

{'perplexity_y_gt': 45.573577880859375}


### 3.3 Quantize LLM with in-distribution data

**Implementation task:**
- Quantize the LLM with GPTQ or AWQ and evaluate its quality with the perplexity metric on the WikiText dataset. GPTQ and AWQ perform linear quantization using data.

**Questions:**
- Does in-distribution data improve the quality of the quantized model?

In [8]:
### To Complete ###
### End of To Complete ###

INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provi

Quantizing model.layers blocks :   0%|          | 0/26 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
INFO - quantizer gptq was applied successfully.
INFO - You have used 146 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False

{'perplexity_y_gt': 37.480918884277344}


### 3.4 Quantize LLM with more/less in-distribution data

**Implementation task:**
- Quantize the LLM with GPTQ or AWQ, this time varying the amount of in-distribution calibration data (more or fewer samples), and evaluate its quality with the perplexity metric on the WikiText dataset.

**Questions:**
- Does using more or less in-distribution data change the quality of the quantized model?

In [9]:
### To Complete ###
### End of To Complete ###

INFO - Using max_seq_len of tokenizer: None
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), max_seq_len=None)...
Asking to truncate 

Quantizing model.layers blocks :   0%|          | 0/26 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

INFO - quantizer gptq was applied successfully.
INFO - You have used 147 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False,

{'perplexity_y_gt': 37.480918884277344}


### 3.5 Quantize LLM with random data

**Implementation task:**
- Quantize the LLM with GPTQ or AWQ using random data for calibration, and evaluate its quality with the perplexity metric on the WikiText dataset.

**Questions:**
- Does random data improve the quality of the quantized model?

In [10]:
### To Complete ###
### End of To Complete ###

INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provi

Quantizing model.layers blocks :   0%|          | 0/26 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

INFO - quantizer gptq was applied successfully.
INFO - You have used 147 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f967f63f490>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False,

{'perplexity_y_gt': 42.67584228515625}


### 3.6 Quantize LLM with out-of-distribution data

**Implementation task:**
- Quantize the LLM with GPTQ or AWQ using out-of-distribution data for calibration, and evaluate its quality with the perplexity metric on the WikiText dataset.

**Questions:**
- Does out-of-distribution data improve the quality of the quantized model?

In [None]:
### To Complete ###
### End of To Complete ###

INFO - Loaded only training, splitting train 80/10/10 into train, validation and test...
INFO - Using max_seq_len of tokenizer: None
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7fbbda38edd0>, tokenizer=PreTrainedTokenizerFast(name_or_path='PleIAs/Pleias-350m-Preview', vocab_size=65536, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|end_of_text|>', 'eos_token': '<|end_of_text|>', 'unk_token': '[UNK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single