# 1. Installation

In [1]:
!pip install --upgrade pip
!pip install llmcompressor

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.3
[0mCollecting llmcompressor
  Downloading llmcompressor-0.9.0.1-py3-none-any.whl.metadata (12 kB)
Collecting transformers<=4.57.3,>=4.54.0 (from llmcompressor)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets<=4.4.1,>=4.0.0 (from llmcompressor)
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-ml-py<=13.590.44,>=12.560.30 (from llmcompressor)
  Downloading nvidia_ml_py-13.590.44-py3-none-any.whl.metadata (9.8 kB)
Collecting pyarrow>=21.0.0 (from datasets<=4.4.1,>=4.0.0->l

In [4]:
from huggingface_hub import login

!pip install ipywidgets

login()     # Your huggingface access token.

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 2. Compress Model

## 2.1. Quantization

See [Quantization Schemes](https://docs.vllm.ai/projects/llm-compressor/en/0.8.1/guides/compression_schemes/) for choosing quantization method.

EX1). `INT W8A8` quantization with SmoothQuant and GPTQ

In [None]:
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot


# Recipe with two modifiers, listed in the order they are passed to oneshot:
# SmoothQuant (smoothing_strength=0.8) followed by GPTQ with the W8A8 scheme
# on every Linear layer except lm_head.
smoothquant_step = SmoothQuantModifier(smoothing_strength=0.8)
gptq_step = GPTQModifier(scheme="W8A8", targets="Linear", ignore=["lm_head"])
recipe = [smoothquant_step, gptq_step]

# Directory the compressed checkpoint is written to.
SAVE_DIR = "TinyLlama-1.1B-Chat-v1.0-INT8"

# Calibrate on the "open_platypus" dataset and write the result to SAVE_DIR.
oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    output_dir=SAVE_DIR,
)

In [None]:
# Inference quantized model via vLLM
from vllm import LLM, SamplingParams


# Checkpoint directory produced by the quantization cell above. Hard-coded so
# this cell can also run in a fresh kernel; within the same session it is the
# same location as SAVE_DIR.
model_path = "./TinyLlama-1.1B-Chat-v1.0-INT8"

# NOTE: the variable is intentionally still called `model`.
model = LLM(
    model=model_path,
    gpu_memory_utilization=0.7,  # use 70% of GPU memory (adjust if needed)
    tensor_parallel_size=1,  # use a single GPU
    # enforce_eager=True  # turn on compatibility mode (if needed)
)

sampling_params = SamplingParams(max_tokens=256)

prompt = "Sky is blue and Apple is "
outputs = model.generate(prompt, sampling_params)

# Print the first candidate completion of every request.
for request_output in outputs:
    print("Answer: ", request_output.outputs[0].text)

INFO 01-23 05:25:10 [utils.py:263] non-default args: {'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'model': './TinyLlama-1.1B-Chat-v1.0-INT8'}
INFO 01-23 05:25:10 [model.py:530] Resolved architecture: LlamaForCausalLM
INFO 01-23 05:25:10 [model.py:1545] Using max model len 2048
INFO 01-23 05:25:12 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-23 05:25:12 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 01-23 05:25:12 [vllm.py:637] Disabling NCCL for DP synchronization when using async scheduling.
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:12 [core.py:97] Initializing a V1 LLM engine (v0.14.0) with config: model='./TinyLlama-1.1B-Chat-v1.0-INT8', speculative_config=None, tokenizer='./TinyLlama-1.1B-Chat-v1.0-INT8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_par

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:13 [default_loader.py:291] Loading weights took 0.20 seconds
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:14 [gpu_model_runner.py:3905] Model loading took 1.15 GiB memory and 0.463936 seconds
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:18 [backends.py:644] Using cache directory: /root/.cache/vllm/torch_compile_cache/c0332d605c/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:18 [backends.py:704] Dynamo bytecode transform time: 4.53 s
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:20 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 0.332 s
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:20 [monitor.py:34] torch.compile takes 4.86 s in total
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:21 [gpu_worker.py:358] Available KV cache memory: 14.92 GiB
[0;36m(EngineCore_DP0

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:00<00:00, 55.67it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:00<00:00, 54.45it/s]


[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:23 [gpu_model_runner.py:4856] Graph capturing finished in 2 secs, took 0.44 GiB
[0;36m(EngineCore_DP0 pid=166254)[0;0m INFO 01-23 05:25:23 [core.py:273] init engine (profile, create kv cache, warmup model) took 9.93 seconds
INFO 01-23 05:25:24 [llm.py:347] Supported tasks: ['generate']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


2. And Pikachu is yellow and Google is red

So yes, those are some great questions and they present opportunities for social storytelling.


결과: 2.2GB -> 1.15GB로, 원본 대비 약 52% 크기로 압축됨 (약 48% 감소)

EX2). `INT W4A16` (asymmetric) quantization with AWQ

In [1]:

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation


def preprocess(example):
    """Render one chat example as a single untokenized string.

    Args:
        example: A dataset row; its ``"messages"`` entry is passed to the
            tokenizer's chat template.

    Returns:
        A dict with one key, ``"text"``, holding the templated conversation
        (``tokenize=False``, so no token ids are produced here).
    """
    rendered_chat = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered_chat}


# Tokenize inputs.
def tokenize(sample):
    """Tokenize the ``"text"`` field of a preprocessed sample.

    The text is truncated to ``MAX_SEQUENCE_LENGTH`` tokens and is not padded.
    ``add_special_tokens=False`` is used because the text is expected to come
    from the chat template, which presumably already inserted the special
    tokens (e.g. BOS).

    Args:
        sample: A dataset row containing a ``"text"`` string.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return tokenizer(sample["text"], **encode_options)


# Select model and load it.
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
# No trust_remote_code here: the tokenizer is loaded the same way as in the
# GPTQ example further down, without executing code from the model repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess: take the first NUM_CALIBRATION_SAMPLES rows,
# shuffle them with a fixed seed, and render each chat with the chat template.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=47)
ds = ds.map(preprocess)

# Tokenize explicitly with the tokenize() helper defined above. Previously the
# helper was defined but never applied, so its add_special_tokens=False setting
# (needed because the chat template already adds the special tokens) was not
# in effect. Dropping the raw columns leaves only the tokenizer outputs.
ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
# Scheme is W4A16_ASYM on all Linear layers except lm_head.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk compressed, in a directory named after the checkpoint.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

Tokenizing:   0%|          | 0/256 [00:00<?, ? examples/s]

2026-01-23T04:54:08.990687+0000 | reset | INFO - Compression lifecycle reset
2026-01-23T04:54:08.991631+0000 | from_modifiers | INFO - Creating recipe from modifiers
2026-01-23T04:54:09.005492+0000 | on_initialize | INFO - No AWQModifier.mappings provided, inferring from model...
2026-01-23T04:54:09.017583+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2026-01-23T04:54:09.017875+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `AWQModifier`


Preparing cache: 100%|██████████| 256/256 [00:00<00:00, 2480.15it/s]
(1/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 142.24it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.17s/it]
(1/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 433.47it/s]
(2/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 164.96it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.14s/it]
(2/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 511.43it/s]
(3/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 162.85it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.18s/it]
(3/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 490.70it/s]
(4/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 164.67it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.17s/it]
(4/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 497.41it/s]
(5/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 199.34it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.15s/it

2026-01-23T04:57:16.070184+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers





2026-01-23T04:57:16.103031+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 112it [00:01, 72.37it/s]


('Llama-3.2-1B-Instruct-awq-asym/tokenizer_config.json',
 'Llama-3.2-1B-Instruct-awq-asym/special_tokens_map.json',
 'Llama-3.2-1B-Instruct-awq-asym/chat_template.jinja',
 'Llama-3.2-1B-Instruct-awq-asym/tokenizer.json')

In [None]:
from vllm import LLM, SamplingParams


# Directory written by the AWQ quantization cell above.
model_path = SAVE_DIR

# NOTE: the variable is intentionally still called `model`, so it rebinds the
# name used for the Hugging Face model in the quantization cell.
model = LLM(
    model=model_path,
    gpu_memory_utilization=0.7,  # use 70% of GPU memory (adjust if needed)
    tensor_parallel_size=1,  # use a single GPU
    # enforce_eager=True  # turn on compatibility mode (if needed)
)

prompt = "3 + 5 is "

# Only max_tokens is set; temperature/seed are left at vLLM's defaults, so the
# generated text presumably differs from run to run.
sampling_params = SamplingParams(max_tokens=256)

outputs = model.generate(prompt, sampling_params)

# Print the prompt and the first candidate completion of every request.
for output in outputs:
    print(f"Prompt: {prompt}")
    print("Answer: ", output.outputs[0].text)

INFO 01-23 04:58:04 [utils.py:263] non-default args: {'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'model': 'Llama-3.2-1B-Instruct-awq-asym'}
INFO 01-23 04:58:04 [model.py:530] Resolved architecture: LlamaForCausalLM
INFO 01-23 04:58:04 [model.py:1545] Using max model len 131072
INFO 01-23 04:58:04 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-23 04:58:04 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 01-23 04:58:04 [vllm.py:637] Disabling NCCL for DP synchronization when using async scheduling.


The tokenizer you are loading from 'Llama-3.2-1B-Instruct-awq-asym' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:09 [core.py:97] Initializing a V1 LLM engine (v0.14.0) with config: model='Llama-3.2-1B-Instruct-awq-asym', speculative_config=None, tokenizer='Llama-3.2-1B-Instruct-awq-asym', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=Non

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.65it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.65it/s]
[0;36m(EngineCore_DP0 pid=150803)[0;0m 


[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:10 [default_loader.py:291] Loading weights took 0.18 seconds
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:11 [gpu_model_runner.py:3905] Model loading took 0.99 GiB memory and 0.513573 seconds
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:14 [backends.py:644] Using cache directory: /root/.cache/vllm/torch_compile_cache/38e4a3dd15/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:14 [backends.py:704] Dynamo bytecode transform time: 3.01 s
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:19 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:22 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 5.32 s
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:22 [monitor.py:34] torch.compile takes 8.33 s in total
[0;36m(EngineCore_DP0 pid=150803)[0;

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:00<00:00, 62.71it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:00<00:00, 65.56it/s]


[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:25 [gpu_model_runner.py:4856] Graph capturing finished in 2 secs, took 0.31 GiB
[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:25 [core.py:273] init engine (profile, create kv cache, warmup model) took 14.18 seconds


[0;36m(EngineCore_DP0 pid=150803)[0;0m The tokenizer you are loading from 'Llama-3.2-1B-Instruct-awq-asym' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


[0;36m(EngineCore_DP0 pid=150803)[0;0m INFO 01-23 04:58:26 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 01-23 04:58:26 [llm.py:347] Supported tasks: ['generate']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

 *ahem* I quote the opening line of the Angels & Demons novel...
"...legends and myths that had still begun to take shape in human minds as mode of keeping order. My own is rooted in science, which delivers the unpleasant truth, and, if used, it can provide incredible breakthroughs."
Are they human, or supernatural?
What connects them to the angels?
And what drives them?
What are the consequences of their combined efforts?
In this tale, do dark secrets differentiate a heroic sacrifice until it is insisted as the bànẹp items called The symbols that can blindly aver goKhistence and Passion remain basically identicalMen perce Di property presence wit.PMIN sau-ca = Enter guests activita
Cas trao cases workstationmarkedboth mại poate Consortium Ccccort Hosorian alternatively breasts sister v master tim iter thoroughly blindness Memade promote Zai des asked hom mixed lurking terms net export mature administration authority ord well mi durch Ack Plot carry business architecture_http Marbleเนc

결과: 2.47GB -> 1.45GB로, 원본 대비 약 59% 크기로 압축됨 (약 41% 감소)

-----

EX3). `INT W4A16` quantization with GPTQ 

In [2]:
# Load Model
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub id of the checkpoint to quantize; reused below to name the output directory.
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# Tokenizer and model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load Calibration Dataset
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


def preprocess(example):
    """Render a row's ``"messages"`` with the chat template into a ``"text"`` string."""
    chat_text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": chat_text}


def tokenize(sample):
    """Tokenize ``sample["text"]``, truncated to MAX_SEQUENCE_LENGTH and unpadded.

    Be careful with BOS tokens: add_special_tokens=False is required because
    the chat template already added them.
    """
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# Load the first NUM_CALIBRATION_SAMPLES rows, then shuffle with a fixed seed.
calibration_split = f"train_sft[:{NUM_CALIBRATION_SAMPLES}]"
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split=calibration_split)
ds = ds.shuffle(seed=42)

# Convert each chat into the text format the model is trained with.
ds = ds.map(preprocess)

# Tokenize and drop every pre-existing column so only tokenizer outputs remain.
ds = ds.map(tokenize, remove_columns=ds.column_names)

In [None]:
# Apply quantization with GPTQ recipe
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# GPTQ recipe: W4A16 scheme on every Linear layer, leaving lm_head untouched.
recipe = GPTQModifier(scheme="W4A16", targets="Linear", ignore=["lm_head"])

# Output directory: last path component of the model id plus a scheme suffix.
model_name = MODEL_ID.rstrip("/").split("/")[-1]
SAVE_DIR = model_name + "-W4A16-G128"

# Run one-shot calibration and quantization.
# Because output_dir is given, the result (tokenizer included) is saved there,
# so no separate model.save_pretrained / tokenizer.save_pretrained is needed.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    output_dir=SAVE_DIR,
)

2026-01-23T06:27:34.653254+0000 | reset | INFO - Compression lifecycle reset
2026-01-23T06:27:34.654706+0000 | from_modifiers | INFO - Creating recipe from modifiers
2026-01-23T06:27:34.680429+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2026-01-23T06:27:34.680893+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `GPTQModifier`


Preparing cache: 100%|██████████| 512/512 [00:00<00:00, 1331.08it/s]
(1/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 48.27it/s]

2026-01-23T06:27:45.966714+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.q_proj using 512 samples





2026-01-23T06:27:47.040387+0000 | compress | METRIC - time 1.07s
2026-01-23T06:27:47.040829+0000 | compress | METRIC - error 918.60
2026-01-23T06:27:47.041510+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:27:47.041805+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:27:47.042267+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.k_proj using 512 samples
2026-01-23T06:27:47.921058+0000 | compress | METRIC - time 0.88s
2026-01-23T06:27:47.921498+0000 | compress | METRIC - error 466.25
2026-01-23T06:27:47.921929+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:27:47.922168+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:27:47.922571+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.v_proj using 512 samples
2026-01-23T06:27:48.776163+0000 | compress | METRIC - time 0.85s
2026-01-23T06:27:48.776801+0000 | compress | METRIC 

(1/29): Propagating: 100%|██████████| 512/512 [00:04<00:00, 115.47it/s]
(2/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 48.34it/s]

2026-01-23T06:28:09.516919+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.q_proj using 512 samples





2026-01-23T06:28:10.451052+0000 | compress | METRIC - time 0.93s
2026-01-23T06:28:10.451524+0000 | compress | METRIC - error 885.66
2026-01-23T06:28:10.451971+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:28:10.452200+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:28:10.452653+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.k_proj using 512 samples
2026-01-23T06:28:11.347424+0000 | compress | METRIC - time 0.89s
2026-01-23T06:28:11.347994+0000 | compress | METRIC - error 516.55
2026-01-23T06:28:11.348588+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:28:11.348871+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:28:11.349472+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.v_proj using 512 samples
2026-01-23T06:28:12.215454+0000 | compress | METRIC - time 0.87s
2026-01-23T06:28:12.215933+0000 | compress | METRIC 

(2/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 165.19it/s]
(3/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 48.21it/s]

2026-01-23T06:28:31.428589+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.q_proj using 512 samples





2026-01-23T06:28:32.328590+0000 | compress | METRIC - time 0.90s
2026-01-23T06:28:32.329208+0000 | compress | METRIC - error 4856.82
2026-01-23T06:28:32.329734+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:28:32.329992+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:28:32.330451+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.k_proj using 512 samples
2026-01-23T06:28:33.193273+0000 | compress | METRIC - time 0.86s
2026-01-23T06:28:33.193805+0000 | compress | METRIC - error 2897.00
2026-01-23T06:28:33.194224+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:28:33.194482+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:28:33.194932+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.v_proj using 512 samples
2026-01-23T06:28:34.054758+0000 | compress | METRIC - time 0.86s
2026-01-23T06:28:34.055364+0000 | compress | METRI

(3/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.92it/s]
(4/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 48.14it/s]

2026-01-23T06:28:53.071684+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.q_proj using 512 samples





2026-01-23T06:28:54.061854+0000 | compress | METRIC - time 0.99s
2026-01-23T06:28:54.062514+0000 | compress | METRIC - error 4001.33
2026-01-23T06:28:54.063003+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:28:54.063240+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:28:54.063663+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.k_proj using 512 samples
2026-01-23T06:28:55.018297+0000 | compress | METRIC - time 0.95s
2026-01-23T06:28:55.018880+0000 | compress | METRIC - error 2201.40
2026-01-23T06:28:55.019343+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:28:55.019631+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:28:55.020053+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.v_proj using 512 samples
2026-01-23T06:28:55.892453+0000 | compress | METRIC - time 0.87s
2026-01-23T06:28:55.893001+0000 | compress | METRI

(4/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 165.86it/s]
(5/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 48.08it/s]

2026-01-23T06:29:15.308730+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.q_proj using 512 samples





2026-01-23T06:29:16.303201+0000 | compress | METRIC - time 0.99s
2026-01-23T06:29:16.303857+0000 | compress | METRIC - error 3753.86
2026-01-23T06:29:16.304307+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:29:16.304664+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:29:16.305250+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.k_proj using 512 samples
2026-01-23T06:29:17.255933+0000 | compress | METRIC - time 0.95s
2026-01-23T06:29:17.256499+0000 | compress | METRIC - error 1966.59
2026-01-23T06:29:17.257097+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:29:17.257350+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:29:17.257974+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.v_proj using 512 samples
2026-01-23T06:29:18.132592+0000 | compress | METRIC - time 0.87s
2026-01-23T06:29:18.133238+0000 | compress | METRI

(5/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 166.18it/s]
(6/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 48.00it/s]

2026-01-23T06:29:37.251574+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.q_proj using 512 samples





2026-01-23T06:29:38.140880+0000 | compress | METRIC - time 0.89s
2026-01-23T06:29:38.141441+0000 | compress | METRIC - error 5228.10
2026-01-23T06:29:38.141831+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:29:38.142076+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:29:38.142584+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.k_proj using 512 samples
2026-01-23T06:29:39.009484+0000 | compress | METRIC - time 0.87s
2026-01-23T06:29:39.010028+0000 | compress | METRIC - error 3119.31
2026-01-23T06:29:39.010550+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:29:39.010808+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:29:39.011271+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.v_proj using 512 samples
2026-01-23T06:29:39.875572+0000 | compress | METRIC - time 0.86s
2026-01-23T06:29:39.876152+0000 | compress | METRI

(6/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.10it/s]
(7/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.98it/s]

2026-01-23T06:29:59.222240+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.q_proj using 512 samples





2026-01-23T06:30:00.110912+0000 | compress | METRIC - time 0.89s
2026-01-23T06:30:00.111436+0000 | compress | METRIC - error 4709.93
2026-01-23T06:30:00.112038+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:30:00.112306+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:30:00.112939+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.k_proj using 512 samples
2026-01-23T06:30:00.988811+0000 | compress | METRIC - time 0.88s
2026-01-23T06:30:00.989422+0000 | compress | METRIC - error 2508.02
2026-01-23T06:30:00.989849+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:30:00.990078+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:30:00.990503+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.v_proj using 512 samples
2026-01-23T06:30:01.890024+0000 | compress | METRIC - time 0.90s
2026-01-23T06:30:01.890601+0000 | compress | METRI

(7/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.93it/s]
(8/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.92it/s]

2026-01-23T06:30:21.035067+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.q_proj using 512 samples





2026-01-23T06:30:21.941951+0000 | compress | METRIC - time 0.91s
2026-01-23T06:30:21.942664+0000 | compress | METRIC - error 4177.26
2026-01-23T06:30:21.943094+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:30:21.943455+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:30:21.943873+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.k_proj using 512 samples
2026-01-23T06:30:22.812885+0000 | compress | METRIC - time 0.87s
2026-01-23T06:30:22.813439+0000 | compress | METRIC - error 2474.13
2026-01-23T06:30:22.813882+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:30:22.814145+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:30:22.814631+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.v_proj using 512 samples
2026-01-23T06:30:23.678140+0000 | compress | METRIC - time 0.86s
2026-01-23T06:30:23.678759+0000 | compress | METRI

(8/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.33it/s]
(9/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.84it/s]

2026-01-23T06:30:42.812536+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.q_proj using 512 samples





2026-01-23T06:30:43.716222+0000 | compress | METRIC - time 0.90s
2026-01-23T06:30:43.716812+0000 | compress | METRIC - error 5234.50
2026-01-23T06:30:43.717216+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:30:43.717468+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:30:43.717900+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.k_proj using 512 samples
2026-01-23T06:30:44.625583+0000 | compress | METRIC - time 0.91s
2026-01-23T06:30:44.626208+0000 | compress | METRIC - error 3125.55
2026-01-23T06:30:44.626908+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:30:44.627166+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:30:44.627740+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.v_proj using 512 samples
2026-01-23T06:30:45.581673+0000 | compress | METRIC - time 0.95s
2026-01-23T06:30:45.582279+0000 | compress | METRI

(9/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.94it/s]
(10/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.85it/s]

2026-01-23T06:31:04.994252+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.q_proj using 512 samples





2026-01-23T06:31:05.935270+0000 | compress | METRIC - time 0.94s
2026-01-23T06:31:05.935886+0000 | compress | METRIC - error 5109.09
2026-01-23T06:31:05.936334+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:31:05.936571+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:31:05.937130+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.k_proj using 512 samples
2026-01-23T06:31:06.886521+0000 | compress | METRIC - time 0.95s
2026-01-23T06:31:06.887141+0000 | compress | METRIC - error 3000.73
2026-01-23T06:31:06.887631+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:31:06.887904+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:31:06.888334+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.v_proj using 512 samples
2026-01-23T06:31:07.837553+0000 | compress | METRIC - time 0.95s
2026-01-23T06:31:07.838089+0000 | compress | METRI

(10/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.46it/s]
(11/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.83it/s]

2026-01-23T06:31:27.317145+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.q_proj using 512 samples





2026-01-23T06:31:28.298305+0000 | compress | METRIC - time 0.98s
2026-01-23T06:31:28.298869+0000 | compress | METRIC - error 5113.89
2026-01-23T06:31:28.299299+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:31:28.299553+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:31:28.300088+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.k_proj using 512 samples
2026-01-23T06:31:29.168715+0000 | compress | METRIC - time 0.87s
2026-01-23T06:31:29.169262+0000 | compress | METRIC - error 3134.33
2026-01-23T06:31:29.169741+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:31:29.169983+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:31:29.170482+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.v_proj using 512 samples
2026-01-23T06:31:30.121738+0000 | compress | METRIC - time 0.95s
2026-01-23T06:31:30.122239+0000 | compress | MET

(11/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.48it/s]
(12/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.84it/s]

2026-01-23T06:31:49.488595+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.q_proj using 512 samples





2026-01-23T06:31:50.437303+0000 | compress | METRIC - time 0.95s
2026-01-23T06:31:50.437876+0000 | compress | METRIC - error 4460.34
2026-01-23T06:31:50.438385+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:31:50.438656+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:31:50.439119+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.k_proj using 512 samples
2026-01-23T06:31:51.347386+0000 | compress | METRIC - time 0.91s
2026-01-23T06:31:51.347978+0000 | compress | METRIC - error 2493.98
2026-01-23T06:31:51.348481+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:31:51.348772+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:31:51.349210+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.v_proj using 512 samples
2026-01-23T06:31:52.261268+0000 | compress | METRIC - time 0.91s
2026-01-23T06:31:52.261755+0000 | compress | MET

(12/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.90it/s]
(13/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.80it/s]

2026-01-23T06:32:11.584685+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.q_proj using 512 samples





2026-01-23T06:32:12.492606+0000 | compress | METRIC - time 0.91s
2026-01-23T06:32:12.493137+0000 | compress | METRIC - error 6070.62
2026-01-23T06:32:12.493601+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:32:12.493866+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:32:12.494308+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.k_proj using 512 samples
2026-01-23T06:32:13.368208+0000 | compress | METRIC - time 0.87s
2026-01-23T06:32:13.368840+0000 | compress | METRIC - error 3595.77
2026-01-23T06:32:13.369309+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:32:13.369697+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:32:13.370146+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.v_proj using 512 samples
2026-01-23T06:32:14.263173+0000 | compress | METRIC - time 0.89s
2026-01-23T06:32:14.263706+0000 | compress | MET

(13/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.75it/s]
(14/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.77it/s]

2026-01-23T06:32:33.632586+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.q_proj using 512 samples





2026-01-23T06:32:34.540244+0000 | compress | METRIC - time 0.91s
2026-01-23T06:32:34.540798+0000 | compress | METRIC - error 6361.19
2026-01-23T06:32:34.541213+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:32:34.541465+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:32:34.541901+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.k_proj using 512 samples
2026-01-23T06:32:35.409339+0000 | compress | METRIC - time 0.87s
2026-01-23T06:32:35.409829+0000 | compress | METRIC - error 4046.89
2026-01-23T06:32:35.410272+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:32:35.410576+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:32:35.411094+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.v_proj using 512 samples
2026-01-23T06:32:36.345293+0000 | compress | METRIC - time 0.93s
2026-01-23T06:32:36.345870+0000 | compress | MET

(14/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.02it/s]
(15/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.77it/s]

2026-01-23T06:32:55.674629+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.q_proj using 512 samples





2026-01-23T06:32:56.681123+0000 | compress | METRIC - time 1.01s
2026-01-23T06:32:56.681730+0000 | compress | METRIC - error 6983.91
2026-01-23T06:32:56.682215+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:32:56.682590+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:32:56.683019+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.k_proj using 512 samples
2026-01-23T06:32:57.670975+0000 | compress | METRIC - time 0.99s
2026-01-23T06:32:57.671552+0000 | compress | METRIC - error 3310.89
2026-01-23T06:32:57.672006+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:32:57.672259+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:32:57.672841+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.v_proj using 512 samples
2026-01-23T06:32:58.588775+0000 | compress | METRIC - time 0.92s
2026-01-23T06:32:58.589374+0000 | compress | MET

(15/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.61it/s]
(16/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.79it/s]

2026-01-23T06:33:18.106104+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.q_proj using 512 samples





2026-01-23T06:33:19.053666+0000 | compress | METRIC - time 0.95s
2026-01-23T06:33:19.054271+0000 | compress | METRIC - error 7261.36
2026-01-23T06:33:19.054699+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:33:19.054934+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:33:19.055336+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.k_proj using 512 samples
2026-01-23T06:33:20.002174+0000 | compress | METRIC - time 0.95s
2026-01-23T06:33:20.002882+0000 | compress | METRIC - error 3772.13
2026-01-23T06:33:20.003325+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:33:20.003559+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:33:20.003973+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.v_proj using 512 samples
2026-01-23T06:33:20.956689+0000 | compress | METRIC - time 0.95s
2026-01-23T06:33:20.957303+0000 | compress | MET

(16/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.33it/s]
(17/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.75it/s]

2026-01-23T06:33:40.395038+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.q_proj using 512 samples





2026-01-23T06:33:41.301080+0000 | compress | METRIC - time 0.91s
2026-01-23T06:33:41.301623+0000 | compress | METRIC - error 7588.73
2026-01-23T06:33:41.302058+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:33:41.302305+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:33:41.302776+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.k_proj using 512 samples
2026-01-23T06:33:42.170176+0000 | compress | METRIC - time 0.87s
2026-01-23T06:33:42.170662+0000 | compress | METRIC - error 4240.04
2026-01-23T06:33:42.171085+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:33:42.171334+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:33:42.171791+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.v_proj using 512 samples
2026-01-23T06:33:43.041409+0000 | compress | METRIC - time 0.87s
2026-01-23T06:33:43.042013+0000 | compress | MET

(17/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.30it/s]
(18/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.73it/s]

2026-01-23T06:34:02.205672+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.q_proj using 512 samples





2026-01-23T06:34:03.114473+0000 | compress | METRIC - time 0.91s
2026-01-23T06:34:03.115042+0000 | compress | METRIC - error 7360.72
2026-01-23T06:34:03.115573+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:34:03.115871+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:34:03.116318+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.k_proj using 512 samples
2026-01-23T06:34:04.111544+0000 | compress | METRIC - time 0.99s
2026-01-23T06:34:04.112137+0000 | compress | METRIC - error 3976.41
2026-01-23T06:34:04.112669+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:34:04.113043+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:34:04.113481+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.v_proj using 512 samples
2026-01-23T06:34:05.073687+0000 | compress | METRIC - time 0.96s
2026-01-23T06:34:05.074292+0000 | compress | MET

(18/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.37it/s]
(19/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.73it/s]

2026-01-23T06:34:24.350410+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.q_proj using 512 samples





2026-01-23T06:34:25.270986+0000 | compress | METRIC - time 0.92s
2026-01-23T06:34:25.271543+0000 | compress | METRIC - error 8071.42
2026-01-23T06:34:25.271960+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:34:25.272179+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:34:25.272684+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.k_proj using 512 samples
2026-01-23T06:34:26.144777+0000 | compress | METRIC - time 0.87s
2026-01-23T06:34:26.145431+0000 | compress | METRIC - error 4455.64
2026-01-23T06:34:26.146031+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:34:26.146323+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:34:26.146962+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.v_proj using 512 samples
2026-01-23T06:34:27.042392+0000 | compress | METRIC - time 0.90s
2026-01-23T06:34:27.042916+0000 | compress | MET

(19/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.61it/s]
(20/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.71it/s]

2026-01-23T06:34:46.368254+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.q_proj using 512 samples





2026-01-23T06:34:47.352659+0000 | compress | METRIC - time 0.98s
2026-01-23T06:34:47.353254+0000 | compress | METRIC - error 7542.83
2026-01-23T06:34:47.353927+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:34:47.354216+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:34:47.354756+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.k_proj using 512 samples
2026-01-23T06:34:48.295142+0000 | compress | METRIC - time 0.94s
2026-01-23T06:34:48.295792+0000 | compress | METRIC - error 4344.13
2026-01-23T06:34:48.296207+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:34:48.296457+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:34:48.296915+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.v_proj using 512 samples
2026-01-23T06:34:49.162292+0000 | compress | METRIC - time 0.87s
2026-01-23T06:34:49.162855+0000 | compress | MET

(20/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.32it/s]
(21/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.72it/s]

2026-01-23T06:35:08.597887+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.q_proj using 512 samples





2026-01-23T06:35:09.545906+0000 | compress | METRIC - time 0.95s
2026-01-23T06:35:09.546467+0000 | compress | METRIC - error 7857.23
2026-01-23T06:35:09.546919+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:35:09.547174+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:35:09.547653+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.k_proj using 512 samples
2026-01-23T06:35:10.456211+0000 | compress | METRIC - time 0.91s
2026-01-23T06:35:10.456782+0000 | compress | METRIC - error 4696.80
2026-01-23T06:35:10.457225+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:35:10.457487+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:35:10.457955+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.v_proj using 512 samples
2026-01-23T06:35:11.367084+0000 | compress | METRIC - time 0.91s
2026-01-23T06:35:11.367645+0000 | compress | MET

(21/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.80it/s]
(22/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.72it/s]

2026-01-23T06:35:30.869084+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.q_proj using 512 samples





2026-01-23T06:35:31.789492+0000 | compress | METRIC - time 0.92s
2026-01-23T06:35:31.790033+0000 | compress | METRIC - error 7752.72
2026-01-23T06:35:31.790492+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:35:31.790766+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:35:31.791217+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.k_proj using 512 samples
2026-01-23T06:35:32.740740+0000 | compress | METRIC - time 0.95s
2026-01-23T06:35:32.741462+0000 | compress | METRIC - error 4578.25
2026-01-23T06:35:32.741987+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:35:32.742352+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:35:32.742777+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.v_proj using 512 samples
2026-01-23T06:35:33.691040+0000 | compress | METRIC - time 0.95s
2026-01-23T06:35:33.691592+0000 | compress | MET

(22/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.20it/s]
(23/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.71it/s]

2026-01-23T06:35:53.132664+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.q_proj using 512 samples





2026-01-23T06:35:54.081266+0000 | compress | METRIC - time 0.95s
2026-01-23T06:35:54.081813+0000 | compress | METRIC - error 7924.70
2026-01-23T06:35:54.082216+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:35:54.082598+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:35:54.082991+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.k_proj using 512 samples
2026-01-23T06:35:54.997685+0000 | compress | METRIC - time 0.91s
2026-01-23T06:35:54.998289+0000 | compress | METRIC - error 4512.09
2026-01-23T06:35:54.998723+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:35:54.999037+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:35:54.999409+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.v_proj using 512 samples
2026-01-23T06:35:55.913652+0000 | compress | METRIC - time 0.91s
2026-01-23T06:35:55.914207+0000 | compress | MET

(23/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 168.69it/s]
(24/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.71it/s]

2026-01-23T06:36:15.346299+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.q_proj using 512 samples





2026-01-23T06:36:16.256139+0000 | compress | METRIC - time 0.91s
2026-01-23T06:36:16.256715+0000 | compress | METRIC - error 7725.31
2026-01-23T06:36:16.257144+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:36:16.257400+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:36:16.257838+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.k_proj using 512 samples
2026-01-23T06:36:17.129875+0000 | compress | METRIC - time 0.87s
2026-01-23T06:36:17.130391+0000 | compress | METRIC - error 4723.49
2026-01-23T06:36:17.130849+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:36:17.131108+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:36:17.131592+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.v_proj using 512 samples
2026-01-23T06:36:18.002368+0000 | compress | METRIC - time 0.87s
2026-01-23T06:36:18.002930+0000 | compress | MET

(24/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.00it/s]
(25/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.70it/s]

2026-01-23T06:36:37.477509+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.q_proj using 512 samples





2026-01-23T06:36:38.385870+0000 | compress | METRIC - time 0.91s
2026-01-23T06:36:38.386453+0000 | compress | METRIC - error 8377.51
2026-01-23T06:36:38.386904+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:36:38.387147+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:36:38.387569+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.k_proj using 512 samples
2026-01-23T06:36:39.272581+0000 | compress | METRIC - time 0.88s
2026-01-23T06:36:39.273118+0000 | compress | METRIC - error 5162.77
2026-01-23T06:36:39.273552+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:36:39.273867+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:36:39.274239+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.v_proj using 512 samples
2026-01-23T06:36:40.180562+0000 | compress | METRIC - time 0.91s
2026-01-23T06:36:40.181200+0000 | compress | MET

(25/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 168.47it/s]
(26/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.70it/s]

2026-01-23T06:36:59.511915+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.q_proj using 512 samples





2026-01-23T06:37:00.420959+0000 | compress | METRIC - time 0.91s
2026-01-23T06:37:00.421611+0000 | compress | METRIC - error 8701.06
2026-01-23T06:37:00.422020+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:37:00.422261+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:37:00.422719+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.k_proj using 512 samples
2026-01-23T06:37:01.287776+0000 | compress | METRIC - time 0.86s
2026-01-23T06:37:01.288296+0000 | compress | METRIC - error 4485.78
2026-01-23T06:37:01.288742+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:37:01.288977+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:37:01.289397+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.v_proj using 512 samples
2026-01-23T06:37:02.160302+0000 | compress | METRIC - time 0.87s
2026-01-23T06:37:02.161060+0000 | compress | MET

(26/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 169.33it/s]
(27/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.69it/s]

2026-01-23T06:37:21.528246+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.q_proj using 512 samples





2026-01-23T06:37:22.480015+0000 | compress | METRIC - time 0.95s
2026-01-23T06:37:22.480644+0000 | compress | METRIC - error 7487.92
2026-01-23T06:37:22.481104+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:37:22.481444+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:37:22.481849+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.k_proj using 512 samples
2026-01-23T06:37:23.376929+0000 | compress | METRIC - time 0.89s
2026-01-23T06:37:23.377567+0000 | compress | METRIC - error 4801.38
2026-01-23T06:37:23.378003+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:37:23.378255+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:37:23.378724+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.v_proj using 512 samples
2026-01-23T06:37:24.326907+0000 | compress | METRIC - time 0.95s
2026-01-23T06:37:24.327564+0000 | compress | MET

(27/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 170.03it/s]
(28/29): Calibrating: 100%|██████████| 512/512 [00:10<00:00, 47.67it/s]

2026-01-23T06:37:43.795586+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.q_proj using 512 samples





2026-01-23T06:37:44.706367+0000 | compress | METRIC - time 0.91s
2026-01-23T06:37:44.706972+0000 | compress | METRIC - error 6126.65
2026-01-23T06:37:44.707407+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:37:44.707677+0000 | compress | METRIC - Compressed module size: 19.095552 MB
2026-01-23T06:37:44.708105+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.k_proj using 512 samples
2026-01-23T06:37:45.661212+0000 | compress | METRIC - time 0.95s
2026-01-23T06:37:45.661841+0000 | compress | METRIC - error 3450.79
2026-01-23T06:37:45.662323+0000 | compress | METRIC - GPU 0 | usage: 12.24% | total memory: 25 GB
2026-01-23T06:37:45.662659+0000 | compress | METRIC - Compressed module size: 6.365184 MB
2026-01-23T06:37:45.663247+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.v_proj using 512 samples
2026-01-23T06:37:46.612705+0000 | compress | METRIC - time 0.95s
2026-01-23T06:37:46.613397+0000 | compress | MET

(28/29): Propagating: 100%|██████████| 512/512 [00:03<00:00, 168.96it/s]
(29/29): Calibrating: 100%|██████████| 512/512 [00:00<00:00, 1279.18it/s]
(29/29): Propagating: 100%|██████████| 512/512 [00:00<00:00, 1444.29it/s]

2026-01-23T06:37:56.158784+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers





2026-01-23T06:37:56.188614+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 196it [00:05, 36.26it/s]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (

결과: 6.43GB -> 2.82GB로 압축 (원본 크기의 약 44%, 즉 약 56% 감소)

In [None]:
# Optional: run this only if the `lm_eval` CLI (lm-evaluation-harness) used by
# the evaluation cells below is not installed yet or needs upgrading.
!pip install -U lm_eval       

In [5]:
# Evaluate our quantized checkpoint (./Llama-3.2-3B-Instruct-W4A16-G128) with
# lm_eval using vLLM as the backend: GSM8K, 5-shot, limited to 250 examples.
# NOTE(review): unlike the original-model cell, no max_model_len is passed
# here, so vLLM falls back to the model's default context length.
!lm_eval --model vllm \
  --model_args pretrained="./Llama-3.2-3B-Instruct-W4A16-G128",add_bos_token=true \
  --tasks gsm8k \
  --num_fewshot 5 \
  --limit 250 \
  --batch_size 'auto'

[2026-01-23 06:38:44] INFO __main__.py:465: Selected Tasks: ['gsm8k']
        not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).
[2026-01-23 06:38:44] INFO evaluator.py:202: Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
[2026-01-23 06:38:44] INFO evaluator.py:240: Initializing vllm model, with arguments: {'pretrained': './Llama-3.2-3B-Instruct-W4A16-G128', 'add_bos_token': True}
[32mINFO[0m [90m01-23 06:38:44[0m [90m[utils.py:263][0m non-default args: {'seed': 1234, 'disable_log_stats': True, 'model': './Llama-3.2-3B-Instruct-W4A16-G128'}
[32mINFO[0m [90m01-23 06:38:44[0m [90m[model.py:530][0m Resolved architecture: LlamaForCausalLM
[32mINFO[0m [90m01-23 06:38:44[0m [90m[model.py:1545][0m Using max model len 131072
[32mINFO[0m [90m01-23 06:38:45[0m [90m[scheduler.py:229][0m Chunked prefill is enabled with max_num_batched_tokens=8192.
[32mINFO

In [None]:
# Evaluate the original (unquantized) model as the baseline with the same task
# settings as the quantized run: GSM8K, 5-shot, limited to 250 examples.
# The context length is capped at 4096 tokens via max_model_len.
!lm_eval --model vllm \
  --model_args pretrained="meta-llama/Llama-3.2-3B-Instruct",add_bos_token=true,dtype=auto,max_model_len=4096 \
  --tasks gsm8k \
  --num_fewshot 5 \
  --limit 250 \
  --batch_size 'auto'

**Quantized Model**

|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  | 0.64|±  |0.0304|
|     |       |strict-match    |     5|exact_match|↑  | 0.54|±  |0.0316|

**Original Model**

|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value|   |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.680|±  |0.0296|
|     |       |strict-match    |     5|exact_match|↑  |0.608|±  |0.0309|

-----


## 2.2. Pruning

In [5]:
# 2:4 sparse pruning with/without FP8 quantization
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation


# Configuration: notebook-level constants read by the helper functions and
# by the compression cell that follows.
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # Hugging Face model to compress
DATASET_ID = "HuggingFaceH4/ultrachat_200k"  # dataset used for calibration
DATASET_SPLIT = "train_sft"  # split the calibration samples are taken from
NUM_CALIBRATION_SAMPLES = 512  # number of rows loaded and passed to oneshot()
MAX_SEQUENCE_LENGTH = 2048  # tokenization limit; longer samples are truncated
QUANT_ENABLE = False  # passed to get_recipe(): True adds FP8 quantization after pruning

def preprocess(example):
    """Render one conversation into a single prompt string.

    Reads ``example["messages"]`` and formats it with the notebook-level
    ``tokenizer``'s chat template, without tokenizing.

    Returns:
        dict: ``{"text": <rendered conversation>}``.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


def tokenize(sample):
    """Tokenize the rendered text of one dataset example.

    Passes ``sample["text"]`` to the notebook-level ``tokenizer`` with
    truncation at ``MAX_SEQUENCE_LENGTH`` tokens, no padding, and no extra
    special tokens (presumably because the chat template has already
    inserted them -- confirm against the tokenizer's template).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    text = sample["text"]
    return tokenizer(
        text,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        padding=False,
        add_special_tokens=False,
    )


def get_recipe(fp8_enabled):
    """
    Build the compression recipe and the directory name to save the result in.

    The recipe always starts with a SparseGPT modifier configured for 50%
    sparsity with a 2:4 mask structure. When ``fp8_enabled`` is truthy, an
    FP8_DYNAMIC quantization modifier targeting ``Linear`` modules (ignoring
    ``lm_head``) is appended and a different save-directory suffix is used.

    Args:
        fp8_enabled: Whether to add FP8 quantization after pruning.

    Returns:
        tuple: ``(recipe, save_dir)`` where ``recipe`` is a list of modifiers
        and ``save_dir`` is the last path component of ``MODEL_ID`` followed
        directly by the variant suffix.

    Raises:
        ValueError: If the quantization scheme is not a preset name (str) and
            its weight quantization is asymmetric.
    """
    pruning = SparseGPTModifier(
        sparsity=0.5,
        mask_structure="2:4",
        targets=[r"re:model.layers.\d*$"],
    )
    # Last path component of the model id, e.g. "org/name" -> "name".
    model_name = MODEL_ID.rstrip("/").split("/")[-1]

    if not fp8_enabled:
        return [pruning], model_name + "2of4-sparse"

    quantization = QuantizationModifier(
        targets=["Linear"],
        ignore=["lm_head"],
        scheme="FP8_DYNAMIC",
    )

    # Guard against asymmetric weight quantization. Only a non-string scheme
    # is inspected; a preset scheme name skips this check.
    scheme = quantization.scheme
    if not isinstance(scheme, str) and not scheme["weights"].symmetric:
        raise ValueError(
            "Asymmetric quantization with 2of4 sparsity is not supported by vLLM. "
            "Please use symmetric quantization"
        )

    return [pruning, quantization], model_name + "2of4-W8A8-FP8-Dynamic-Per-Token"

In [None]:
# Load model and tokenizer.
# NOTE: `tokenizer` must be bound at notebook level before the dataset is
# mapped below, because preprocess() and tokenize() read it as a global.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype='auto')
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load and preprocess dataset: take the first NUM_CALIBRATION_SAMPLES rows of
# the split and shuffle them with a fixed seed, render each conversation to
# text, then tokenize. remove_columns drops every column present before
# tokenization (including the rendered "text").
ds = load_dataset(
    DATASET_ID, 
    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
).shuffle(seed=47)
ds = ds.map(preprocess)
ds = ds.map(tokenize, remove_columns=ds.column_names)

# Get compression recipe and save directory (QUANT_ENABLE selects whether FP8
# quantization is added on top of 2:4 pruning).
recipe, save_dir = get_recipe(QUANT_ENABLE)

# Apply compression: run the recipe on `model` with the calibration dataset.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Validate the compressed model: generate up to 100 new tokens from a short
# prompt and print the decoded result as a quick sanity check.
print("\n========== SAMPLE GENERATION ==============")
# presumably prepares/places the model for generation -- see llmcompressor docs
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n")

# Save compressed model and tokenizer into the same directory so the
# checkpoint can be loaded from that path later.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Inference pruned model via vLLM
from vllm import LLM, SamplingParams


# Directory name matching what get_recipe() produces for this model when FP8
# quantization is disabled (model name followed directly by "2of4-sparse").
model_path = r"Llama-3.2-3B-Instruct2of4-sparse"

# NOTE: this rebinds the notebook-level name `model` to a vLLM engine.
model = LLM(
    model=model_path,    
    gpu_memory_utilization=0.7, # use 70% of GPU memory (adjust as needed)
    tensor_parallel_size=1,   # use a single GPU
    enforce_eager=True,      # turn on compatibility mode (if needed)
    dtype='auto'
)

# Sampling settings for generation: temperature 0.8, nucleus sampling with
# top_p 0.95, at most 256 generated tokens.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=256
    )

prompt = "Hello, My name is:"

outputs = model.generate(prompt, sampling_params)

# Print the text of the first completion for each returned request output.
for output in outputs:
    print(output.outputs[0].text)