# 1. Installation

In [1]:
!pip install --upgrade pip
!pip install llmcompressor

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.3
[0mCollecting llmcompressor
  Downloading llmcompressor-0.9.0.1-py3-none-any.whl.metadata (12 kB)
Collecting transformers<=4.57.3,>=4.54.0 (from llmcompressor)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets<=4.4.1,>=4.0.0 (from llmcompressor)
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-ml-py<=13.590.44,>=12.560.30 (from llmcompressor)
  Downloading nvidia_ml_py-13.590.44-py3-none-any.whl.metadata (9.8 kB)
Collecting pyarrow>=21.0.0 (from datasets<=4.4.1,>=4.0.0->l

In [4]:
from huggingface_hub import login

!pip install ipywidgets

login()     # Your huggingface access token.

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 2. Compress Model

## 2.1. Quantization

See [Quantization Schemes](https://docs.vllm.ai/projects/llm-compressor/en/0.8.1/guides/compression_schemes/) for guidance on choosing a quantization method.

In [None]:
# INT W8A8 quantization with SmoothQuant and GPTQ
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot


# Recipe: SmoothQuant (smoothing strength 0.8) followed by GPTQ, which
# quantizes every Linear layer except lm_head with the W8A8 scheme.
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(scheme="W8A8", targets="Linear", ignore=["lm_head"]),
]


# The recipe above is 8-bit (W8A8), so the output directory is labelled INT8
# (it was previously mislabelled INT4). The inference cell reads SAVE_DIR, so
# it picks up this name automatically.
SAVE_DIR = "TinyLlama-1.1B-Chat-v1.0-INT8"

# Calibrate on 512 samples of the built-in "open_platypus" dataset and write
# the compressed model to SAVE_DIR.
oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=recipe,
    output_dir=SAVE_DIR,
    max_seq_length=2048,
    num_calibration_samples=512,
)

In [None]:
# Run inference on the quantized model with vLLM
from vllm import LLM, SamplingParams


# Serve the checkpoint that the quantization cell wrote to SAVE_DIR.
model_path = SAVE_DIR

model = LLM(
    model=model_path,
    tensor_parallel_size=1,  # use a single GPU
    gpu_memory_utilization=0.7,  # use 70% of GPU memory (adjust as needed)
    # enforce_eager=True  # turn on compatibility mode (if needed)
)

sampling_params = SamplingParams(max_tokens=256)

outputs = model.generate("What is machine learning?", sampling_params)

# Print the text of the first completion of every returned result.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(first_completion.text)

INFO 01-22 11:08:43 [utils.py:263] non-default args: {'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'model': './TinyLlama-1.1B-Chat-v1.0-INT8'}
INFO 01-22 11:08:43 [model.py:530] Resolved architecture: LlamaForCausalLM
INFO 01-22 11:08:43 [model.py:1545] Using max model len 2048
INFO 01-22 11:08:44 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-22 11:08:44 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 01-22 11:08:44 [vllm.py:637] Disabling NCCL for DP synchronization when using async scheduling.
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:44 [core.py:97] Initializing a V1 LLM engine (v0.14.0) with config: model='./TinyLlama-1.1B-Chat-v1.0-INT8', speculative_config=None, tokenizer='./TinyLlama-1.1B-Chat-v1.0-INT8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parall

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:46 [default_loader.py:291] Loading weights took 0.21 seconds
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:46 [gpu_model_runner.py:3905] Model loading took 1.15 GiB memory and 0.460047 seconds
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:51 [backends.py:644] Using cache directory: /root/.cache/vllm/torch_compile_cache/c0332d605c/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:51 [backends.py:704] Dynamo bytecode transform time: 4.47 s
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:55 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:59 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 6.62 s
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:08:59 [monitor.py:34] torch.compile takes 11.09 s in total
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:09:00 

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:00<00:00, 61.46it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:00<00:00, 60.04it/s]


[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:09:03 [gpu_model_runner.py:4856] Graph capturing finished in 2 secs, took 0.44 GiB
[0;36m(EngineCore_DP0 pid=925)[0;0m INFO 01-22 11:09:03 [core.py:273] init engine (profile, create kv cache, warmup model) took 16.58 seconds
INFO 01-22 11:09:03 [llm.py:347] Supported tasks: ['generate']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


2.2 How does machine learning work in image classification?
2.3 How does machine learning perform better than human classification algorithms?
2.4 How is model selection and evaluation performed in automated machine learning systems?
2.5 What are the pros and cons of using machine learning algorithms for image classification?
3.1 How is data preprocessing used in machine learning algorithms?
3.2 What is the DL4J framework for data preprocessing in machine learning?
3.3 How does the KNN algorithm perform in data preprocessing and classification tasks?
3.4 How does the Naive Bayes algorithm perform in data preprocessing and classification tasks?
3.5 What sets Noah's Alternative framework apart from other machine learning frameworks for image classification?

Learning outcomes:
1. Understanding the definition of machine learning, its various types of algorithms, and their applications.
2. Understanding how machine learning works in image classification.
3. Identifying the role of preproc

In [2]:
# INT4 weight-only (W4A16, asymmetric) quantization with AWQ
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation


def preprocess(example):
    """Render one dataset example's chat messages as a single text string.

    Reads ``example["messages"]`` and passes it through the module-level
    ``tokenizer``'s chat template without tokenizing.

    Returns:
        A dict with a single ``"text"`` key holding the rendered string.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


# Tokenize inputs.
def tokenize(sample):
    """Tokenize ``sample["text"]`` with the module-level ``tokenizer``.

    The text is truncated to ``MAX_SEQUENCE_LENGTH`` tokens, is not padded,
    and no special tokens are added by this call.

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    tokenizer_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return tokenizer(sample["text"], **tokenizer_options)


# Select model and load it.
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

# Resolve the output directory before the long calibration run starts. If the
# cell is interrupted part-way through, SAVE_DIR then already refers to this
# model instead of silently keeping the value set by an earlier cell, so a
# later inference cell cannot load the wrong checkpoint by accident.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
# trust_remote_code is left at its default, matching how the tokenizer is
# loaded in the pruning section of this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess: take the first NUM_CALIBRATION_SAMPLES rows,
# shuffle them with a fixed seed, and render each chat into a "text" column.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=47)
ds = ds.map(preprocess)

# Configure the quantization algorithm to run: AWQ with the W4A16_ASYM scheme
# on every Linear layer except lm_head.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk compressed.
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/256 [00:00<?, ? examples/s]

2026-01-22T15:03:06.293162+0000 | reset | INFO - Compression lifecycle reset
2026-01-22T15:03:06.294111+0000 | from_modifiers | INFO - Creating recipe from modifiers
2026-01-22T15:03:06.308228+0000 | on_initialize | INFO - No AWQModifier.mappings provided, inferring from model...
2026-01-22T15:03:06.319885+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2026-01-22T15:03:06.320172+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `AWQModifier`


Preparing cache: 100%|██████████| 256/256 [00:00<00:00, 2613.20it/s]
(1/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 208.18it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.20s/it]
(1/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 453.96it/s]
(2/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 204.50it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.19s/it]
(2/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 492.74it/s]
(3/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 200.87it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.19s/it]
(3/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 507.73it/s]
(4/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 205.72it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.23s/it]
(4/17): Propagating: 100%|██████████| 256/256 [00:00<00:00, 510.65it/s]
(5/17): Calibrating: 100%|██████████| 256/256 [00:01<00:00, 206.62it/s]
Smoothing: 100%|██████████| 3/3 [00:09<00:00,  3.22s/it

KeyboardInterrupt: 

In [None]:
from vllm import LLM, SamplingParams


# Serve the checkpoint saved under SAVE_DIR.
model_path = SAVE_DIR

model = LLM(
    model=model_path,
    tensor_parallel_size=1,  # use a single GPU
    gpu_memory_utilization=0.7,  # use 70% of GPU memory (adjust as needed)
    # enforce_eager=True  # turn on compatibility mode (if needed)
)

sampling_params = SamplingParams(max_tokens=256)

outputs = model.generate("Who Are you? Explain yourself.", sampling_params)

# Print the text of the first completion of every returned result.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(first_completion.text)

## 2.2. Pruning

In [5]:
# 2:4 sparse pruning with/without FP8 quantization
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation


# Configuration
# Model to prune.
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# Passed to get_recipe(): when True, FP8 quantization is added on top of the
# 2:4 sparsity step.
QUANT_ENABLE = False

# Calibration data: which dataset/split to read, how many samples to take,
# and the maximum sequence length used when tokenizing them.
DATASET_ID, DATASET_SPLIT = "HuggingFaceH4/ultrachat_200k", "train_sft"
MAX_SEQUENCE_LENGTH = 2048
NUM_CALIBRATION_SAMPLES = 512

def preprocess(example):
    """Turn an example's ``"messages"`` into one chat-templated text string.

    Uses the module-level ``tokenizer`` and does not tokenize the result.

    Returns:
        ``{"text": <rendered string>}``.
    """
    chat_text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
    )
    return dict(text=chat_text)


def tokenize(sample):
    """Run the module-level ``tokenizer`` over ``sample["text"]``.

    Truncates to ``MAX_SEQUENCE_LENGTH``, applies no padding, and does not
    add special tokens in this call.
    """
    text = sample["text"]
    encoded = tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        padding=False,
    )
    return encoded


def get_recipe(fp8_enabled):
    """Build the compression recipe and choose the matching output directory.

    The recipe always starts with a SparseGPT step (50% sparsity, "2:4" mask
    structure). When ``fp8_enabled`` is truthy, an FP8_DYNAMIC quantization
    step over all Linear layers except ``lm_head`` is added after it.

    Args:
        fp8_enabled: Whether to add the FP8 quantization step.

    Returns:
        A ``(recipe, save_dir)`` tuple: the list of modifiers, and a directory
        name built from the last path component of ``MODEL_ID`` plus a suffix
        describing the compression that was selected.

    Raises:
        ValueError: If the quantization scheme is not given as a preset name
            (a string) and its weight settings are not symmetric.
    """
    sparsity_step = SparseGPTModifier(
        sparsity=0.5,
        mask_structure="2:4",
        targets=[r"re:model.layers.\d*$"],
    )
    model_name = MODEL_ID.rstrip("/").split("/")[-1]

    # Sparsity-only: nothing more to add.
    if not fp8_enabled:
        return [sparsity_step], model_name + "2of4-sparse"

    quantization_step = QuantizationModifier(
        targets=["Linear"],
        ignore=["lm_head"],
        scheme="FP8_DYNAMIC",
    )

    # check that asymmetric quantization is not being used
    scheme = quantization_step.scheme
    scheme_is_preset_name = isinstance(scheme, str)
    if not scheme_is_preset_name and not scheme["weights"].symmetric:
        raise ValueError(
            "Asymmetric quantization with 2of4 sparsity is not supported by vLLM. "
            "Please use symmetric quantization"
        )

    return (
        [sparsity_step, quantization_step],
        model_name + "2of4-W8A8-FP8-Dynamic-Per-Token",
    )

In [None]:
# Load the model weights and the matching tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype='auto')
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Build the calibration set: first NUM_CALIBRATION_SAMPLES rows of the split,
# shuffled with a fixed seed, chat-templated, then tokenized.
calibration_split = f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
ds = load_dataset(DATASET_ID, split=calibration_split)
ds = ds.shuffle(seed=47)
ds = ds.map(preprocess)
# Drop every existing column so only the tokenizer's output remains.
ds = ds.map(tokenize, remove_columns=ds.column_names)

# Recipe and output directory both depend on the QUANT_ENABLE switch
recipe, save_dir = get_recipe(QUANT_ENABLE)

# Run one-shot compression over the calibration set
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Sanity check: generate a short continuation with the compressed model
print("\n========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
prompt_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids
prompt_ids = prompt_ids.to(model.device)
generated = model.generate(prompt_ids, max_new_tokens=100)
print(tokenizer.decode(generated[0]))
print("==========================================\n")

# Write the compressed model and its tokenizer to save_dir
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Run inference on the pruned model with vLLM
from vllm import LLM, SamplingParams


# Directory name produced by get_recipe() when FP8 quantization is disabled.
model_path = r"Llama-3.2-3B-Instruct2of4-sparse"

model = LLM(
    model=model_path,
    dtype='auto',
    tensor_parallel_size=1,  # use a single GPU
    gpu_memory_utilization=0.7,  # use 70% of GPU memory (adjust as needed)
    enforce_eager=True,  # turn on compatibility mode (if needed)
)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)

prompt = "Hello, My name is:"

outputs = model.generate(prompt, sampling_params)

# Print the text of the first completion of every returned result.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(first_completion.text)