# System-Level Quantization

In [16]:
import torch
import time
import matplotlib.pyplot as plt
from vllm import LLM, SamplingParams
import gc

# --- CONFIGURATION ---
# Model identifier that main() forwards to vLLM's LLM(model=...).
MODEL_NAME = "gpt2-large"
# Passed as SamplingParams(max_tokens=...): upper bound on newly generated tokens.
# NOTE(review): the engine log reports "max model len 1024" for this model, so
# prompt + generated tokens presumably cannot reach this bound -- confirm.
GENERATION_LENGTH = 1024
# Sampling temperature passed to SamplingParams.
TEMPERATURE = 0.8
# Device string used to gate the benchmark; "cuda" only when PyTorch sees a GPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- VRAM MONITORING HELPER ---
def get_vram_usage(device: int = 0) -> float:
    """Return the CUDA memory currently allocated by this process, in GiB.

    The value comes from ``torch.cuda.memory_allocated``, i.e. memory held by
    tensors created through PyTorch in the *calling* process. The callers
    print it with a "GB" label, but the divisor is 1024**3 (GiB).

    NOTE(review): memory owned by other processes is not included. The engine
    log in this notebook shows the vLLM core running under its own PID, so
    this number may not reflect the model or KV cache at all -- confirm, and
    consider a device-wide reading if that is what the benchmark needs.

    Args:
        device: CUDA device index to query. Defaults to 0, which matches the
            previously hard-coded behaviour.

    Returns:
        Allocated memory in GiB as a float; ``0.0`` when DEVICE is not "cuda".
    """
    if DEVICE != "cuda":
        # Always hand back a float so callers get a consistent type.
        return 0.0
    return torch.cuda.memory_allocated(device) / (1024**3)

def run_inference_and_measure(model_path, label):
    """Load a model with vLLM, generate once, and report allocator growth.

    The function builds an ``LLM`` engine, records ``get_vram_usage()`` before
    and after a single ``generate`` call, and treats the difference as the
    KV-cache footprint.

    NOTE(review): the before/after delta is only a proxy for KV-cache size.
    vLLM is configured here with ``gpu_memory_utilization=0.90`` and
    presumably reserves its cache blocks at engine start-up rather than during
    generation, and the engine log shows the core running in a separate
    process. Either would make the measured delta close to zero -- verify
    before trusting the numbers.

    NOTE(review): the printed labels say "FP16", but with ``dtype="auto"`` the
    engine log reports a downcast to bfloat16. Both are 16-bit, so the 2-byte
    assumption used by callers still holds.

    Args:
        model_path: Model name or path forwarded to ``LLM(model=...)``.
        label: Human-readable name printed in the run banner.

    Returns:
        A tuple ``(vram_before, vram_for_cache, num_tokens)``: the reading
        taken after the engine was built, the growth across generation, and
        the number of generated token ids in the first completion.

    Raises:
        Whatever ``LLM(...)`` or ``llm.generate`` raise; once the engine has
        been constructed, cleanup still runs before the exception propagates.
    """
    print(f"--- Running test for: {label} ---")

    llm = LLM(
        model=model_path,
        kv_cache_dtype="auto",  # Follow the model dtype (16-bit on this setup)
        dtype="auto",
        gpu_memory_utilization=0.90,
        enforce_eager=True,
    )

    # From here on the engine holds GPU memory, so release it on every exit
    # path. A leaked engine in a long-lived notebook kernel keeps its
    # reservation and can make the next engine start-up fail.
    try:
        vram_before = get_vram_usage()
        print(f"VRAM before generation (base model): {vram_before:.3f} GB")

        prompt = "The primary purpose of using Key-Value cache quantization in Large Language Models is to achieve"
        sampling_params = SamplingParams(temperature=TEMPERATURE, max_tokens=GENERATION_LENGTH)

        outputs = llm.generate(prompt, sampling_params)

        vram_after = get_vram_usage()
        print(f"VRAM after generation (model + cache): {vram_after:.3f} GB")

        # Only generated tokens are counted; prompt tokens are not included.
        num_tokens = len(outputs[0].outputs[0].token_ids)
        vram_for_cache = vram_after - vram_before
        print(f"Generated {num_tokens} tokens.")
        print(f"VRAM used by FP16 KV Cache: {vram_for_cache:.3f} GB")
        print("-" * 30)

        return vram_before, vram_for_cache, num_tokens
    finally:
        # Drop the only local reference, then ask Python and PyTorch to hand
        # memory back before the next run.
        del llm
        gc.collect()
        torch.cuda.empty_cache()

def main():
    """Run the baseline measurement, derive an INT8 estimate, and plot both.

    Exits early with a message when DEVICE is "cpu". Otherwise it calls
    ``run_inference_and_measure`` once, halves the measured cache figure to
    model a 1-byte-per-value cache, prints a summary, and shows a stacked bar
    chart of base-model and cache memory for the two scenarios.
    """
    if DEVICE == "cpu":
        print("❌ This benchmark requires a CUDA-enabled GPU to measure VRAM.")
        return

    # --- Step 1: one real run provides the 16-bit baseline numbers ---
    base_gb, cache_fp16_gb, num_tokens = run_inference_and_measure(
        MODEL_NAME,
        "Baseline (FP16 KV Cache)",
    )

    # --- Step 2: estimate the INT8 cache ---
    # One byte per value instead of two, so the estimate is exactly half of
    # the measured figure; nothing is re-run with a quantized cache.
    cache_int8_gb = cache_fp16_gb / 2.0
    saved_gb = cache_fp16_gb - cache_int8_gb

    print("\n--- Simulation Summary ---")
    print(f"Sequence length: {num_tokens} tokens")
    print(f"Measured FP16 KV Cache size:  {cache_fp16_gb:.3f} GB")
    print(f"Simulated INT8 KV Cache size: {cache_int8_gb:.3f} GB")
    print(f"✅ VRAM saved on cache: {saved_gb:.3f} GB (a 50% reduction)")

    # --- Step 3: stacked bars, base model below and cache on top ---
    bar_labels = ['Baseline (FP16 Cache)', 'Simulated (INT8 Cache)']
    cache_heights = [cache_fp16_gb, cache_int8_gb]
    # The base model footprint is identical in both scenarios.
    base_heights = [base_gb] * len(bar_labels)

    plt.figure(figsize=(10, 7))
    plt.bar(bar_labels, base_heights, label='Base Model VRAM', color='#4c72b0')
    plt.bar(
        bar_labels,
        cache_heights,
        bottom=base_heights,
        label='KV Cache VRAM',
        color=['#c44e52', '#f5b041'],
    )

    # Annotate each bar with its combined height.
    for bar_label, cache_height in zip(bar_labels, cache_heights):
        total_gb = base_gb + cache_height
        plt.text(bar_label, total_gb, f'Total: {total_gb:.2f} GB', ha='center', va='bottom', fontsize=11)

    plt.ylabel('Total VRAM Usage (GB)')
    plt.title(f'VRAM Usage: Real FP16 vs. Simulated INT8 KV Cache ({num_tokens} tokens)')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Run the benchmark only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()

--- Running test for: Baseline (FP16 KV Cache) ---
INFO 10-03 19:37:34 [utils.py:328] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'model': 'gpt2-large'}
INFO 10-03 19:37:35 [__init__.py:742] Resolved architecture: GPT2LMHeadModel
INFO 10-03 19:37:36 [__init__.py:2764] Downcasting torch.float32 to torch.bfloat16.
INFO 10-03 19:37:36 [__init__.py:1815] Using max model len 1024
INFO 10-03 19:37:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 10-03 19:37:36 [__init__.py:3400] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=9439)[0;0m INFO 10-03 19:37:38 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=9439)[0;0m INFO 10-03 19:37:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='gpt2-large', speculative_config=None, tokenizer='gpt2-large', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False,

[1;36m(EngineCore_DP0 pid=9439)[0;0m Process EngineCore_DP0:
[1;36m(EngineCore_DP0 pid=9439)[0;0m Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=9439)[0;0m   File "/home/sriney/miniconda3/envs/hawai/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
[1;36m(EngineCore_DP0 pid=9439)[0;0m     self.run()
[1;36m(EngineCore_DP0 pid=9439)[0;0m   File "/home/sriney/miniconda3/envs/hawai/lib/python3.10/multiprocessing/process.py", line 108, in run
[1;36m(EngineCore_DP0 pid=9439)[0;0m     self._target(*self._args, **self._kwargs)
[1;36m(EngineCore_DP0 pid=9439)[0;0m   File "/home/sriney/miniconda3/envs/hawai/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 722, in run_engine_core
[1;36m(EngineCore_DP0 pid=9439)[0;0m     raise e
[1;36m(EngineCore_DP0 pid=9439)[0;0m   File "/home/sriney/miniconda3/envs/hawai/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
[1;36m(EngineCore_DP0 pid=9439)[0;0m     engine_

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}