# System-Level Quantization

In [11]:
import torch
import time
import matplotlib.pyplot as plt
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# --- CONFIGURATION (Category 7: Quantization) ---
# Hugging Face model identifier; used for both the vLLM engine and the tokenizer.
MODEL_NAME = "gpt2-large"
# Quantization scheme handed to vLLM's `quantization=` argument. AWQ was picked
# over BitsAndBytes on the expectation that vLLM supports it better for older
# models.
# NOTE(review): the run recorded below this cell fails with "Cannot find the
# config file for awq" -- presumably the checkpoint must already be
# AWQ-quantized; confirm and point MODEL_NAME at such a checkpoint if so.
QUANTIZATION_METHOD = "awq"
# Length budget passed to the inference loop.
GENERATION_LENGTH = 512
# Sampling temperature (not referenced by the code shown in this cell).
TEMPERATURE = 0.8
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- VRAM MONITORING HELPER (rest of the functions remain the same) ---
def get_vram_usage():
    """Return the memory PyTorch has allocated on GPU 0, in GB (1024**3 bytes).

    Returns 0 when the benchmark is not running on CUDA.
    """
    if DEVICE != "cuda":
        return 0

    # Let pending GPU work finish so the reading is taken at a stable point.
    torch.cuda.synchronize()
    allocated_bytes = torch.cuda.memory_allocated(0)
    return allocated_bytes / (1024**3)

def run_quantized_inference(llm_instance, prompt, length, label):
    """Greedy-decode from ``prompt`` and sample VRAM usage after every token.

    Args:
        llm_instance: Object whose ``.model`` attribute is called like a
            Hugging Face causal LM: it must accept ``input_ids``,
            ``past_key_values`` and ``use_cache`` and return an object with
            ``logits`` and ``past_key_values``.
        prompt: Text the generation is conditioned on.
        length: Sequence-length budget. At most ``length`` decode steps run,
            and decoding stops as soon as prompt tokens plus generated tokens
            reach ``length``.
        label: Human-readable name printed when the run starts.

    Returns:
        A tuple ``(token_indices, vram_history, None)``. ``token_indices[k]``
        is the total sequence length (prompt included) after step ``k`` and
        ``vram_history[k]`` is the matching ``get_vram_usage()`` reading.
    """
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

    vram_history = []
    token_indices = []

    # NOTE(review): the tokenizer comes from the global MODEL_NAME, not from
    # ``llm_instance`` -- the two must refer to the same model.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

    # NOTE(review): assumes ``llm_instance`` exposes an HF-style module as
    # ``.model``; verify that the vLLM ``LLM`` object actually provides this.
    hf_model = llm_instance.model

    past_key_values = None
    # Counts prompt tokens too, so recorded indices are full sequence lengths.
    generated_tokens_count = input_ids.size(1)

    print(f"Starting inference for: {label}")

    with torch.no_grad():
        for _ in range(length):
            # With no cache yet the model must see the whole prompt; feeding
            # only the last token would silently drop the rest of the context.
            # Once a cache exists, only the newest token needs processing.
            if past_key_values is None:
                step_input = input_ids
            else:
                step_input = input_ids[:, -1:]

            outputs = hf_model(
                input_ids=step_input,
                past_key_values=past_key_values,
                use_cache=True,
            )
            past_key_values = outputs.past_key_values

            # Greedy choice: most likely token at the final position.
            next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(-1)

            input_ids = torch.cat([input_ids, next_token_id], dim=-1)
            generated_tokens_count += 1

            vram_history.append(get_vram_usage())
            token_indices.append(generated_tokens_count)

            if generated_tokens_count >= length:
                break

    return token_indices, vram_history, None


def main():
    """Load the quantized engine, trace VRAM while decoding, and plot the result.

    Steps:
      1. Abort early when no CUDA device is available.
      2. Build the vLLM engine with the configured quantization method; on any
         failure print the error details and return.
      3. Run the decode loop and collect per-token VRAM readings.
      4. Derive a simulated FP16 curve by scaling the measured per-token VRAM
         growth by a fixed factor, then plot both curves with the difference
         at the final token annotated.
    """
    if DEVICE == "cpu":
        print("Cannot run benchmark. Please use a CUDA-enabled GPU.")
        return

    # 1. LOAD THE MODEL (QUANTIZED)
    print(f"--- Loading {MODEL_NAME} with {QUANTIZATION_METHOD} (8-bit/4-bit) ---")

    engine_kwargs = dict(
        model=MODEL_NAME,
        quantization=QUANTIZATION_METHOD,
        dtype="auto",
        gpu_memory_utilization=0.75,
        enforce_eager=True,
    )

    try:
        load_started = time.perf_counter()
        llm_quant = LLM(**engine_kwargs)
        elapsed = time.perf_counter() - load_started
        print(f"✅ Engine loaded successfully in {elapsed:.2f} seconds.")
    except Exception as e:
        # Top-level boundary: report the failure and stop instead of crashing.
        print(f"\n❌ ERROR: Failed to load {MODEL_NAME} with quantization method '{QUANTIZATION_METHOD}'.")
        print("You must use a quantization method that has a dedicated, supported backend in your vLLM installation for GPT2-Large.")
        print(f"Detail: {e}")
        return

    # 2. RUN QUANTIZED TEST AND SIMULATE BASELINE
    prompt = "The primary purpose of using Key-Value cache quantization in LLMs is to achieve"
    run_label = f"Quantized ({QUANTIZATION_METHOD})"
    quant_indices, quant_vram, _ = run_quantized_inference(
        llm_quant, prompt, GENERATION_LENGTH, run_label
    )

    # --- SIMULATE FP16 BASELINE ---
    # At least two samples spanning distinct indices are needed for a slope.
    if len(quant_vram) <= 1 or quant_indices[-1] <= quant_indices[0]:
        print("Not enough data points collected for VRAM plotting.")
        return

    first_index = quant_indices[0]
    index_span = quant_indices[-1] - first_index
    quant_slope = (quant_vram[-1] - quant_vram[0]) / index_span

    # The baseline is not measured: it is the quantized slope scaled by a constant.
    SIMULATION_FACTOR = 3.0
    fp16_slope = quant_slope * SIMULATION_FACTOR

    fp16_vram_simulated = []
    for idx in quant_indices:
        fp16_vram_simulated.append(quant_vram[0] + (fp16_slope * (idx - first_index)))

    # 3. PLOT THE RESULTS
    plt.figure(figsize=(10, 6))

    plt.plot(
        quant_indices,
        quant_vram,
        label=f'Quantized KV Cache ({QUANTIZATION_METHOD})',
        color='orange',
        linewidth=2,
    )
    plt.plot(
        quant_indices,
        fp16_vram_simulated,
        label='Simulated FP16 Baseline (Full Cache)',
        color='blue',
        linestyle='--',
    )

    plt.xlabel('Generated Token Number (Sequence Length)')
    plt.ylabel('Total VRAM Usage (GB)')
    plt.title(f'VRAM Footprint: Quantized vs. Simulated Baseline on {MODEL_NAME}')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)

    # The guard above guarantees the series are non-empty from here on.
    final_quant = quant_vram[-1]
    final_fp16 = fp16_vram_simulated[-1]
    max_saved_gb = final_fp16 - final_quant
    plt.text(
        quant_indices[-1] * 0.6,
        (final_quant + final_fp16) / 2,
        f'Memory Saved: {max_saved_gb:.2f} GB',
        fontsize=10,
        color='green',
        bbox=dict(facecolor='yellow', alpha=0.5),
    )

    plt.ylim(bottom=min(quant_vram) * 0.95)
    plt.tight_layout()
    plt.show()

# Run the benchmark only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

--- Loading gpt2-large with awq (8-bit/4-bit) ---
INFO 10-03 19:25:35 [utils.py:328] non-default args: {'gpu_memory_utilization': 0.75, 'disable_log_stats': True, 'quantization': 'awq', 'enforce_eager': True, 'model': 'gpt2-large'}


INFO 10-03 19:25:36 [__init__.py:742] Resolved architecture: GPT2LMHeadModel
INFO 10-03 19:25:37 [__init__.py:2764] Downcasting torch.float32 to torch.bfloat16.
INFO 10-03 19:25:37 [__init__.py:1815] Using max model len 1024
INFO 10-03 19:25:37 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.

❌ ERROR: Failed to load gpt2-large with quantization method 'awq'.
You must use a quantization method that has a dedicated, supported backend in your vLLM installation for GPT2-Large.
Detail: 1 validation error for VllmConfig
  Value error, Cannot find the config file for awq [type=value_error, input_value=ArgsKwargs((), {'model_co...additional_config': {}}), input_type=ArgsKwargs]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error
