In [1]:
import os
# 🛑 CRITICAL FIX 1: Set PyTorch CUDA memory configuration
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" 

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download 
import copy # <-- ADDED IMPORT

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
model_id = "Qwen/Qwen1.5-MoE-A2.7B"       
gpu_device = "cuda:0"                     
cpu_device = "cpu"                         
dtype = torch.float16                      

# Use a conservative VRAM budget
max_memory = {
    0: "8GiB",        # Only 8 GB VRAM for the essential trunk/router
    "cpu": "300GiB"   # Ample CPU memory for offloaded weights
}

# ------------------------------------------------------------
# Step 1 & 2 (No changes)
# ------------------------------------------------------------
print(f"1. Loading config and creating model skeleton for {model_id}...")
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

print("2. Inferring device map (8GB VRAM limit enforced)...")
no_split = getattr(empty_model, "_no_split_modules", ["QwenBlock", "Block"])

device_map = infer_auto_device_map(
    empty_model,
    max_memory=max_memory, 
    no_split_module_classes=no_split,
)

for name in list(device_map.keys()):
    if "experts" in name:
        device_map[name] = cpu_device

# ------------------------------------------------------------
# Step 3 (No changes)
# ------------------------------------------------------------
print("3. Downloading checkpoint and dispatching weights...")
local_checkpoint_folder = snapshot_download(
    model_id,
    allow_patterns=["*.safetensors", "*.bin", "config.json", "*.json"]
)

model = load_checkpoint_and_dispatch(
    empty_model,
    checkpoint=local_checkpoint_folder, 
    device_map=device_map,
    no_split_module_classes=no_split,
    dtype=dtype,
    offload_folder=None,
)
print("   Model successfully loaded and dispatched across GPU and CPU RAM.")

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# ------------------------------------------------------------
# Step 4: Patch experts for dynamic CUDA computation (FIXED)
# ------------------------------------------------------------
print("4. Applying dynamic offloading patch to all MoE experts...")
def patch_expert_for_cuda_compute(expert: nn.Module, gpu_device="cuda:0"):
    # Store a reference to the original, non-patched forward method for the CUDA clone
    # This assumes the expert module is a standard nn.Module with a standard forward
    # and has not been hooked (since it was just loaded onto CPU RAM).
    
    # We must retrieve the UNBOUND method here to manually re-create the module class
    # The actual implementation of the expert is Qwen2MoeMLP in Qwen1.5-MoE
    original_expert_class = expert.__class__ 
    cuda_dev = torch.device(gpu_device)

    def wrapped_forward(*args, **kwargs):
        # 1. Move inputs to CUDA
        args = [a.to(cuda_dev) if torch.is_tensor(a) else a for a in args]
        kwargs = {k: (v.to(cuda_dev) if torch.is_tensor(v) else v) for k, v in kwargs.items()}

        # 2. CRITICAL FIX 3: Manually instantiate a new module on CUDA and load weights
        # Avoids the deep recursion issue caused by expert.to_empty() and accelerate hooks.
        
        # Get the config from the parent module if available, or just instantiate.
        # Assuming the expert constructor takes no arguments or is wrapped by config
        # This is a safe way to clone the structure.
        
        # NOTE: The expert module (Qwen2MoeMLP) is a simple structure, so this works:
        expert_gpu = original_expert_class(config=expert.config).to(cuda_dev).to(dtype)
        
        # Load weights from CPU (DRAM) into the CUDA clone
        expert_gpu.load_state_dict(expert.state_dict(), strict=True)
        
        # 3. Run forward on the temporary CUDA module
        with torch.no_grad():
            out = expert_gpu(*args, **kwargs)

        # 4. Free the CUDA clone to release VRAM immediately
        del expert_gpu
        torch.cuda.empty_cache()
        return out

    # We replace the forward of the CPU-resident module with our wrapper
    expert.forward = wrapped_forward

# Apply the patch to every expert ModuleList
for name, module in model.named_modules():
    if isinstance(module, nn.ModuleList) and "experts" in name:
        for expert in module:
            # Check if the expert is actually on the CPU before patching
            # The device map might have put small experts on CUDA.
            if next(expert.parameters()).device.type == cpu_device:
                patch_expert_for_cuda_compute(expert, gpu_device)

# ------------------------------------------------------------
# Step 5 & 6 (No changes)
# ------------------------------------------------------------
model.config.use_cache = False
model.eval()
try:
    model.config.attn_implementation = "flash_attention_2"
except Exception:
    pass

PROMPT = "Explain the concept of dynamic offloading in large language models in one paragraph."
print("\n" + "="*50)
print(f"6. Running generation for prompt: \n'{PROMPT}'")

inputs = tokenizer(PROMPT, return_tensors="pt").to(gpu_device)


1. Loading config and creating model skeleton for Qwen/Qwen1.5-MoE-A2.7B...
2. Inferring device map (8GB VRAM limit enforced)...
3. Downloading checkpoint and dispatching weights...


Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?w/s]

  0%|          | 0/681 [00:00<?, ?w/s]

  0%|          | 0/677 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/681 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/677 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/681 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/677 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



   Model successfully loaded and dispatched across GPU and CPU RAM.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


4. Applying dynamic offloading patch to all MoE experts...

6. Running generation for prompt: 
'Explain the concept of dynamic offloading in large language models in one paragraph.'

--- GENERATION RESULT ---
Explain the concept of dynamic offloading in large language models in one paragraph. Dynamic offloading is a technique used in large language models to improve their performance by offloading some of the computational tasks onto specialized hardware, such as graphics processing units
Time taken in seconds: 82.98450660705566


In [2]:
import time
t2 = time.time()
start = time.time()
generated_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    top_p=0.95,
    temperature=0.9,
    repetition_penalty=1.05,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=False
)

print("\n--- GENERATION RESULT ---")
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
print("="*50)
t1 = time.time()
print("Time taken in seconds:",t1-t2)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



--- GENERATION RESULT ---
Explain the concept of dynamic offloading in large language models in one paragraph. Dynamic offloading in large language models refers to a technique where the computation is split between different devices, such as GPUs or TPUs, to reduce the amount of memory and computational resources required by the model. This technique allows for faster training and inference times, as well as improved accuracy, by allowing the model to train on
Time taken in seconds: 185.41313791275024


In [1]:
import transformers

In [10]:
import os
# 0) CUDA allocator safety: expandable segments reduce fragmentation on churny alloc/free.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
import time
from collections import OrderedDict

# -----------------------------
# Configuration
# -----------------------------
model_id   = "Qwen/Qwen1.5-MoE-A2.7B"
gpu_device = "cuda:0"
cpu_device = "cpu"
dtype      = torch.float16

# VRAM budget for trunk/router; experts will be cached separately via our LRU
max_memory = {
    0:   "8GiB",
    "cpu":"300GiB"
}

# Expert cache caps (choose one primary; the other is a guardrail)
GPU_CACHE_MAX_BYTES   = 2_000_000_000   # ~2.0 GB for expert weights total
GPU_CACHE_MAX_EXPERTS = 8               # or at most 8 experts resident on GPU

# -----------------------------
# 1) Load config & empty skeleton
# -----------------------------
print(f"1. Loading config and creating model skeleton for {model_id}...")
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# -----------------------------
# 2) Build device map (experts to CPU)
# -----------------------------
print("2. Inferring device map (8GB VRAM limit enforced)...")
no_split = getattr(empty_model, "_no_split_modules", ["QwenBlock", "Block"])

device_map = infer_auto_device_map(
    empty_model,
    max_memory=max_memory,
    no_split_module_classes=no_split,
)
for name in list(device_map.keys()):
    if "experts" in name:
        device_map[name] = cpu_device

# -----------------------------
# 3) Download checkpoints & dispatch
# -----------------------------
print("3. Downloading checkpoint and dispatching weights...")
local_ckpt = snapshot_download(
    model_id,
    allow_patterns=["*.safetensors", "*.bin", "config.json", "*.json"]
)
model = load_checkpoint_and_dispatch(
    empty_model,
    checkpoint=local_ckpt,
    device_map=device_map,
    no_split_module_classes=no_split,
    dtype=dtype,
    offload_folder=None,
)
print("   Model successfully loaded and dispatched across GPU and CPU RAM.")

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# ======================================================================
#                   EXPERT GPU LRU CACHE (CORE UPGRADE)
# ======================================================================

def tensor_nbytes(t: torch.Tensor) -> int:
    # Handles fp16/bf16 int8 …; assumes dense tensors
    return t.numel() * t.element_size()

def module_param_bytes(mod: nn.Module) -> int:
    s = 0
    for p in mod.parameters(recurse=True):
        s += tensor_nbytes(p.data)
    for b in mod.buffers(recurse=True):
        s += tensor_nbytes(b.data)
    return s

def pin_state_dict(cpu_module: nn.Module):
    """
    Create a pinned-memory state_dict snapshot for faster H→D copies.
    (One-time cost per expert.)
    """
    pinned = {}
    with torch.no_grad():
        for k, v in cpu_module.state_dict().items():
            # ensure CPU & contiguous for consistent pinning
            t = v.detach().contiguous().to("cpu", copy=True)
            # pin if pageable (CUDA only supports pinning CPU tensors)
            try:
                t = t.pin_memory()
            except RuntimeError:
                # Some dtypes (e.g., bf16 on older builds) may not support pinning; fall back
                pass
            pinned[k] = t
    return pinned

class ExpertGPUCache:
    """
    LRU cache for CUDA-resident expert modules.
    - Capacity tracked by both bytes and count.
    - Uses pinned CPU state_dict for fast, async, non_blocking loads.
    - Optional prefetch using a dedicated CUDA stream.
    """
    def __init__(self, device:str, dtype:torch.dtype,
                 max_bytes:int = GPU_CACHE_MAX_BYTES,
                 max_count:int = GPU_CACHE_MAX_EXPERTS):
        self.device = torch.device(device)
        self.dtype  = dtype
        self.max_bytes = max_bytes
        self.max_count = max_count

        self._cache: OrderedDict[int, nn.Module] = OrderedDict()  # expert_id -> cuda module
        self._sizes: dict[int, int] = {}                          # bytes by expert
        self._pinned_sd: dict[int, dict] = {}                     # expert_id -> pinned state_dict
        self._bytes_total = 0

        # Single prefetch stream for async H→D copies
        self.prefetch_stream = torch.cuda.Stream(device=self.device)
        self.main_stream = torch.cuda.current_stream(device=self.device)

    def _ensure_capacity(self, need_bytes:int):
        # Evict LRU until enough room (by bytes & count)
        while (self._bytes_total + need_bytes > self.max_bytes) or \
              (len(self._cache) + 1 > self.max_count):
            # Pop LRU (first item)
            evict_id, evict_mod = self._cache.popitem(last=False)
            ev_sz = self._sizes.pop(evict_id, 0)
            self._bytes_total -= ev_sz
            # Free module params (no empty_cache; allocator will reuse)
            del evict_mod
        # Synchronization not strictly required here; allocator reuse is fine.

    def _build_cuda_module(self, cpu_expert: nn.Module) -> nn.Module:
        # Instantiate a fresh CUDA expert (structure only)
        # Prefer constructor with config; fallback to deepcopy if needed.
        cls = cpu_expert.__class__
        if hasattr(cpu_expert, "config"):
            m = cls(config=cpu_expert.config).to(self.device, dtype=self.dtype)
        else:
            # Fallback (rare): deepcopy structure, then move to CUDA & reset params
            import copy as _copy
            m = _copy.deepcopy(cpu_expert).to(self.device, dtype=self.dtype)
            # state_dict will be overwritten anyway
        return m

    def _load_weights_async(self, cuda_mod: nn.Module, expert_id: int):
        """
        Load from pinned CPU state_dict into CUDA module using non_blocking
        on the prefetch stream; then make main stream wait for completion.
        """
        if expert_id not in self._pinned_sd:
            # One-time pinned snapshot
            self._pinned_sd[expert_id] = pin_state_dict(cpu_expert_registry[expert_id])

        sd = self._pinned_sd[expert_id]
        # non_blocking H→D: do it key-by-key to avoid a large blocking call
        with torch.cuda.stream(self.prefetch_stream):
            for k, v_cpu in sd.items():
                # torch.nn.Module.load_state_dict does not expose non_blocking per key,
                # so we copy tensors into the parameter/buffer directly.
                # Map k to actual tensor ref in cuda_mod:
                ref = cuda_mod
                comps = k.split(".")
                for c in comps[:-1]:
                    ref = getattr(ref, c)
                leaf_name = comps[-1]
                # Select parameter/buffer tensor
                if hasattr(ref, leaf_name):
                    t = getattr(ref, leaf_name)
                    # Copy data into place
                    t.data.copy_(v_cpu, non_blocking=True)
                else:
                    # In unusual cases (e.g., missing buffer), fall back to set_
                    pass

        # Make main stream wait for prefetch stream completion to ensure data is ready
        self.main_stream.wait_stream(self.prefetch_stream)
        torch.cuda.current_stream(self.device).synchronize()  # keep it simple & safe

    def get(self, expert_id: int, cpu_expert: nn.Module) -> nn.Module:
        """
        Return a CUDA-resident expert ready for compute.
        If not cached, instantiate & async-load; maintain LRU.
        """
        # Fast path: cache hit → move to MRU
        if expert_id in self._cache:
            mod = self._cache.pop(expert_id)
            self._cache[expert_id] = mod  # MRU
            return mod

        # Miss → build, ensure capacity, async-load, account size, insert as MRU
        cuda_mod = self._build_cuda_module(cpu_expert)
        need_bytes = module_param_bytes(cuda_mod)

        self._ensure_capacity(need_bytes)
        self._load_weights_async(cuda_mod, expert_id)

        self._cache[expert_id] = cuda_mod  # MRU
        self._sizes[expert_id] = need_bytes
        self._bytes_total += need_bytes
        return cuda_mod

    # Optional: allow ahead-of-time prefetch (if you can obtain router top-k indices)
    def prefetch_experts(self, ids: list[int]):
        with torch.no_grad():
            for eid in ids:
                if eid in self._cache:
                    # Touch to mark MRU
                    mod = self._cache.pop(eid)
                    self._cache[eid] = mod
                else:
                    if eid not in cpu_expert_registry:
                        continue
                    cpu_mod = cpu_expert_registry[eid]
                    cuda_mod = self._build_cuda_module(cpu_mod)
                    need_bytes = module_param_bytes(cuda_mod)
                    self._ensure_capacity(need_bytes)
                    self._load_weights_async(cuda_mod, eid)
                    self._cache[eid] = cuda_mod
                    self._sizes[eid] = need_bytes
                    self._bytes_total += need_bytes

# ------------------------------------------------------------------------------
# Build a registry mapping expert_id -> CPU expert module for quick access
# Qwen MoE usually has ModuleList named like "...experts", each item is one expert.
# We'll assign a global counter id in discovery order.
# ------------------------------------------------------------------------------
cpu_expert_registry: dict[int, nn.Module] = {}
expert_name_map: dict[int, str] = {}  # id -> dotted path (debug)
eid = 0
for name, module in model.named_modules():
    if isinstance(module, nn.ModuleList) and "experts" in name:
        for idx, expert in enumerate(module):
            # only register experts that are on CPU
            try:
                dev = next(expert.parameters()).device.type
            except StopIteration:
                dev = cpu_device
            if dev == cpu_device:
                cpu_expert_registry[eid] = expert
                expert_name_map[eid] = f"{name}[{idx}]"
                eid += 1

print(f"Discovered {len(cpu_expert_registry)} CPU-resident experts for dynamic caching.")

# Create one global cache manager
expert_cache = ExpertGPUCache(device=gpu_device, dtype=dtype,
                              max_bytes=GPU_CACHE_MAX_BYTES, max_count=GPU_CACHE_MAX_EXPERTS)

# ======================================================================
#             PATCH EXPERT FORWARD TO USE THE LRU CACHE (NO EMPTY_CACHE)
# ======================================================================
def make_cached_forward(expert_id: int, cpu_expert: nn.Module):
    """
    Wrap CPU expert.forward so that it:
      - Moves inputs to GPU (non_blocking where possible)
      - Gets/creates a cached CUDA expert from LRU
      - Runs compute on GPU
      - Returns result on original device/dtype (match upstream expectations)
    """
    orig_fwd = cpu_expert.forward  # not used directly but kept for safety

    def wrapped_forward(*args, **kwargs):
        # Detect target device for inputs
        cuda_dev = torch.device(gpu_device)

        # Move tensor args/kwargs to GPU non_blocking, keep non-tensors as-is
        def to_cuda(x):
            if torch.is_tensor(x):
                # If x is already on GPU, return as-is; else move non_blocking if possible.
                return x.to(cuda_dev, non_blocking=True)
            return x

        args_cuda = tuple(to_cuda(a) for a in args)
        kwargs_cuda = {k: to_cuda(v) for k, v in kwargs.items()}

        # Get CUDA expert from LRU cache (instantiates+loads asynchronously on miss)
        cuda_expert = expert_cache.get(expert_id, cpu_expert)

        # Compute (no grad during inference). If you need grad, remove this context.
        with torch.no_grad():
            out = cuda_expert(*args_cuda, **kwargs_cuda)

        return out

    return wrapped_forward

# Apply wrappers to each CPU expert
patched = 0
for eid, cpu_expert in cpu_expert_registry.items():
    cpu_expert.forward = make_cached_forward(eid, cpu_expert)
    patched += 1
print(f"Patched {patched} experts with persistent CUDA-cache wrapper.")

# ======================================================================
#                     Model Gen Settings (unchanged)
# ======================================================================
model.config.use_cache = False  # keep disabled; experts are transiently-GPU
model.eval()
try:
    model.config.attn_implementation = "flash_attention_2"
except Exception:
    pass

# ======================================================================
#                              Demo
# ======================================================================
PROMPT = "Explain the concept of dynamic offloading in large language models in one paragraph."
print("\n" + "="*72)
print(f"Running generation for prompt:\n{PROMPT}\n")

inputs = tokenizer(PROMPT, return_tensors="pt").to(gpu_device)

# (Optional) If you can peek router top-k ids, call expert_cache.prefetch_experts(ids) here.

start = time.time()
generated_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    top_p=0.95,
    temperature=0.9,
    repetition_penalty=1.05,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=False
)
end = time.time()

print("\n--- GENERATION RESULT ---")
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
print(f"\nTime: {end-start:.2f}s")
print("="*72)


1. Loading config and creating model skeleton for Qwen/Qwen1.5-MoE-A2.7B...
2. Inferring device map (8GB VRAM limit enforced)...
3. Downloading checkpoint and dispatching weights...


Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?w/s]

  0%|          | 0/681 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/681 [00:00<?, ?w/s]

  0%|          | 0/677 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/677 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/7 [00:00<?, ?w/s]

Some parameters are on the meta device because they were offloaded to the cpu.


   Model successfully loaded and dispatched across GPU and CPU RAM.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Discovered 0 CPU-resident experts for dynamic caching.
Patched 0 experts with persistent CUDA-cache wrapper.

Running generation for prompt:
Explain the concept of dynamic offloading in large language models in one paragraph.


--- GENERATION RESULT ---
Explain the concept of dynamic offloading in large language models in one paragraph. Additionally, provide a code snippet that demonstrates the implementation of dynamic offloading in a Python library for large language models.

Dynamic offloading is a technique used to improve the performance of large language models by dynamically offloading portions of the computation to specialized hardware, such as GPUs or TPUs. It allows the model to benefit from

Time: 188.66s


In [1]:
import os, gc, time
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download

MODEL_ID   = "Qwen/Qwen1.5-MoE-A2.7B"
DTYPE      = torch.float16
GPU        = "cuda:0"
CPU        = "cpu"

# ---------------- Benchmark knobs ----------------
TARGET_CTX_TOKENS   = 1024     # input context length
MAX_NEW_TOKENS      = 256      # number of tokens to decode
SEARCH_START_BS     = 1        # batch size lower bound
SEARCH_MAX_BS_HINT  = 8        # initial upper bound for batch search

MAX_MEMORY = {0: "8GiB", "cpu": "300GiB"}

def vram_mb():
    return (torch.cuda.memory_allocated() / 1024**2,
            torch.cuda.memory_reserved()  / 1024**2)

def make_inputs(tokenizer, batch_size:int, device:str, target_ctx:int):
    seed_sentence = "In large language models, the KV cache must remain on the GPU for low-latency decoding. "
    toks = tokenizer(seed_sentence, return_tensors="pt").input_ids[0]
    reps = max(1, (target_ctx // toks.numel()) + 1)
    long_ids = toks.repeat(reps)[:target_ctx].unsqueeze(0)
    input_ids = long_ids.repeat(batch_size, 1)
    attn = torch.ones_like(input_ids)
    return {"input_ids": input_ids.to(device), "attention_mask": attn.to(device)}

def build_model(case:str):
    print(f"\n==> Building model for {case}")
    if case == "A_auto":
        # Use accelerate's auto map
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            device_map="auto",
            torch_dtype=DTYPE
        )
    else:  # Case B, experts on CPU
        config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
        with init_empty_weights():
            empty = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

        no_split = getattr(empty, "_no_split_modules", ["QwenBlock", "Block"])
        device_map = infer_auto_device_map(
            empty,
            max_memory=MAX_MEMORY,
            no_split_module_classes=no_split
        )
        for name in list(device_map.keys()):
            if "experts" in name:
                device_map[name] = CPU

        local_ckpt = snapshot_download(
            MODEL_ID, allow_patterns=["*.safetensors","*.bin","config.json","*.json"]
        )
        model = load_checkpoint_and_dispatch(
            empty,
            checkpoint=local_ckpt,
            device_map=device_map,
            no_split_module_classes=no_split,
            dtype=DTYPE,
            offload_folder=None,
        )

    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model.config.use_cache = True
    try: model.config.attn_implementation = "flash_attention_2"
    except Exception: pass
    model.eval()
    return model, tok

def try_one(model, tokenizer, batch_size:int, device:str):
    gc.collect()
    torch.cuda.empty_cache()
    inputs = make_inputs(tokenizer, batch_size, device, TARGET_CTX_TOKENS)
    torch.cuda.synchronize()
    start = time.time()
    with torch.no_grad():
        _ = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )
    torch.cuda.synchronize()
    end = time.time()
    return end - start

def find_max_batch(model, tokenizer, device:str, start_bs:int, max_hint:int):
    def feasible(bs:int):
        try:
            _ = try_one(model, tokenizer, bs, device)
            return True
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                return False
            raise

    lo, hi = start_bs, max_hint
    best = lo
    while feasible(hi):
        best = hi
        hi *= 2
        if hi > 128: break

    L, R = best+1, hi
    while L <= R:
        mid = (L + R)//2
        if feasible(mid):
            best = mid
            L = mid+1
        else:
            R = mid-1
    return best

def benchmark_case(case_name, case_flag):
    model, tok = build_model(case_flag)
    alloc, reserv = vram_mb()
    print(f"[{case_name}] VRAM after load: alloc={alloc:.1f} MB, reserved={reserv:.1f} MB")

    max_bs = find_max_batch(model, tok, GPU, SEARCH_START_BS, SEARCH_MAX_BS_HINT)
    print(f"[{case_name}] Max feasible batch size = {max_bs}")

    secs = try_one(model, tok, max_bs, GPU)
    total_tokens = max_bs * MAX_NEW_TOKENS
    tps = total_tokens / secs
    print(f"[{case_name}] {total_tokens} tokens in {secs:.2f}s → {tps:.2f} tok/s")

    alloc2, reserv2 = vram_mb()
    print(f"[{case_name}] VRAM after run: alloc={alloc2:.1f} MB, reserved={reserv2:.1f} MB")

    del model; del tok; gc.collect(); torch.cuda.empty_cache()
    return {"max_batch": max_bs, "tps": tps, "vram": alloc}

if __name__ == "__main__":
    torch.cuda.set_device(GPU)

    resA = benchmark_case("CASE A (auto map)", "A_auto")
    resB = benchmark_case("CASE B (experts on CPU)", "B_cpu")

    print("\n================ SUMMARY ================")
    print(f"Case A (auto):  max_batch={resA['max_batch']}, TPS={resA['tps']:.2f}, VRAM@load={resA['vram']:.1f} MB")
    print(f"Case B (CPU):   max_batch={resB['max_batch']}, TPS={resB['tps']:.2f}, VRAM@load={resB['vram']:.1f} MB")
    if resA['max_batch'] > 0:
        print(f"Batch capacity gain = {resB['max_batch']/resA['max_batch']:.2f}×")



==> Building model for A_auto


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


[CASE A (auto map)] VRAM after load: alloc=20182.2 MB, reserved=20186.0 MB
[CASE A (auto map)] Max feasible batch size = 12
[CASE A (auto map)] 3072 tokens in 84.44s → 36.38 tok/s
[CASE A (auto map)] VRAM after run: alloc=20190.3 MB, reserved=23866.0 MB

==> Building model for B_cpu


Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?w/s]

  0%|          | 0/681 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/677 [00:00<?, ?w/s]

  0%|          | 0/681 [00:00<?, ?w/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Some parameters are on the meta device because they were offloaded to the cpu.


[CASE B (experts on CPU)] VRAM after load: alloc=6042.9 MB, reserved=6082.0 MB
[CASE B (experts on CPU)] Max feasible batch size = 70


OutOfMemoryError: CUDA out of memory. Tried to allocate 770.00 MiB. GPU 0 has a total capacity of 23.66 GiB of which 598.94 MiB is free. Process 762405 has 23.07 GiB memory in use. Of the allocated memory 22.65 GiB is allocated by PyTorch, and 108.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:

# Kill any leftover processes holding VRAM (replace <pid>):
!sudo kill -9 <pid>

# Optional: reset/persistence off (if allowed; may require root and no active procs)
!sudo nvidia-smi -pm 0
!sudo nvidia-smi --gpu-reset -i 0   # CAREFUL: only if no important jobs running

# If you run from Jupyter/Colab/notebook, restart the kernel/process too.


/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `sudo kill -9 <pid>'
Disabled persistence mode for GPU 00000000:43:00.0.
All done.
GPU Reset couldn't run because GPU 00000000:43:00.0 is the primary GPU.


In [3]:
# Example: use GPU 0 only (change to the one with most free memory)
!export CUDA_VISIBLE_DEVICES=0


In [13]:
import os
import gc
import time
import subprocess
import torch
!pip install lmcache

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig 
# Environment variables for LMCache:
os.environ["LMCACHE_LOCAL_CPU"] = "True"
# ...
# vLLM code explicitly calling the LMCache connector:
kv_cfg = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")

# --- CONFIGURATION CONSTANTS ---
MODEL = "Qwen/Qwen1.5-MoE-A2.7B" 
OFFLOAD_GB = 16  # Safer size for KV cache offload pool on CPU/CXL RAM
SWAP_GB = 4      # Reduced swap space for PagedAttention block metadata
MARGIN_GB = 0.50 # Safety margin for GPU VRAM
SAFE_UTIL_LIMIT = 0.70 # Max fraction of free VRAM to use for model + hot cache

# --- UTILITY FUNCTIONS ---

def query_all_gpus():
    """Queries NVIDIA SMI for GPU index, total memory, and free memory in GiB."""
    try:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=index,memory.total,memory.free",
            "--format=csv,noheader,nounits"
        ]).decode().strip().splitlines()
    except subprocess.CalledProcessError:
        print("[WARNING] nvidia-smi failed. Assuming a single GPU with 16GB total, 8GB free.")
        # Fallback for environments without nvidia-smi
        return [(0, 16.0, 8.0)] 
    
    gpus = []
    for line in out:
        idx_str, tot_mb, free_mb = [s.strip() for s in line.split(",")]
        # Convert MB to GB
        gpus.append((int(idx_str), int(tot_mb) / 1024.0, int(free_mb) / 1024.0)) 
    return gpus

def run_batch(llm: LLM, prompts: list[str], sp: SamplingParams):
    """Generates text, calculates total tokens, time, and peak VRAM usage."""
    # Reset stats to measure peak VRAM usage only during this batch
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    outs = llm.generate(prompts, sp)
    t1 = time.time()
    
    # Calculate total processed tokens (Prefill + Decode)
    total_tokens = 0
    for out in outs:
        # Prompt length (prefill tokens) + generated length (decode tokens)
        total_tokens += len(out.prompt_token_ids)
        total_tokens += sum(len(o.token_ids) for o in out.outputs)

    peak = torch.cuda.max_memory_allocated() / (1024**3)
    return (t1 - t0), total_tokens, peak, outs

# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # 1) Pick emptiest GPU and set environment
    gpus = query_all_gpus()
    best = max(gpus, key=lambda x: x[2])
    best_idx, total_gb, free_gb = best
    os.environ["CUDA_VISIBLE_DEVICES"] = str(best_idx)
    print(f"[GPU] Using GPU {best_idx} (free {free_gb:.2f} / total {total_gb:.2f} GiB)")

    if free_gb <= MARGIN_GB:
        raise RuntimeError("Not enough free VRAM for safety margin. Please free up GPU resources.")

    # Calculate safe VRAM utilization limit
    calculated_util = (free_gb - MARGIN_GB) / total_gb 
    safe_util = max(0.05, min(calculated_util * 0.90, SAFE_UTIL_LIMIT))
    print(f"[GPU] gpu_memory_utilization set to {safe_util:.3f} (conservative limit)")

    # 2) Configure LMCache and Pytorch for Tiered Offloading
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["VLLM_WORKER_MULTIPROCESSING"] = "0"
    
    # LMCache environment variables for CPU/CXL offloading
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = str(float(OFFLOAD_GB))
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    print(f"[CPU/CXL] Configuring LMCache for {OFFLOAD_GB} GB offload size...")

    # 3) Build vLLM Engine
    print(f"Building engine for {MODEL} (CPU/CXL offload + Prefix Caching)…")
    
    # KVTransferConfig is needed to explicitly enable the LMCache backend connector
    kv_cfg = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
    
    torch.cuda.empty_cache(); gc.collect()
    
    try:
        llm = LLM(
            model=MODEL,
            dtype="half",
            max_model_len=8192,
            gpu_memory_utilization=safe_util,
            cpu_offload_gb=OFFLOAD_GB,       # 🌟 KV Cache Offloading Tier
            enable_prefix_caching=True,     # 🌟 Prefix Caching Optimization
            kv_transfer_config=kv_cfg,
            enforce_eager=True,
            swap_space=SWAP_GB,             # PagedAttention block swapping
        )
    except RuntimeError as e:
        print(f"\n[ERROR] Failed to initialize vLLM Engine. Check memory configuration.")
        print(f"Error: {e}")
        # Hint to the user about common issues:
        print(f"HINT: Try reducing OFFLOAD_GB ({OFFLOAD_GB}) and SWAP_GB ({SWAP_GB}) to lower CPU/CXL memory pressure.")
        exit(1)

    print("vLLM Engine initialized successfully.")
    
    # 4) Run Experiments to Demonstrate Benefits
    sp = SamplingParams(temperature=0.0, max_tokens=96)
    
    # Create a long prefix (e.g., 2000 tokens) to stress the cache
    base_prefix = "System: You are a precise and helpful assistant. " + ("Data chunk " * 400) 

    # --- (A) RECOMPUTE: Cache Miss Simulation ---
    # The changing suffix/salt prevents a prefix cache hit, forcing full prefill computation.
    recompute_prompts = [
        f"{base_prefix}\nUser: Q{idx} summarize the data. [session={idx%3}]"
        for idx in range(6)
    ]
    
    print("\n--- (A) RECOMPUTE (Cache Miss: Full Prefill) ---")
    dt_r, toks_r, vram_r, _ = run_batch(llm, recompute_prompts, sp)
    print(f"Time: {dt_r:.2f}s | Tokens: {toks_r} | Peak VRAM: {vram_r:.2f} GB")

    # --- (B) NO-RECOMPUTE: Cache Hit Simulation ---
    same_prompts = [
        f"{base_prefix}\nUser: Q{idx} summarize the data." # Exact same prefix
        for idx in range(6)
    ]
    
    # Run 1 (Warm-up): Computes prefix, stores/offloads KV cache.
    print("\n--- (B) WARM-UP (Cache Creation) ---")
    dt_w, toks_w, vram_w, _ = run_batch(llm, same_prompts, sp)
    print(f"Time: {dt_w:.2f}s | Tokens: {toks_w} | Peak VRAM: {vram_w:.2f} GB")
    
    # Run 2 (Reuse): Reuses the cached KV blocks from the CPU/CXL tier.
    print("\n--- (B) REUSE (Prefix Cache Hit & KV Reload) ---")
    dt_n, toks_n, vram_n, _ = run_batch(llm, same_prompts, sp) 
    print(f"Time: {dt_n:.2f}s | Tokens: {toks_n} | Peak VRAM: {vram_n:.2f} GB")

    # --- Results Summary ---
    print("\n=== Throughput (tokens/sec) Comparison ===")
    tps_r = toks_r/max(dt_r,1e-6)
    tps_w = toks_w/max(dt_w,1e-6)
    tps_n = toks_n/max(dt_n,1e-6)
    
    print(f"1. Recompute TPS (Baseline): {tps_r:.2f} t/s")
    print(f"2. Warm-up TPS (First Run):  {tps_w:.2f} t/s")
    print(f"3. Reuse TPS (Cached/Offloaded): {tps_n:.2f} t/s")
    
    print(f"\nExpected Result: Reuse TPS ({tps_n:.2f}) should be significantly higher than Recompute TPS ({tps_r:.2f}), confirming the benefit of prefix caching and tiered memory reuse.")

Collecting lmcache
  Downloading lmcache-0.3.6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting cufile-python
  Downloading cufile_python-0.1.1-py3-none-any.whl (4.9 kB)
Collecting nvtx
  Downloading nvtx-0.2.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.2/474.2 KB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setuptools_scm>=8
  Downloading setuptools_scm-9.2.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 KB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Collecting awscrt
  Downloading awscrt-0.28.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m60.5 MB/s[

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:01:19,234] LMCache INFO:[0m LMCache Configuration: {'chunk_size': 256, 'local_cpu': True, 'max_local_cpu_size': '5.0 GB', 'local_disk': None, 'max_local_disk_size': '0.0 GB', 'remote_url': None, 'remote_serde': 'naive', 'use_layerwise': False, 'save_decode_cache': False, 'pre_caching_hash_algorithm': 'builtin', 'enable_blending': False, 'blend_recompute_ratios': None, 'blend_thresholds': None, 'blend_check_layers': None, 'blend_min_tokens': 256, 'blend_special_str': ' # # ', 'enable_p2p': False, 'lookup_url': None, 'distributed_url': None, 'enable_controller': False, 'lmcache_instance_id': 'lmcache_default_instance', 'controller_url': None, 'lmcache_worker_port': None, 'lmcache_worker_heartbeat_delay_time': 10, 'lmcache_worker_heartbeat_time': None, 'enable_nixl': False, 'nixl_role': None, 'nixl_receiver_host': None, 'nixl_receiver_port': None, 'nixl_buffer_size': None, 'nixl_buffer_device': None, 'nixl_enable_gc': False,

[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:01:27,256] LMCache INFO:[0m Creating LMCacheEngine with config: {'chunk_size': 256, 'local_cpu': True, 'max_local_cpu_size': 16.0, 'local_disk': None, 'max_local_disk_size': 0.0, 'remote_url': None, 'remote_serde': 'naive', 'use_layerwise': False, 'save_decode_cache': False, 'pre_caching_hash_algorithm': 'builtin', 'enable_blending': False, 'blend_recompute_ratios': None, 'blend_thresholds': None, 'blend_check_layers': None, 'blend_min_tokens': 256, 'blend_special_str': ' # # ', 'enable_p2p': False, 'lookup_url': None, 'distributed_url': None, 'enable_controller': False, 'lmcache_instance_id': 'lmcache_default_instance', 'controller_url': None, 'lmcache_worker_port': None, 'lmcache_worker_heartbeat_delay_time': 10, 'lmcache_worker_heartbeat_time': None, 'enable_nixl': False, 'nixl_role': None, 'nixl_receiver_host': None, 'nixl_receiver_port': None, 'nixl_buffer_size': None, 'nixl_buffer_device': None, 'nixl_enable_gc': Fa

[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:01:32 [gpu_model_runner.py:2338] Starting to load model Qwen/Qwen1.5-MoE-A2.7B...
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:01:32 [gpu_model_runner.py:2370] Loading model from scratch...
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:01:32 [cuda.py:362] Using Flash Attention backend on V1 engine.
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:01:50 [weight_utils.py:348] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:01:56 [default_loader.py:268] Loading weights took 5.03 seconds
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:01:56 [gpu_model_runner.py:2392] Model loading took 10.0502 GiB and 23.336833 seconds
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:02:18 [gpu_worker.py:298] Available KV cache memory: 4.98 GiB
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:02:19 [kv_cache_utils.py:864] GPU KV cache size: 27,200 tokens
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:02:19 [kv_cache_utils.py:868] Maximum concurrency for 8,192 tokens per request: 3.32x
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:02:19 [utils.py:114] Connectors do not specify a kv cache layout, defaulting to NHD.
[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:02:19 [gpu_worker.py:391] Free memory on device (23.36/23.66 GiB) on startup. Desired GPU memory utilization is (0.7, 16.57 GiB). Actual usage is 10.05 GiB for weight, 1

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:20,714] LMCache INFO:[0m LMCache Configuration: {'chunk_size': 256, 'local_cpu': True, 'max_local_cpu_size': '5.0 GB', 'local_disk': None, 'max_local_disk_size': '0.0 GB', 'remote_url': None, 'remote_serde': 'naive', 'use_layerwise': False, 'save_decode_cache': False, 'pre_caching_hash_algorithm': 'builtin', 'enable_blending': False, 'blend_recompute_ratios': None, 'blend_thresholds': None, 'blend_check_layers': None, 'blend_min_tokens': 256, 'blend_special_str': ' # # ', 'enable_p2p': False, 'lookup_url': None, 'distributed_url': None, 'enable_controller': False, 'lmcache_instance_id': 'lmcache_default_instance', 'controller_url': None, 'lmcache_worker_port': None, 'lmcache_worker_heartbeat_delay_time': 10, 'lmcache_worker_heartbeat_time': None, 'enable_nixl': False, 'nixl_role': None, 'nixl_receiver_host': None, 'nixl_receiver_port': None, 'nixl_buffer_size': None, 'nixl_buffer_device': None, 'nixl_enable_gc': False,

[1;36m(EngineCore_DP0 pid=187370)[0;0m INFO 09-28 16:02:20 [__init__.py:3400] Cudagraph is disabled under eager mode
INFO 09-28 16:02:21 [llm.py:295] Supported_tasks: ['generate']
INFO 09-28 16:02:21 [__init__.py:36] No IOProcessor plugins requested by the model
vLLM Engine initialized successfully.

--- (A) RECOMPUTE (Cache Miss: Full Prefill) ---


Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:21,033] LMCache INFO:[0m Reqid: 0, Total tokens 824, LMCache hit tokens: 0, need to load: 0 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m




[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:21,038] LMCache INFO:[0m Post-initializing LMCacheEngine [3m(cache_engine.py:191:lmcache.v1.cache_engine)[0m


Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:24,449] LMCache INFO:[0m Storing KV cache for 824 out of 824 tokens (skip_leading_tokens=0) for request 0 [3m(vllm_v1_adapter.py:988:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:24,469] LMCache INFO:[0m Stored 824 out of total 824 tokens. size: 0.1509 gb, cost 13.4427 ms, throughput: 11.2238 GB/s; offload_time: 13.4179 ms, put_time: 0.0248 ms [3m(cache_engine.py:309:lmcache.v1.cache_engine)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:24,479] LMCache INFO:[0m Reqid: 1, Total tokens 824, LMCache hit tokens: 768, need to load: -32 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:02:24,480] LMCache INFO:[0m Reqid: 2, Total tokens 824, LMCache hit tokens: 768, need to load: -32 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v

Time: 41.66s | Tokens: 5520 | Peak VRAM: 0.00 GB

--- (B) WARM-UP (Cache Creation) ---


Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:02,691] LMCache INFO:[0m Reqid: 6, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m


Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:03,194] LMCache INFO:[0m Reqid: 7, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:03,195] LMCache INFO:[0m Reqid: 8, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:03,197] LMCache INFO:[0m Reqid: 9, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:03,198] LMCache INFO:[0m Reqid: 10, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [

Time: 29.61s | Tokens: 5490 | Peak VRAM: 0.00 GB

--- (B) REUSE (Prefix Cache Hit & KV Reload) ---


Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:32,298] LMCache INFO:[0m Reqid: 12, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m


Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:32,648] LMCache INFO:[0m Reqid: 13, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:32,649] LMCache INFO:[0m Reqid: 14, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:32,651] LMCache INFO:[0m Reqid: 15, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m [32;20m[2025-09-28 16:03:32,652] LMCache INFO:[0m Reqid: 16, Total tokens 819, LMCache hit tokens: 768, need to load: -48 [3m(vllm_v1_adapter.py:1104:lmcache.integration.vllm.vllm_v1_adapter)[0m
[1;36m(EngineCore_DP0 pid=187370)[0;0m

Time: 29.15s | Tokens: 5490 | Peak VRAM: 0.00 GB

=== Throughput (tokens/sec) Comparison ===
1. Recompute TPS (Baseline): 132.51 t/s
2. Warm-up TPS (First Run):  185.41 t/s
3. Reuse TPS (Cached/Offloaded): 188.36 t/s

Expected Result: Reuse TPS (188.36) should be significantly higher than Recompute TPS (132.51), confirming the benefit of prefix caching and tiered memory reuse.


In [2]:
import os
import gc
import time
import subprocess
import torch
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# --- CONFIGURATION CONSTANTS ---
MODEL = "Qwen/Qwen1.5-MoE-A2.7B" 
OFFLOAD_GB = 16  # Safer size for KV cache offload pool on CPU/CXL RAM
SWAP_GB = 4      # Reduced swap space for PagedAttention block metadata
MARGIN_GB = 0.50 # Safety margin for GPU VRAM
SAFE_UTIL_LIMIT = 0.70 # Max fraction of free VRAM to use for model + hot cache

# --- UTILITY FUNCTIONS ---

def query_all_gpus():
    """Queries NVIDIA SMI for GPU index, total memory, and free memory in GiB."""
    try:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=index,memory.total,memory.free",
            "--format=csv,noheader,nounits"
        ]).decode().strip().splitlines()
    except subprocess.CalledProcessError:
        print("[WARNING] nvidia-smi failed. Assuming a single GPU with 16GB total, 8GB free.")
        return [(0, 16.0, 8.0)]  # Fallback
    
    gpus = []
    for line in out:
        idx_str, tot_mb, free_mb = [s.strip() for s in line.split(",")]
        gpus.append((int(idx_str), int(tot_mb) / 1024.0, int(free_mb) / 1024.0)) 
    return gpus

def run_batch(llm: LLM, prompts: list[str], sp: SamplingParams):
    """Generates text and calculates total tokens and time."""
    t0 = time.time()
    outs = llm.generate(prompts, sp)
    t1 = time.time()
    
    total_tokens = 0
    for out in outs:
        # Prompt length (prefill tokens) + generated length (decode tokens)
        total_tokens += len(out.prompt_token_ids)
        total_tokens += sum(len(o.token_ids) for o in out.outputs)

    # Note: Peak VRAM measurement is removed here, as the meaningful metric is
    # the static allocated VRAM measured after engine creation (see main block).
    return (t1 - t0), total_tokens, outs

def get_current_vram_usage():
    """Fix: Measure the total allocated VRAM block after engine init."""
    return torch.cuda.memory_allocated() / (1024**3)

def build_llm(model_name, util_limit, offload_gb, swap_gb, enable_prefix_caching):
    """Factory function to build vLLM engine with specific caching config."""
    kv_cfg = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
    
    try:
        llm = LLM(
            model=model_name,
            dtype="half",
            max_model_len=8192,
            gpu_memory_utilization=util_limit,
            cpu_offload_gb=offload_gb,
            enable_prefix_caching=enable_prefix_caching,
            kv_transfer_config=kv_cfg if enable_prefix_caching else None, # Only pass LMCache config if caching is on
            enforce_eager=True,
            swap_space=swap_gb,
        )
        return llm
    except RuntimeError as e:
        print(f"\n[ERROR] Failed to initialize vLLM Engine. Config: prefix_caching={enable_prefix_caching}")
        print(f"Root Error: {e}")
        # Return None instead of calling exit(1) to allow interactive environment to continue
        return None


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- 1) Setup GPU Environment ---
    gpus = query_all_gpus()
    best = max(gpus, key=lambda x: x[2])
    best_idx, total_gb, free_gb = best
    os.environ["CUDA_VISIBLE_DEVICES"] = str(best_idx)

    calculated_util = (free_gb - MARGIN_GB) / total_gb 
    safe_util = max(0.05, min(calculated_util * 0.90, SAFE_UTIL_LIMIT))

    # --- 2) Configure LMCache Environment Variables ---
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["VLLM_WORKER_MULTIPROCESSING"] = "0"
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = str(float(OFFLOAD_GB))
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    
    print(f"[GPU] Using GPU {best_idx} | Total VRAM: {total_gb:.2f} GiB")
    print(f"[CPU/CXL] Offload configured for {OFFLOAD_GB} GB.")

    # --- 3) Build Engines ---
    sp = SamplingParams(temperature=0.0, max_tokens=96)
    base_prefix = "System: You are a precise and helpful assistant. " + ("Data chunk " * 400)
    same_prompts = [f"{base_prefix}\nUser: Q{idx} summarize the data." for idx in range(6)]
    recompute_prompts = [f"{base_prefix}\nUser: Q{idx} summarize the data. [session={idx%3}]" for idx in range(6)]

    # Engine A: Full optimization (Prefix Caching + CXL Offload)
    llm_optimized = build_llm(MODEL, safe_util, OFFLOAD_GB, SWAP_GB, enable_prefix_caching=True)
    
    # Engine B: Baseline (Standard PagedAttention, NO Prefix Caching/Offload)
    llm_base = build_llm(MODEL, safe_util, 0, SWAP_GB, enable_prefix_caching=False)

    # --- 4) Conditional Execution ---
    if llm_optimized is None or llm_base is None:
        print("\n[FATAL] Skipping performance test because one or both engines failed to initialize.")
        # If the engine is None, the VRAM usage will be 0.00 GB, which is expected.
    else:
        # Only proceed if both engines initialized successfully
        
        vram_opt = get_current_vram_usage()
        print(f"\n[VRAM Optimized Engine] Total allocated VRAM (Weights + Cache Pool): {vram_opt:.2f} GB")
        
        vram_base = get_current_vram_usage()
        print(f"[VRAM Baseline Engine] Total allocated VRAM (Weights + Cache Pool): {vram_base:.2f} GB")
        
        # BASELINE 1: NO Prefix Caching, Full Computation
        print("\n--- BASELINE (C) NO PREFIX CACHING ---")
        dt_b, toks_b, _ = run_batch(llm_base, same_prompts, sp)
        print(f"Time: {dt_b:.2f}s | Tokens: {toks_b}")


        # BASELINE 2: Optimized Engine, Forced Cache Miss (Recompute)
        print("\n--- BASELINE (A) FULL RECOMPUTATION (Optimized Engine) ---")
        dt_r, toks_r, _ = run_batch(llm_optimized, recompute_prompts, sp)
        print(f"Time: {dt_r:.2f}s | Tokens: {toks_r}")


        # OPTIMIZED RUN: Prefix Cache Hit (Reuse from CXL)
        print("\n--- OPTIMIZED (B) WARM-UP (Cache Creation & Offload) ---")
        dt_w, toks_w, _ = run_batch(llm_optimized, same_prompts, sp)
        print(f"Time: {dt_w:.2f}s | Tokens: {toks_w}")

        print("\n--- OPTIMIZED (B) REUSE (Cache Hit & CXL Reload) ---")
        dt_n, toks_n, _ = run_batch(llm_optimized, same_prompts, sp) 
        print(f"Time: {dt_n:.2f}s | Tokens: {toks_n}")


        # --- 5) Results Summary ---
        print("\n=== Throughput (tokens/sec) Comparison ===")
        tps_b = toks_b/max(dt_b,1e-6)
        tps_r = toks_r/max(dt_r,1e-6)
        tps_w = toks_w/max(dt_w,1e-6)
        tps_n = toks_n/max(dt_n,1e-6)
        
        print(f"1. No Prefix Cache (Baseline): {tps_b:.2f} t/s")
        print(f"2. Cache Miss (Recompute):     {tps_r:.2f} t/s")
        print(f"3. Warm-up Run (First Cache):  {tps_w:.2f} t/s")
        print(f"4. Reuse Run (CXL Hit):        {tps_n:.2f} t/s")
        
        print("\nConclusion: The significant difference between Line 1/2 and Line 4 demonstrates that the **Prefix Caching** feature, backed by **CXL Offloading**, successfully skips the expensive prefill step and reloads the required data quickly, boosting throughput.")


[GPU] Using GPU 0 | Total VRAM: 23.99 GiB
[CPU/CXL] Offload configured for 16 GB.
INFO 09-28 17:19:46 [utils.py:328] non-default args: {'dtype': 'half', 'max_model_len': 8192, 'enable_prefix_caching': True, 'cpu_offload_gb': 16, 'gpu_memory_utilization': 0.23313385442110404, 'disable_log_stats': True, 'enforce_eager': True, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='882f0285-b90a-4b87-b1a2-e0b6937049a7', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None), 'model': 'Qwen/Qwen1.5-MoE-A2.7B'}
INFO 09-28 17:19:47 [__init__.py:742] Resolved architecture: Qwen2MoeForCausalLM
INFO 09-28 17:19:47 [__init__.py:1815] Using max model len 8192
INFO 09-28 17:19:47 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 09-28 17:19:47 [__init__.py:3400] Cudagraph is disabled under 

[1;36m(EngineCore_DP0 pid=8708)[0;0m [32;20m[2025-09-28 17:19:50,998] LMCache INFO:[0m LMCache Configuration: {'chunk_size': 256, 'local_cpu': True, 'max_local_cpu_size': '5.0 GB', 'local_disk': None, 'max_local_disk_size': '0.0 GB', 'remote_url': None, 'remote_serde': 'naive', 'use_layerwise': False, 'save_decode_cache': False, 'pre_caching_hash_algorithm': 'builtin', 'enable_blending': False, 'blend_recompute_ratios': None, 'blend_thresholds': None, 'blend_check_layers': None, 'blend_min_tokens': 256, 'blend_special_str': ' # # ', 'enable_p2p': False, 'lookup_url': None, 'distributed_url': None, 'enable_controller': False, 'lmcache_instance_id': 'lmcache_default_instance', 'controller_url': None, 'lmcache_worker_port': None, 'lmcache_worker_heartbeat_delay_time': 10, 'lmcache_worker_heartbeat_time': None, 'enable_nixl': False, 'nixl_role': None, 'nixl_receiver_host': None, 'nixl_receiver_port': None, 'nixl_buffer_size': None, 'nixl_buffer_device': None, 'nixl_enable_gc': False, '

[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[1;36m(EngineCore_DP0 pid=8708)[0;0m [32;20m[2025-09-28 17:19:59,653] LMCache INFO:[0m Creating LMCacheEngine with config: {'chunk_size': 256, 'local_cpu': True, 'max_local_cpu_size': 16.0, 'local_disk': None, 'max_local_disk_size': 0.0, 'remote_url': None, 'remote_serde': 'naive', 'use_layerwise': False, 'save_decode_cache': False, 'pre_caching_hash_algorithm': 'builtin', 'enable_blending': False, 'blend_recompute_ratios': None, 'blend_thresholds': None, 'blend_check_layers': None, 'blend_min_tokens': 256, 'blend_special_str': ' # # ', 'enable_p2p': False, 'lookup_url': None, 'distributed_url': None, 'enable_controller': False, 'lmcache_instance_id': 'lmcache_default_instance', 'controller_url': None, 'lmcache_worker_port': None, 'lmcache_worker_heartbeat_delay_time': 10, 'lmcache_worker_heartbeat_time': None, 'enable_nixl': False, 'nixl_role': None, 'nixl_receiver_host': None, 'nixl_receiver_port': None, 'nixl_buffer_size': None, 'nixl_buffer_device': None, 'nixl_enable_gc': Fals

[1;36m(EngineCore_DP0 pid=8708)[0;0m INFO 09-28 17:20:04 [gpu_model_runner.py:2338] Starting to load model Qwen/Qwen1.5-MoE-A2.7B...
[1;36m(EngineCore_DP0 pid=8708)[0;0m INFO 09-28 17:20:04 [gpu_model_runner.py:2370] Loading model from scratch...
[1;36m(EngineCore_DP0 pid=8708)[0;0m INFO 09-28 17:20:04 [cuda.py:362] Using Flash Attention backend on V1 engine.
[1;36m(EngineCore_DP0 pid=8708)[0;0m ERROR 09-28 17:20:23 [core.py:718] EngineCore failed to start.
[1;36m(EngineCore_DP0 pid=8708)[0;0m ERROR 09-28 17:20:23 [core.py:718] Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=8708)[0;0m ERROR 09-28 17:20:23 [core.py:718]   File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
[1;36m(EngineCore_DP0 pid=8708)[0;0m ERROR 09-28 17:20:23 [core.py:718]     engine_core = EngineCoreProc(*args, **kwargs)
[1;36m(EngineCore_DP0 pid=8708)[0;0m ERROR 09-28 17:20:23 [core.py:718]   File "/usr/local/lib/python3.10/dist-packages

[1;36m(EngineCore_DP0 pid=8708)[0;0m Process EngineCore_DP0:
[1;36m(EngineCore_DP0 pid=8708)[0;0m Traceback (most recent call last):
[1;36m(EngineCore_DP0 pid=8708)[0;0m   File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
[1;36m(EngineCore_DP0 pid=8708)[0;0m     self.run()
[1;36m(EngineCore_DP0 pid=8708)[0;0m   File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
[1;36m(EngineCore_DP0 pid=8708)[0;0m     self._target(*self._args, **self._kwargs)
[1;36m(EngineCore_DP0 pid=8708)[0;0m   File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 722, in run_engine_core
[1;36m(EngineCore_DP0 pid=8708)[0;0m     raise e
[1;36m(EngineCore_DP0 pid=8708)[0;0m   File "/usr/local/lib/python3.10/dist-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
[1;36m(EngineCore_DP0 pid=8708)[0;0m     engine_core = EngineCoreProc(*args, **kwargs)
[1;36m(EngineCore_DP0 pid=8708)[0;0m   File "/usr/local/lib/python3

KeyboardInterrupt: 