In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
strings_to_tokenize = [
    "Hello, world!",
    "This is a test string.",
    "Transformers are amazing for NLP tasks.",
    "Let's see how many tokens these strings take.",
]
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

In [4]:
ids = [
    tokenizer.encode(string) for string in strings_to_tokenize
]

In [5]:
padded = tokenizer.pad({"input_ids": ids}, padding="longest", return_tensors="pt")

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [6]:
padded

{'input_ids': tensor([[  9707,     11,   1879,      0, 151643, 151643, 151643, 151643, 151643,
         151643],
        [  1986,    374,    264,   1273,    914,     13, 151643, 151643, 151643,
         151643],
        [  8963,    388,    525,   7897,    369,    451,  12567,   9079,     13,
         151643],
        [ 10061,    594,   1490,   1246,   1657,  11211,   1493,   9069,   1896,
             13]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
import torch
input_ids = padded["input_ids"]
attn_mask = padded["attention_mask"]
with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attn_mask).logits


In [8]:
input_ids.shape

torch.Size([4, 10])

In [9]:
logits

tensor([[[ 7.2801,  5.6999,  0.6545,  ..., -3.7292, -3.7295, -3.7291],
         [ 8.3219, -1.3552,  0.0493,  ..., -2.5598, -2.5601, -2.5597],
         [18.1119,  8.2721,  7.2261,  ..., -0.6242, -0.6245, -0.6243],
         ...,
         [ 9.9240, 10.3055, 17.2309,  ..., -2.0765, -2.0767, -2.0766],
         [ 9.8248,  9.7491, 15.4928,  ..., -1.9129, -1.9130, -1.9129],
         [ 9.4980, 10.3333, 15.4334,  ..., -1.7121, -1.7124, -1.7121]],

        [[ 4.9903,  3.1928,  1.8066,  ..., -2.5430, -2.5436, -2.5429],
         [ 5.0394,  1.7928,  0.1334,  ..., -2.1265, -2.1262, -2.1263],
         [ 7.6685,  4.6036,  1.2015,  ..., -1.7160, -1.7166, -1.7159],
         ...,
         [11.1904, 13.1078, 13.6556,  ..., -0.8852, -0.8855, -0.8852],
         [ 9.6250, 10.4838, 11.2969,  ..., -1.2635, -1.2636, -1.2636],
         [ 9.0981,  9.5217, 11.5075,  ..., -2.3557, -2.3558, -2.3558]],

        [[ 3.6877,  5.9022,  1.8271,  ..., -2.1206, -2.1204, -2.1206],
         [ 8.4649,  9.3085,  6.2698,  ..., -0

In [10]:
torch.softmax(logits, dim=-1)

tensor([[[7.6809e-04, 1.5819e-04, 1.0185e-06,  ..., 1.2710e-08,
          1.2707e-08, 1.2712e-08],
         [1.7020e-04, 1.0672e-08, 4.3472e-08,  ..., 3.1997e-09,
          3.1985e-09, 3.1997e-09],
         [3.7424e-01, 1.9943e-05, 7.0070e-06,  ..., 2.7302e-09,
          2.7292e-09, 2.7300e-09],
         ...,
         [2.1800e-04, 3.1926e-04, 3.2493e-01,  ..., 1.3388e-09,
          1.3385e-09, 1.3387e-09],
         [3.1530e-04, 2.9232e-04, 9.1264e-02,  ..., 2.5184e-09,
          2.5179e-09, 2.5184e-09],
         [2.1285e-04, 4.9074e-04, 8.0499e-02,  ..., 2.8812e-09,
          2.8805e-09, 2.8812e-09]],

        [[4.9369e-06, 8.1810e-07, 2.0454e-07,  ..., 2.6410e-09,
          2.6394e-09, 2.6413e-09],
         [8.7292e-07, 3.3964e-08, 6.4612e-09,  ..., 6.7433e-10,
          6.7450e-10, 6.7446e-10],
         [1.6880e-05, 7.8760e-07, 2.6230e-08,  ..., 1.4182e-09,
          1.4175e-09, 1.4183e-09],
         ...,
         [1.1690e-04, 7.9528e-04, 1.3755e-03,  ..., 6.6596e-10,
          6.657

In [11]:
torch.softmax(logits, dim=-1).gather(-1, input_ids[:,:1].unsqueeze(-1))

tensor([[[3.1165e-06]],

        [[1.2550e-06]],

        [[3.1799e-06]],

        [[5.0264e-06]]])

In [12]:
new_lp = torch.tensor([
  [1,2,3],
  [4,5,6],
  [7,8,9],
  [10,11,12]
])

In [13]:
advantages = torch.tensor([1, 2, 3, 4])
advantages[:, None].expand_as()

TypeError: expand_as() missing 1 required positional arguments: "other"

In [None]:
from vllm import LLM, SamplingParams

  from .autonotebook import tqdm as notebook_tqdm


INFO 06-25 09:50:45 [__init__.py:244] Automatically detected platform cuda.


2025-06-25 09:50:46,422	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch


class ColocateWorkerExtension:
    """
    The class for vLLM's worker to inherit from, in the colocate setting.
    By defining an extension class, the code can work no matter what is
    the underlying worker class. This way, the code can be compatible
    with both vLLM V0 and V1.
    NOTE: we define this class in a separate module, and the main module
    should pass the full qualified name as `worker_extension_cls` argument.
    """

    def report_device_id(self) -> str:
        from vllm.platforms import current_platform

        self.device_uuid = current_platform.get_device_uuid(self.device.index)
        return self.device_uuid

    def init_cpu_cache(self):
        """Initializes a CPU cache for storing weight batches."""
        self.cpu_cache = {}

    def receive_and_cache_weights(self, ipc_handles_batch: dict):
        """
        Receives a batch of IPC handles and caches the corresponding tensors
        on the CPU for updating the model weights later.
        """
        if not hasattr(self, "cpu_cache"):
            self.init_cpu_cache()

        if not hasattr(self, "device_uuid"):
            self.report_device_id()

        # The ipc_handles_batch is a dictionary mapping device_uuid to
        # another dictionary of tensor_name: ipc_handle.
        if self.device_uuid not in ipc_handles_batch:
            return

        handles = ipc_handles_batch[self.device_uuid]
        device_id = self.device.index

        # Use a dedicated stream to allow for asynchronous H2D transfers.
        stream = torch.cuda.Stream()
        with torch.cuda.stream(stream):
            for name, handle in handles.items():
                func, args = handle
                list_args = list(args)
                # The 6th argument to ipc_open is the device ID. We override it
                # to ensure the tensor is created on the correct local device.
                list_args[6] = device_id
                tensor = func(*list_args)
                # Asynchronously copy the tensor to CPU to avoid blocking.
                # The tensor must be contiguous for non-blocking transfer.
                self.cpu_cache[name] = tensor.contiguous().to(
                    device="cpu", non_blocking=True
                )

        # Wait for all asynchronous transfers to complete.
        stream.synchronize()

    def load_weights_from_cache(self):
        """
        Loads the complete set of weights from the CPU cache into the model
        on the GPU.
        """
        if not hasattr(self, "cpu_cache") or not self.cpu_cache:
            print(
                "No weights in CPU cache to load. Ensure that `receive_and_cache_weights` was called."
            )
            return

        # The model_runner's load_weights method handles moving the tensors
        # from CPU to the correct GPU device.
        weights = list(self.cpu_cache.items())
        self.model_runner.model.load_weights(weights=weights)
        torch.cuda.synchronize()

        # Clean up the cache to free memory
        self.cpu_cache = {}

    def print_weights(self):
        print(self.model_runner.model.state_dict().keys())

In [None]:
engine_kwargs = {}
engine_kwargs[
    "worker_extension_cls"
] = "actors.inference.rlhf_utils.ColocateWorkerExtension"


In [None]:
import os
os.environ['VLLM_USE_V1'] = '0'

In [None]:
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", tensor_parallel_size=1, quantization='fp8', **engine_kwargs)

INFO 06-25 09:51:10 [config.py:823] This model supports multiple tasks: {'embed', 'reward', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 06-25 09:51:11 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  9.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  9.15it/s]



NORMAL 06-25 09:51:13 [default_loader.py:272] Loading weights took 0.13 seconds
NORMAL 06-25 09:51:14 [model_runner.py:1203] Model loading took 0.5922 GiB and 0.682266 seconds
NORMAL 06-25 09:51:15 [worker.py:294] Memory profiling takes 0.75 seconds
NORMAL 06-25 09:51:15 [worker.py:294] the current vLLM instance can use total_gpu_memory (23.62GiB) x gpu_memory_utilization (0.90) = 21.26GiB
NORMAL 06-25 09:51:15 [worker.py:294] model weights take 0.59GiB; non_torch_memory takes 0.32GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 18.91GiB.
NORMAL 06-25 09:51:15 [executor_base.py:113] # cuda blocks: 103276, # CPU blocks: 21845
NORMAL 06-25 09:51:15 [executor_base.py:118] Maximum concurrency for 32768 tokens per request: 50.43x
NORMAL 06-25 09:51:18 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '-

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.19it/s]


NORMAL 06-25 09:51:34 [model_runner.py:1671] Graph capturing finished in 16 secs, took 0.18 GiB
NORMAL 06-25 09:51:34 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 19.69 seconds


In [None]:
# get state dict.
llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()

OrderedDict([('model.embed_tokens.weight',
              tensor([[-0.0104,  0.0408,  0.0097,  ...,  0.0098,  0.0136, -0.0067],
                      [-0.0146, -0.0014, -0.0177,  ..., -0.0024,  0.0024, -0.0081],
                      [-0.0366, -0.0102,  0.0078,  ..., -0.0074, -0.0177, -0.0007],
                      ...,
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187]],
                     device='cuda:0', dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.qkv_proj.weight',
              tensor([[ -0.6875,   3.0000,  -6.0000,  ...,  -2.0000,   0.6875,   5.0000],
                      [ -1.8750,   0.6875,  -9.0000,  ...,  -5.5000,  -3.2500, -12.0000],
                      [  7.0000,  16.0000,  15.0000,  ...,  -3.5000,   4.5000, -10.0000],
                      ...,
        

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", torch_dtype=torch.bfloat16, device_map="cpu")

In [None]:
model.state_dict()['model.layers.0.self_attn.q_proj.weight'].shape

torch.Size([896, 896])

In [None]:
model.state_dict()['model.layers.0.self_attn.k_proj.bias'].shape

torch.Size([128])

In [None]:
model.state_dict()['model.layers.0.self_attn.v_proj.bias'].shape

torch.Size([128])

In [None]:
llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()

OrderedDict([('model.embed_tokens.weight',
              tensor([[-0.0104,  0.0408,  0.0097,  ...,  0.0098,  0.0136, -0.0067],
                      [-0.0146, -0.0014, -0.0177,  ..., -0.0024,  0.0024, -0.0081],
                      [-0.0366, -0.0102,  0.0078,  ..., -0.0074, -0.0177, -0.0007],
                      ...,
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187]],
                     device='cuda:0', dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.qkv_proj.weight',
              tensor([[ -0.6875,   3.0000,  -6.0000,  ...,  -2.0000,   0.6875,   5.0000],
                      [ -1.8750,   0.6875,  -9.0000,  ...,  -5.5000,  -3.2500, -12.0000],
                      [  7.0000,  16.0000,  15.0000,  ...,  -3.5000,   4.5000, -10.0000],
                      ...,
        

In [None]:
896+128*2

1152

In [None]:
torch.concat([
  model.state_dict()['model.layers.0.self_attn.q_proj.bias'],
  model.state_dict()['model.layers.0.self_attn.k_proj.bias'],
  model.state_dict()['model.layers.0.self_attn.v_proj.bias']
])

tensor([-0.0150,  0.0255, -0.1035,  ..., -0.0049, -0.0212,  0.0166],
       dtype=torch.bfloat16)

In [None]:
# Two stages.
# 1. Combine q_proj, k_proj, v_proj into qkv_proj both bias and weight.
import re
from typing import OrderedDict
import torch
from collections import OrderedDict
from vllm import _custom_ops as ops

def fp8_quantize_state_dict(sd):
    out = OrderedDict()
    for k, v in sd.items():
        if v.ndim == 2 and "embed" not in k and "embedding" not in k and not 'lm_head' in k and not 'bias' in k and not 'norm' in k:
            q, s = ops.scaled_fp8_quant(v.cuda(), scale=None)
            out[k] = q.T
            out[k.replace(".weight", ".weight_scale")] = s
        else:
            out[k] = v
    return out

_QKV_PAT = re.compile(r"\.self_attn\.(q|k|v)_proj\.(weight|bias)$")

def merge_qkv(state_dict):
    out_sd, cache = OrderedDict(), {}
    for k, v in state_dict.items():
        m = _QKV_PAT.search(k)
        if m is None:
            out_sd[k] = v
            continue
        prefix, typ, what = k[:m.start()], m.group(1), m.group(2)
        bucket = cache.setdefault((prefix, what), {})
        bucket[typ] = v
        if len(bucket) == 3:
            out_sd[f"{prefix}.self_attn.qkv_proj.{what}"] = torch.cat([bucket['q'], bucket['k'], bucket['v']], 0)
            del cache[(prefix, what)]
    return out_sd

_GU = re.compile(r"\.mlp\.(gate|up)_proj\.(weight|bias)$")

def merge_gate_and_up_proj(sd):
    out, buf = OrderedDict(), {}
    for k, v in sd.items():
        m = _GU.search(k)
        if m is None:
            out[k] = v
            continue
        pre, part, typ = k[:m.start()], m.group(1), m.group(2)
        b = buf.setdefault((pre, typ), {})
        b[part] = v
        if len(b) == 2:
            fused = torch.cat([b['gate'], b['up']], 0)
            out[f"{pre}.mlp.gate_up_proj.{typ}"] = fused
            del buf[(pre, typ)]
    assert not buf
    return out


In [None]:
state_dict = model.state_dict()
new_state_dict = merge_qkv(state_dict)
new_state_dict = merge_gate_and_up_proj(new_state_dict)
new_state_dict = fp8_quantize_state_dict(new_state_dict)

In [None]:
new_state_dict.keys() - llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict().keys()

set()

In [None]:
# We check if they have the same keys and shapes.
for k in new_state_dict:
    if k not in llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict():
        print(f"Key {k} is missing in the vLLM model state dict.")
    elif new_state_dict[k].shape != llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()[k].shape:
        print(f"Shape mismatch for key {k}: {new_state_dict[k].shape} vs {llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()[k].shape}")
    elif not torch.allclose(new_state_dict[k].cpu().to(torch.float32), llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()[k].to(torch.float32).cpu(), atol=1):
        print(f"Values mismatch for key {k}.")

In [None]:
llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()['model.layers.0.self_attn.qkv_proj.weight']

tensor([[ -0.6875,   3.0000,  -6.0000,  ...,  -2.0000,   0.6875,   5.0000],
        [ -1.8750,   0.6875,  -9.0000,  ...,  -5.5000,  -3.2500, -12.0000],
        [  7.0000,  16.0000,  15.0000,  ...,  -3.5000,   4.5000, -10.0000],
        ...,
        [ -2.2500,   2.5000,   3.2500,  ...,  -3.0000,  -9.0000, -10.0000],
        [ -5.5000, -15.0000,  -0.2812,  ...,  -5.5000,   2.7500,  -2.5000],
        [  1.3750,  -6.5000,  -3.5000,  ...,   3.5000,   7.0000,  -4.0000]],
       device='cuda:0', dtype=torch.float8_e4m3fn)

In [None]:
llm.llm_engine.model_executor.driver_worker.model_runner.model.load_state_dict(new_state_dict)

<All keys matched successfully>

In [None]:
ops.scaled_fp8_quant()

In [None]:
new_state_dict['model.layers.23.self_attn.o_proj.weight']

tensor([[ -1.6250, -12.0000,   9.0000,  ...,  26.0000,  -1.2500,  10.0000],
        [ 12.0000,  -0.8750,  -3.0000,  ...,   5.5000,  -3.2500,  -0.3125],
        [  2.5000, -13.0000, -14.0000,  ..., -32.0000,   4.5000, -11.0000],
        ...,
        [  2.2500,  14.0000,  -2.7500,  ...,  -5.5000,   6.0000, -11.0000],
        [-18.0000, -16.0000,  11.0000,  ...,   2.2500, -12.0000,  -8.0000],
        [ -8.0000, -10.0000,  -7.0000,  ...,  16.0000,  -2.7500,  -1.0000]],
       dtype=torch.float8_e4m3fn)

In [None]:
k = 'model.layers.23.self_attn.o_proj.weight'
(new_state_dict[k].cpu().to(torch.float32) - llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()[k].to(torch.float32).cpu())

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
new_state_dict

OrderedDict([('model.embed_tokens.weight',
              tensor([[-0.0104,  0.0408,  0.0097,  ...,  0.0098,  0.0136, -0.0067],
                      [-0.0146, -0.0014, -0.0177,  ..., -0.0024,  0.0024, -0.0081],
                      [-0.0366, -0.0102,  0.0078,  ..., -0.0074, -0.0177, -0.0007],
                      ...,
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187]],
                     dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.qkv_proj.weight',
              tensor([[-0.0019, -0.0052,  0.0188,  ..., -0.0061, -0.0153,  0.0038],
                      [ 0.0084,  0.0018,  0.0435,  ...,  0.0066, -0.0422, -0.0181],
                      [-0.0168, -0.0248,  0.0422,  ...,  0.0089, -0.0008, -0.0094],
                      ...,
                      [-0.0054, -0.0156, -0

In [1]:
from vllm import LLM, SamplingParams
import os
os.environ['VLLM_USE_V1'] = '0'

  from .autonotebook import tqdm as notebook_tqdm


INFO 06-27 20:48:30 [__init__.py:244] Automatically detected platform cuda.


2025-06-27 20:48:31,893	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
llm = LLM(model='unsloth/Qwen3-0.6B-bnb-4bit', enable_lora=True, gpu_memory_utilization=0.6)

INFO 06-27 20:48:38 [config.py:823] This model supports multiple tasks: {'embed', 'classify', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 06-27 20:48:39 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='unsloth/Qwen3-0.6B-bnb-4bit', speculative_config=None, tokenizer='unsloth/Qwen3-0.6B-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=bitsandbytes, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_end

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 66.32it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  8.30it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  8.27it/s]


INFO 06-27 20:48:41 [punica_selector.py:19] Using PunicaWrapperGPU.





INFO 06-27 20:48:41 [model_runner.py:1203] Model loading took 0.5369 GiB and 1.059959 seconds
INFO 06-27 20:48:43 [worker.py:294] Memory profiling takes 1.34 seconds
INFO 06-27 20:48:43 [worker.py:294] the current vLLM instance can use total_gpu_memory (23.62GiB) x gpu_memory_utilization (0.60) = 14.17GiB
INFO 06-27 20:48:43 [worker.py:294] model weights take 0.54GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.46GiB; the rest of the memory reserved for KV Cache is 12.10GiB.
INFO 06-27 20:48:43 [executor_base.py:113] # cuda blocks: 7080, # CPU blocks: 2340
INFO 06-27 20:48:43 [executor_base.py:118] Maximum concurrency for 40960 tokens per request: 2.77x
INFO 06-27 20:48:46 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decre

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:19<00:00,  1.79it/s]

INFO 06-27 20:49:06 [model_runner.py:1671] Graph capturing finished in 20 secs, took 0.55 GiB
INFO 06-27 20:49:06 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 24.46 seconds





In [3]:
from vllm import SamplingParams
from vllm.lora.request import LoRARequest

lora_request = LoRARequest(
  lora_name='aa',
  lora_int_id=1,
  lora_path='KhushRai78/qwen3-0.6B-GRPO-LORA',
)
llm.generate(
  "a",
  sampling_params=SamplingParams(
    temperature=0.1,
    max_tokens=1,
  ),
  lora_request=lora_request
)

Adding requests: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s]
Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 268435.46it/s]put: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.91it/s, est. speed input: 4.92 toks/s, output: 4.92 toks/s]


[RequestOutput(request_id=0, prompt='a', prompt_token_ids=[64], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='.', token_ids=(13,), cumulative_logprob=None, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1751050146.3109818, last_token_time=1751050147.1860511, first_scheduled_time=1751050146.9873798, first_token_time=1751050147.1860511, time_in_queue=0.6763980388641357, finished_time=1751050147.1862142, scheduler_time=0.00036106300831306726, model_forward_time=None, model_execute_time=None, spec_token_acceptance_counts=[0]), lora_request=LoRARequest(lora_name='aa', lora_int_id=1, lora_path='KhushRai78/qwen3-0.6B-GRPO-LORA', lora_local_path=None, long_lora_max_len=None, base_model_name=None, tensorizer_config_dict=None), num_cached_tokens=0, multi_modal_placeholders={})]

In [4]:
llm.chat(
  [
    {
      "role": "user",
      "content": "What is the capital of France?"
    }
  ],
  sampling_params=SamplingParams(
    temperature=0.1,
    max_tokens=200,
  ),
  lora_request=lora_request
)

INFO 06-27 20:49:08 [chat_utils.py:420] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Adding requests: 100%|██████████| 1/1 [00:00<00:00, 3591.01it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s, est. speed input: 21.81 toks/s, output: 187.54 toks/s]


[RequestOutput(request_id=1, prompt=None, prompt_token_ids=[151644, 872, 198, 3838, 374, 279, 6722, 315, 9625, 30, 151645, 198, 151644, 77091, 198], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text="<think>\nOkay, the user is asking about the capital of France. I need to make sure I recall the correct answer. France's capital is Paris. I should confirm that there's no confusion with other countries. For example, in some other countries, like Germany, the capital is Berlin, but that's a different country. So, the answer should be Paris. I should also mention that if there's any doubt, it's best to double-check with a reliable source. But since the user is asking directly, the answer is clear. No need to overcomplicate it.\n</think>\n\nThe capital of France is **Paris**.", token_ids=(151667, 198, 32313, 11, 279, 1196, 374, 10161, 911, 279, 6722, 315, 9625, 13, 358, 1184, 311, 1281, 2704, 358, 19091, 279, 4396, 4226, 13, 962

In [67]:
lora_request = LoRARequest(
  lora_name='aa',
  lora_int_id=1,
  lora_path='cocks',
)
llm.chat(
  [
    {
      "role": "user",
      "content": "What is the capital of France?"
    }
  ],
  sampling_params=SamplingParams(
    temperature=0.1,
    max_tokens=200,
  ),
  lora_request=lora_request
)

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 5706.54it/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it, est. speed input: 14.54 toks/s, output: 193.90 toks/s]


[RequestOutput(request_id=5, prompt=None, prompt_token_ids=[151644, 872, 198, 3838, 374, 279, 6722, 315, 9625, 30, 151645, 198, 151644, 77091, 198], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' is\n\n\n is\n\n\n\n\n\n\n\n\n\n\n\n.*.*\n\n\n\n\n\n.**\n\n\n\n\n\n\n\n\n\n****\n\n********\n\n\n\n\n\n\n\n*\n*\n\n\n\n\n\n*\n\n\n*\n\n\n\n \n***\n \n**\n**************\n \n\n\n\n\n**************** \n*******************\n\n\n\n\n\n.*\n\n\n****************\n\n.*.***\n\n**\n\n********\n\n*******\n  \n**** \n\n \n***** \n *  \n*** * * * * * * * * * * * * * * * *', token_ids=(374, 1406, 374, 198, 198, 198, 198, 34583, 198, 4908, 4908, 1406, 198, 198, 198, 4908, 9, 198, 1406, 1406, 1406, 9, 9, 9, 9, 198, 198, 334, 334, 334, 334, 198, 198, 198, 198, 198, 198, 198, 198, 9, 198, 9, 198, 198, 198, 198, 198, 198, 9, 1406, 9, 198, 1406, 715, 9, 9, 9, 198, 715, 9, 9, 198, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 198, 715, 198, 1406, 9, 9

In [6]:
keys = llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager._adapter_manager._registered_adapters[1].loras.keys()
keys

dict_keys(['model.layers.0.mlp.down_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.1.mlp.down_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.10.mlp.down_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.11.mlp.down_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.12.mlp.down_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.13.mlp.down_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.14.mlp.down_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.15.mlp.down_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.16.mlp.down_proj', 'model.layers.16.self_attn.o_proj', 'model.layers.17.mlp.down_proj', 'model.layers.17.self_attn.o_proj', 'model.layers.18.mlp.down_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.19.mlp.down_proj', 'model.layers.19.self_attn.o_proj', 'model.layers.2.mlp.down_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.20.mlp.down_proj', 'model.layers.20.self_attn.o_proj', 'model.layers.21.mlp.down_pro

In [7]:
loras = llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager._adapter_manager._registered_adapters[1].loras

In [52]:
for k in keys:
  if type(loras[k].lora_a) != list:
    loras[k].lora_a.data *= 1000
    loras[k].lora_b.data *= 1000


In [69]:
llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager._adapter_manager.modules['model.layers.0.self_attn.qkv_proj'].__dict__

{'training': True,
 '_parameters': {},
 '_buffers': {},
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': {'base_layer': QKVParallelLinear(in_features=1024, output_features=4096, bias=False, tp_size=1, gather_output=False)},
 'input_size': 1024,
 'device': device(type='cuda', index=0),
 'lora_bias_stacked': None,
 'is_merged_col_linear': False,
 'tp_size': 1,
 'output_size': 4096,
 'n_slices': 3,
 'tp_rank': 0,
 'output_slices': (2048, 1024, 1024),
 'output_ids': (0, 0, 0),
 'q_proj_shard_size': 204

In [55]:
for k in keys:
  if type(loras[k].lora_a) != list:
    print(k, loras[k].lora_a.data, loras[k].lora_b.data)


model.layers.0.mlp.down_proj tensor([[inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        ...,
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf]], dtype=torch.bfloat16) tensor([[inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        ...,
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf]], dtype=torch.bfloat16)
model.layers.0.self_attn.o_proj tensor([[inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        ...,
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf],
        [inf, inf, inf,  ..., inf, inf, inf]], dtype=torch.bfloat16) ten

In [None]:

llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager._adapter_manager._registered_adapters[1].loras['model.layers.23.mlp.gate_up_proj'].__dict__

{'module_name': 'model.layers.23.mlp.gate_proj',
 'rank': 16,
 'lora_alpha': 0,
 'lora_a': [tensor([[ 0.0039,  0.0060,  0.0198,  ..., -0.0090, -0.0079, -0.0195],
          [ 0.0151,  0.0311, -0.0204,  ..., -0.0132,  0.0058,  0.0074],
          [-0.0310,  0.0187,  0.0211,  ..., -0.0200,  0.0178, -0.0107],
          ...,
          [-0.0228,  0.0066, -0.0053,  ..., -0.0195,  0.0143,  0.0109],
          [ 0.0277,  0.0309, -0.0030,  ...,  0.0069, -0.0024,  0.0262],
          [ 0.0146, -0.0042, -0.0260,  ...,  0.0280, -0.0040,  0.0195]],
         dtype=torch.bfloat16),
  tensor([[-0.0034,  0.0114, -0.0123,  ..., -0.0175, -0.0064,  0.0231],
          [-0.0053,  0.0178, -0.0254,  ...,  0.0251,  0.0231,  0.0190],
          [ 0.0222,  0.0042, -0.0225,  ...,  0.0029,  0.0028,  0.0245],
          ...,
          [-0.0068, -0.0264,  0.0308,  ...,  0.0278,  0.0030,  0.0177],
          [-0.0084, -0.0265,  0.0056,  ..., -0.0234, -0.0179, -0.0265],
          [-0.0229,  0.0167,  0.0302,  ..., -0.0088, -0

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base", device_map="auto")
from peft import PeftModel
model = PeftModel.from_pretrained(model, "lora_adapter/")
print(model)

  from .autonotebook import tqdm as notebook_tqdm


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(



In [None]:
model.state_dict().keys()

odict_keys(['base_model.model.model.embed_tokens.weight', 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight', 'base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.q_norm.weight', 'base_model

OrderedDict([('base_model.model.model.embed_tokens.weight',
              tensor([[-0.0093,  0.0337, -0.0747,  ...,  0.0120, -0.0106,  0.0160],
                      [ 0.0320,  0.0238, -0.0593,  ..., -0.0023, -0.0349,  0.0090],
                      [ 0.0267,  0.0339, -0.0198,  ..., -0.0099,  0.0063,  0.0226],
                      ...,
                      [ 0.0060,  0.0131,  0.0190,  ...,  0.0020,  0.0075,  0.0057],
                      [ 0.0060,  0.0131,  0.0190,  ...,  0.0020,  0.0075,  0.0057],
                      [ 0.0060,  0.0131,  0.0190,  ...,  0.0020,  0.0075,  0.0057]],
                     device='cuda:0', dtype=torch.float16)),
             ('base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight',
              tensor([[134],
                      [ 76],
                      [166],
                      ...,
                      [ 51],
                      [ 11],
                      [143]], device='cuda:0', dtype=torch.uint8)),
             ('base_mod

In [None]:
llm.llm_engine.model_executor.add_lora(
  lora_request=
)

  lora_request=LoRARequest(


IndexError: tuple index out of range

In [None]:
llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager.__dict__

{'_lora_model_cls': vllm.lora.models.LoRAModel,
 'embedding_modules': {},
 'embedding_padding_modules': [],
 '_cached_dummy_lora': False,
 'max_num_seqs': 256,
 'max_num_batched_tokens': 40960,
 'vocab_size': 151936,
 'lora_config': LoRAConfig(max_lora_rank=512, max_loras=1, fully_sharded_loras=False, max_cpu_loras=1, lora_dtype=torch.bfloat16, lora_extra_vocab_size=256, long_lora_scaling_factors=None, bias_enabled=False),
 'max_position_embeddings': 40960,
 'device': device(type='cuda'),
 '_adapter_manager': <vllm.lora.models.LRUCacheLoRAModelManager at 0x7896360483d0>}

In [None]:
llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager._adapter_manager._registered_adapters

LoRALRUCache({1: <vllm.lora.models.LoRAModel object at 0x7895bc334ca0>}, maxsize=1, currsize=1)

In [None]:
llm.llm_engine.model_executor.driver_worker.worker.model_runner.lora_manager._adapter_manager._registered_adapters[1].__dict__

{'id': 1,
 'scaling_factor': None,
 'rank': 16,
 'loras': {'model.layers.0.mlp.down_proj': <vllm.lora.lora.LoRALayerWeights at 0x7608001167d0>,
  'model.layers.0.self_attn.o_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f85a74f0>,
  'model.layers.1.mlp.down_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867c580>,
  'model.layers.1.self_attn.o_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867d090>,
  'model.layers.10.mlp.down_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867d150>,
  'model.layers.10.self_attn.o_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867c8b0>,
  'model.layers.11.mlp.down_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867c700>,
  'model.layers.11.self_attn.o_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867c610>,
  'model.layers.12.mlp.down_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867c5b0>,
  'model.layers.12.self_attn.o_proj': <vllm.lora.lora.LoRALayerWeights at 0x7607f867cf70>,
  'model.layers.13.mlp.down_proj': <vllm.lora.lora.L

In [None]:
llm.llm_engine.model_executor.driver_worker.worker.model_runner.add_lora

NameError: name 'llm' is not defined

In [None]:
llm.llm_engine.add_lora(
    lora_request=LoRARequest(
      lora_name='lora_1',
      lora_int_id=1,
      lora_local_path='local_path_here',
  )
)

In [None]:
llm.llm_engine.model_executor.driver_worker.model_runner.model.load_lora

AttributeError: 'Qwen3ForCausalLM' object has no attribute 'load_lora'