In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sample sentences of different lengths, so batching them requires padding.
strings_to_tokenize = [
    "Hello, world!",
    "This is a test string.",
    "Transformers are amazing for NLP tasks.",
    "Let's see how many tokens these strings take.",
]

# The tokenizer and the causal LM are loaded from the same checkpoint.
checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [7]:
# Encode every sentence on its own: one un-padded list of token ids per string.
ids = list(map(tokenizer.encode, strings_to_tokenize))

[[9707, 11, 1879, 0],
 [1986, 374, 264, 1273, 914, 13],
 [8963, 388, 525, 7897, 369, 451, 12567, 9079, 13],
 [10061, 594, 1490, 1246, 1657, 11211, 1493, 9069, 1896, 13]]

In [10]:
# Pad all sequences to the longest one and return PyTorch tensors.
padded = tokenizer.pad(
    dict(input_ids=ids),
    return_tensors="pt",
    padding="longest",
)

In [13]:
# Show the padded batch: the padded token ids and the matching attention mask.
padded

{'input_ids': tensor([[  9707,     11,   1879,      0, 151643, 151643, 151643, 151643, 151643,
         151643],
        [  1986,    374,    264,   1273,    914,     13, 151643, 151643, 151643,
         151643],
        [  8963,    388,    525,   7897,    369,    451,  12567,   9079,     13,
         151643],
        [ 10061,    594,   1490,   1246,   1657,  11211,   1493,   9069,   1896,
             13]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
import torch
# Pull the two tensors the model needs out of the padded batch.
input_ids, attn_mask = (
    padded[field] for field in ("input_ids", "attention_mask")
)

# Forward pass with autograd disabled; only the logits are kept.
with torch.no_grad():
    logits = model(attention_mask=attn_mask, input_ids=input_ids).logits


In [16]:
# Shape of the padded id tensor (the output below shows 4 rows of 10 ids).
input_ids.shape

torch.Size([4, 10])

In [18]:
# Display the raw logits returned by the forward pass.
logits

tensor([[[ 7.2801,  5.6999,  0.6545,  ..., -3.7292, -3.7295, -3.7291],
         [ 8.3219, -1.3552,  0.0493,  ..., -2.5598, -2.5601, -2.5597],
         [18.1119,  8.2721,  7.2261,  ..., -0.6242, -0.6245, -0.6243],
         ...,
         [ 9.9240, 10.3055, 17.2309,  ..., -2.0765, -2.0767, -2.0766],
         [ 9.8248,  9.7491, 15.4928,  ..., -1.9129, -1.9130, -1.9129],
         [ 9.4980, 10.3333, 15.4334,  ..., -1.7121, -1.7124, -1.7121]],

        [[ 4.9903,  3.1928,  1.8066,  ..., -2.5430, -2.5436, -2.5429],
         [ 5.0394,  1.7928,  0.1334,  ..., -2.1265, -2.1262, -2.1263],
         [ 7.6685,  4.6036,  1.2015,  ..., -1.7160, -1.7166, -1.7159],
         ...,
         [11.1904, 13.1078, 13.6556,  ..., -0.8852, -0.8855, -0.8852],
         [ 9.6250, 10.4838, 11.2969,  ..., -1.2635, -1.2636, -1.2636],
         [ 9.0981,  9.5217, 11.5075,  ..., -2.3557, -2.3558, -2.3558]],

        [[ 3.6877,  5.9022,  1.8271,  ..., -2.1206, -2.1204, -2.1206],
         [ 8.4649,  9.3085,  6.2698,  ..., -0

In [19]:
# Normalise the logits over the last dimension to obtain probabilities.
logits.softmax(dim=-1)

tensor([[[7.6809e-04, 1.5819e-04, 1.0185e-06,  ..., 1.2710e-08,
          1.2707e-08, 1.2712e-08],
         [1.7020e-04, 1.0672e-08, 4.3472e-08,  ..., 3.1997e-09,
          3.1985e-09, 3.1997e-09],
         [3.7424e-01, 1.9943e-05, 7.0070e-06,  ..., 2.7302e-09,
          2.7292e-09, 2.7300e-09],
         ...,
         [2.1800e-04, 3.1926e-04, 3.2493e-01,  ..., 1.3388e-09,
          1.3385e-09, 1.3387e-09],
         [3.1530e-04, 2.9232e-04, 9.1264e-02,  ..., 2.5184e-09,
          2.5179e-09, 2.5184e-09],
         [2.1285e-04, 4.9074e-04, 8.0499e-02,  ..., 2.8812e-09,
          2.8805e-09, 2.8812e-09]],

        [[4.9369e-06, 8.1810e-07, 2.0454e-07,  ..., 2.6410e-09,
          2.6394e-09, 2.6413e-09],
         [8.7292e-07, 3.3964e-08, 6.4612e-09,  ..., 6.7433e-10,
          6.7450e-10, 6.7446e-10],
         [1.6880e-05, 7.8760e-07, 2.6230e-08,  ..., 1.4182e-09,
          1.4175e-09, 1.4183e-09],
         ...,
         [1.1690e-04, 7.9528e-04, 1.3755e-03,  ..., 6.6596e-10,
          6.657

In [22]:
# Look up, in the softmax output, the probability at the index given by each
# sequence's first token id (index tensor has shape [batch, 1, 1]).
# NOTE(review): if the goal is per-token probabilities under a causal LM, the
# logits usually have to be shifted by one position relative to the ids —
# confirm that this un-shifted lookup is intended.
torch.gather(torch.softmax(logits, dim=-1), -1, input_ids[:, :1, None])

tensor([[[3.1165e-06]],

        [[1.2550e-06]],

        [[3.1799e-06]],

        [[5.0264e-06]]])

In [23]:
# Toy 4x3 integer tensor holding 1..12 in row-major order.
new_lp = torch.arange(1, 13).reshape(4, 3)

In [None]:
# One scalar per row of `new_lp`.
advantages = torch.tensor([1, 2, 3, 4])
# Add a trailing axis ([4] -> [4, 1]) and broadcast it to the shape of
# `new_lp` ([4, 3]). `expand_as` requires the target tensor as its argument;
# calling it with no argument raises a TypeError.
advantages[:, None].expand_as(new_lp)

In [1]:
from vllm import LLM, SamplingParams

  from .autonotebook import tqdm as notebook_tqdm


INFO 06-25 09:50:45 [__init__.py:244] Automatically detected platform cuda.


2025-06-25 09:50:46,422	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch


class ColocateWorkerExtension:
    """
    Extension class that a vLLM worker inherits from in the colocate setting.

    Because the behaviour lives in an extension class, it does not depend on
    the concrete worker class underneath, so the same code can be used with
    both vLLM V0 and V1.

    NOTE: this class is meant to live in a separate module; the main module
    passes its fully qualified name as the `worker_extension_cls` argument.
    """

    def report_device_id(self) -> str:
        """Look up this worker's device UUID, store it on `self`, and return it."""
        from vllm.platforms import current_platform

        uuid = current_platform.get_device_uuid(self.device.index)
        self.device_uuid = uuid
        return self.device_uuid

    def init_cpu_cache(self):
        """Create (or reset) the empty name -> tensor cache kept on the CPU."""
        self.cpu_cache = dict()

    def receive_and_cache_weights(self, ipc_handles_batch: dict):
        """
        Rebuild the tensors described by a batch of IPC handles and stash CPU
        copies of them in `self.cpu_cache` for a later weight update.

        Args:
            ipc_handles_batch: mapping of device_uuid to another mapping of
                tensor_name -> ipc_handle, where each handle unpacks into a
                `(func, args)` pair. Batches without an entry for this
                worker's device are ignored.
        """
        # Lazily create the state this method relies on.
        if not hasattr(self, "cpu_cache"):
            self.init_cpu_cache()
        if not hasattr(self, "device_uuid"):
            self.report_device_id()

        # Nothing addressed to this device in the batch.
        if self.device_uuid not in ipc_handles_batch:
            return

        local_handles = ipc_handles_batch[self.device_uuid]
        local_device = self.device.index

        # Issue the GPU -> CPU (device-to-host) copies on a dedicated stream.
        copy_stream = torch.cuda.Stream()
        with torch.cuda.stream(copy_stream):
            for tensor_name, (rebuild, rebuild_args) in local_handles.items():
                patched_args = list(rebuild_args)
                # Index 6 of the rebuild arguments is treated as the device
                # id; it is overwritten so that the tensor is created on this
                # worker's local device.
                patched_args[6] = local_device
                gpu_tensor = rebuild(*patched_args)
                # Make the tensor contiguous, then request a non-blocking
                # copy to the CPU.
                host_copy = gpu_tensor.contiguous().to(
                    device="cpu", non_blocking=True
                )
                self.cpu_cache[tensor_name] = host_copy

        # Block until everything queued on the copy stream has finished.
        copy_stream.synchronize()

    def load_weights_from_cache(self):
        """
        Hand every cached (name, tensor) pair to the model's `load_weights`,
        then empty the cache. Prints a hint and returns if nothing is cached.
        """
        if not getattr(self, "cpu_cache", None):
            print(
                "No weights in CPU cache to load. Ensure that `receive_and_cache_weights` was called."
            )
            return

        # Presumably `load_weights` takes care of moving the CPU tensors onto
        # the right GPU device — verify against the model implementation.
        cached_pairs = list(self.cpu_cache.items())
        self.model_runner.model.load_weights(weights=cached_pairs)
        torch.cuda.synchronize()

        # Drop the cached tensors to free memory.
        self.cpu_cache = {}

    def print_weights(self):
        """Print the parameter names of the model held by the model runner."""
        parameter_names = self.model_runner.model.state_dict().keys()
        print(parameter_names)

In [3]:
# Extra engine arguments: the fully qualified name of the worker extension class.
engine_kwargs = {
    "worker_extension_cls": "actors.inference.rlhf_utils.ColocateWorkerExtension",
}


In [4]:
import os
# Set VLLM_USE_V1 to '0' before the engine is built (the engine log further
# down reports "Initializing a V0 LLM engine").
os.environ.update(VLLM_USE_V1='0')

In [5]:
# Build the vLLM engine on a single GPU with FP8 quantization, forwarding the
# worker-extension setting collected in `engine_kwargs`.
llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    quantization='fp8',
    tensor_parallel_size=1,
    **engine_kwargs,
)

INFO 06-25 09:51:10 [config.py:823] This model supports multiple tasks: {'embed', 'reward', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 06-25 09:51:11 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  9.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  9.15it/s]



NORMAL 06-25 09:51:13 [default_loader.py:272] Loading weights took 0.13 seconds
NORMAL 06-25 09:51:14 [model_runner.py:1203] Model loading took 0.5922 GiB and 0.682266 seconds
NORMAL 06-25 09:51:15 [worker.py:294] Memory profiling takes 0.75 seconds
NORMAL 06-25 09:51:15 [worker.py:294] the current vLLM instance can use total_gpu_memory (23.62GiB) x gpu_memory_utilization (0.90) = 21.26GiB
NORMAL 06-25 09:51:15 [worker.py:294] model weights take 0.59GiB; non_torch_memory takes 0.32GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 18.91GiB.
NORMAL 06-25 09:51:15 [executor_base.py:113] # cuda blocks: 103276, # CPU blocks: 21845
NORMAL 06-25 09:51:15 [executor_base.py:118] Maximum concurrency for 32768 tokens per request: 50.43x
NORMAL 06-25 09:51:18 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '-

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.19it/s]


NORMAL 06-25 09:51:34 [model_runner.py:1671] Graph capturing finished in 16 secs, took 0.18 GiB
NORMAL 06-25 09:51:34 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 19.69 seconds


In [14]:
# Get the state dict of the model held by vLLM's driver worker and display it
# (every parameter tensor is printed, so the output is long).
llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()

OrderedDict([('model.embed_tokens.weight',
              tensor([[-0.0104,  0.0408,  0.0097,  ...,  0.0098,  0.0136, -0.0067],
                      [-0.0146, -0.0014, -0.0177,  ..., -0.0024,  0.0024, -0.0081],
                      [-0.0366, -0.0102,  0.0078,  ..., -0.0074, -0.0177, -0.0007],
                      ...,
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187]],
                     device='cuda:0', dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.qkv_proj.weight',
              tensor([[ -0.6875,   3.0000,  -6.0000,  ...,  -2.0000,   0.6875,   5.0000],
                      [ -1.8750,   0.6875,  -9.0000,  ...,  -5.5000,  -3.2500, -12.0000],
                      [  7.0000,  16.0000,  15.0000,  ...,  -3.5000,   4.5000, -10.0000],
                      ...,
        

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Reference copy of the checkpoint, loaded in bfloat16 on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

In [40]:
# Shape of the layer-0 query projection weight in the reference model.
model.state_dict()['model.layers.0.self_attn.q_proj.weight'].shape

torch.Size([896, 896])

In [30]:
# Shape of the layer-0 key projection bias in the reference model.
model.state_dict()['model.layers.0.self_attn.k_proj.bias'].shape

torch.Size([128])

In [31]:
# Shape of the layer-0 value projection bias in the reference model.
model.state_dict()['model.layers.0.self_attn.v_proj.bias'].shape

torch.Size([128])

In [47]:
# Display vLLM's state dict again; note the fused `qkv_proj` key where the
# reference model has separate q/k/v projections.
llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()

OrderedDict([('model.embed_tokens.weight',
              tensor([[-0.0104,  0.0408,  0.0097,  ...,  0.0098,  0.0136, -0.0067],
                      [-0.0146, -0.0014, -0.0177,  ..., -0.0024,  0.0024, -0.0081],
                      [-0.0366, -0.0102,  0.0078,  ..., -0.0074, -0.0177, -0.0007],
                      ...,
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187]],
                     device='cuda:0', dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.qkv_proj.weight',
              tensor([[ -0.6875,   3.0000,  -6.0000,  ...,  -2.0000,   0.6875,   5.0000],
                      [ -1.8750,   0.6875,  -9.0000,  ...,  -5.5000,  -3.2500, -12.0000],
                      [  7.0000,  16.0000,  15.0000,  ...,  -3.5000,   4.5000, -10.0000],
                      ...,
        

In [32]:
# Size of the fused q/k/v dimension: 896 (q) plus 128 each for k and v, using
# the shapes displayed in the cells above.
896+128*2

1152

In [38]:
# Concatenate the layer-0 q, k and v biases (in that order) into one vector.
torch.cat([
    model.state_dict()[f'model.layers.0.self_attn.{proj}_proj.bias']
    for proj in ('q', 'k', 'v')
])

tensor([-0.0150,  0.0255, -0.1035,  ..., -0.0049, -0.0212,  0.0166],
       dtype=torch.bfloat16)

In [None]:
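# Three stages, applied in this order by the conversion cell below:
# 1. Fuse q_proj, k_proj and v_proj into qkv_proj (both bias and weight).
# 2. Fuse the MLP gate_proj and up_proj into gate_up_proj.
# 3. FP8-quantize the remaining 2-D projection weights.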
import re
from typing import OrderedDict
import torch
from collections import OrderedDict
from vllm import _custom_ops as ops

def fp8_quantize_state_dict(sd):
    """
    Return a new state dict in which the 2-D projection weights of ``sd`` are
    replaced by FP8-quantized tensors plus a matching ``*.weight_scale`` entry.

    A tensor is quantized only when it is 2-D, its key ends in ``.weight`` and
    the key contains none of ``embed``, ``lm_head``, ``bias`` or ``norm``.
    Every other entry is passed through untouched (same tensor object).

    Args:
        sd: mapping of parameter name -> tensor.

    Returns:
        OrderedDict in the iteration order of ``sd``. Each quantized weight is
        stored transposed under its original key and is immediately followed
        by its scale under ``<prefix>.weight_scale``.
    """
    skip_markers = ("embed", "lm_head", "bias", "norm")
    weight_suffix = ".weight"

    out = OrderedDict()
    for k, v in sd.items():
        quantizable = (
            v.ndim == 2
            # Requiring the suffix guarantees the scale key below is distinct
            # from the weight key, so the scale can never overwrite the
            # quantized tensor.
            and k.endswith(weight_suffix)
            and not any(marker in k for marker in skip_markers)
        )
        if not quantizable:
            out[k] = v
            continue

        # The op is given a CUDA tensor and no precomputed scale; it returns
        # the quantized tensor together with the scale it used.
        q, s = ops.scaled_fp8_quant(v.cuda(), scale=None)
        out[k] = q.T
        # Rewrite only the trailing ".weight", not every occurrence in the key.
        out[k[: -len(weight_suffix)] + ".weight_scale"] = s
    return out

# Matches the separate q/k/v projection parameters of an attention block.
# Group 1 is the projection letter, group 2 is "weight" or "bias".
_QKV_PAT = re.compile(r"\.self_attn\.(q|k|v)_proj\.(weight|bias)$")

def merge_qkv(state_dict):
    """
    Fuse the per-layer ``q_proj`` / ``k_proj`` / ``v_proj`` tensors into a
    single ``qkv_proj`` tensor.

    Args:
        state_dict: mapping of parameter name -> tensor.

    Returns:
        OrderedDict with every non-matching entry copied over unchanged and,
        for each (layer prefix, weight|bias) group, one
        ``<prefix>.self_attn.qkv_proj.<weight|bias>`` entry holding q, k and v
        concatenated along dim 0 in that order. The fused entry is inserted at
        the position where the last of the three pieces was seen.

    Raises:
        AssertionError: if some group does not have all of q, k and v. Without
            this check such tensors would be dropped from the result silently.
    """
    out_sd, cache = OrderedDict(), {}
    for k, v in state_dict.items():
        m = _QKV_PAT.search(k)
        if m is None:
            out_sd[k] = v
            continue
        prefix, typ, what = k[:m.start()], m.group(1), m.group(2)
        # Collect the pieces of one (layer, weight|bias) group until complete.
        bucket = cache.setdefault((prefix, what), {})
        bucket[typ] = v
        if len(bucket) == 3:
            out_sd[f"{prefix}.self_attn.qkv_proj.{what}"] = torch.cat(
                [bucket['q'], bucket['k'], bucket['v']], 0
            )
            del cache[(prefix, what)]
    # Same completeness check that merge_gate_and_up_proj performs.
    assert not cache, f"incomplete q/k/v groups would be dropped: {sorted(cache)}"
    return out_sd

# Matches the separate gate/up projection parameters of an MLP block.
# Group 1 is "gate" or "up", group 2 is "weight" or "bias".
_GU = re.compile(r"\.mlp\.(gate|up)_proj\.(weight|bias)$")

def merge_gate_and_up_proj(sd):
    """
    Fuse the per-layer MLP ``gate_proj`` and ``up_proj`` tensors into a single
    ``gate_up_proj`` tensor (gate first, then up, concatenated along dim 0).

    Entries that do not match are copied over unchanged. A fused entry is
    inserted at the position where the second half of its pair was seen.
    Raises AssertionError if a pair is left incomplete.
    """
    merged = OrderedDict()
    pending = {}
    for key, tensor in sd.items():
        hit = _GU.search(key)
        if hit is not None:
            layer_prefix = key[:hit.start()]
            which, kind = hit.groups()
            group_id = (layer_prefix, kind)
            halves = pending.setdefault(group_id, {})
            halves[which] = tensor
            if len(halves) == 2:
                merged[f"{layer_prefix}.mlp.gate_up_proj.{kind}"] = torch.cat(
                    (halves['gate'], halves['up']), dim=0
                )
                pending.pop(group_id)
        else:
            merged[key] = tensor
    assert not pending
    return merged


In [136]:
# Convert the reference weights to vLLM's layout: fuse q/k/v, fuse gate/up,
# then FP8-quantize the result.
state_dict = model.state_dict()
new_state_dict = fp8_quantize_state_dict(
    merge_gate_and_up_proj(merge_qkv(state_dict))
)

In [137]:
# Symmetric difference of the two key sets: an empty set means the converted
# state dict and vLLM's state dict have exactly the same keys. A plain `-`
# would only reveal extra converted keys, not keys that vLLM has and the
# conversion failed to produce.
new_state_dict.keys() ^ llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict().keys()

set()

In [138]:
# Check that both state dicts have the same keys, and that tensors under the
# same key agree in shape and (within a loose atol=1) in value.
# The vLLM state dict is fetched once instead of being rebuilt for every
# lookup inside the loop.
vllm_state_dict = llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()
for k, converted in new_state_dict.items():
    if k not in vllm_state_dict:
        print(f"Key {k} is missing in the vLLM model state dict.")
        continue
    reference = vllm_state_dict[k]
    if converted.shape != reference.shape:
        print(f"Shape mismatch for key {k}: {converted.shape} vs {reference.shape}")
    elif not torch.allclose(converted.cpu().to(torch.float32), reference.to(torch.float32).cpu(), atol=1):
        print(f"Values mismatch for key {k}.")

# Reverse direction: keys vLLM has that the conversion did not produce.
for missing_key in vllm_state_dict.keys() - new_state_dict.keys():
    print(f"Key {missing_key} is missing in the converted state dict.")

In [126]:
# Inspect vLLM's fused layer-0 QKV weight (the output shows it stored as
# float8_e4m3fn on cuda:0).
llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()['model.layers.0.self_attn.qkv_proj.weight']

tensor([[ -0.6875,   3.0000,  -6.0000,  ...,  -2.0000,   0.6875,   5.0000],
        [ -1.8750,   0.6875,  -9.0000,  ...,  -5.5000,  -3.2500, -12.0000],
        [  7.0000,  16.0000,  15.0000,  ...,  -3.5000,   4.5000, -10.0000],
        ...,
        [ -2.2500,   2.5000,   3.2500,  ...,  -3.0000,  -9.0000, -10.0000],
        [ -5.5000, -15.0000,  -0.2812,  ...,  -5.5000,   2.7500,  -2.5000],
        [  1.3750,  -6.5000,  -3.5000,  ...,   3.5000,   7.0000,  -4.0000]],
       device='cuda:0', dtype=torch.float8_e4m3fn)

In [139]:
# Load the converted tensors into the live vLLM model; the result below
# reports that all keys matched.
llm.llm_engine.model_executor.driver_worker.model_runner.model.load_state_dict(new_state_dict)

<All keys matched successfully>

In [None]:
# NOTE(review): scratch call with no arguments. Elsewhere in this notebook the
# op is called as ops.scaled_fp8_quant(tensor, scale=None), so running this
# cell as written presumably raises a TypeError — confirm, and drop the cell
# if it is no longer needed.
ops.scaled_fp8_quant()

In [109]:
# Converted layer-23 output projection weight (float8_e4m3fn per the output).
new_state_dict['model.layers.23.self_attn.o_proj.weight']

tensor([[ -1.6250, -12.0000,   9.0000,  ...,  26.0000,  -1.2500,  10.0000],
        [ 12.0000,  -0.8750,  -3.0000,  ...,   5.5000,  -3.2500,  -0.3125],
        [  2.5000, -13.0000, -14.0000,  ..., -32.0000,   4.5000, -11.0000],
        ...,
        [  2.2500,  14.0000,  -2.7500,  ...,  -5.5000,   6.0000, -11.0000],
        [-18.0000, -16.0000,  11.0000,  ...,   2.2500, -12.0000,  -8.0000],
        [ -8.0000, -10.0000,  -7.0000,  ...,  16.0000,  -2.7500,  -1.0000]],
       dtype=torch.float8_e4m3fn)

In [133]:
# Element-wise difference, in float32 on the CPU, between the converted tensor
# and vLLM's tensor for a single key.
k = 'model.layers.23.self_attn.o_proj.weight'
torch.sub(
    new_state_dict[k].cpu().to(torch.float32),
    llm.llm_engine.model_executor.driver_worker.model_runner.model.state_dict()[k].to(torch.float32).cpu(),
)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [46]:
# Display the whole converted state dict (every tensor is printed).
new_state_dict

OrderedDict([('model.embed_tokens.weight',
              tensor([[-0.0104,  0.0408,  0.0097,  ...,  0.0098,  0.0136, -0.0067],
                      [-0.0146, -0.0014, -0.0177,  ..., -0.0024,  0.0024, -0.0081],
                      [-0.0366, -0.0102,  0.0078,  ..., -0.0074, -0.0177, -0.0007],
                      ...,
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187],
                      [ 0.0060, -0.0053,  0.0033,  ..., -0.0082, -0.0082,  0.0187]],
                     dtype=torch.bfloat16)),
             ('model.layers.0.self_attn.qkv_proj.weight',
              tensor([[-0.0019, -0.0052,  0.0188,  ..., -0.0061, -0.0153,  0.0038],
                      [ 0.0084,  0.0018,  0.0435,  ...,  0.0066, -0.0422, -0.0181],
                      [-0.0168, -0.0248,  0.0422,  ...,  0.0089, -0.0008, -0.0094],
                      ...,
                      [-0.0054, -0.0156, -0