In [1]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.6.0-cp38-abi3-manylinux1_x86_64.whl.metadata (2.2 kB)
Collecting fastapi (from vllm)
  Downloading fastapi-0.114.0-py3-none-any.whl.metadata (27 kB)
Collecting openai>=1.0 (from vllm)
  Downloading openai-1.44.0-py3-none-any.whl.metadata (22 kB)
Collecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting lm-format-enforcer==0.10.6 (from vllm)
  Downloading lm_format_enforcer-0.10.6-py3-none-any.whl.metadata (16 kB)
Collecting outlines<0.1,>=0.0.43 (from vllm)
  Downloading outlines-0.0.46-py3-none-any.whl.metadata (15 kB)
Collecting partial-json-parser (from vllm)
  Downloading partial_json_parser-0.2.1

In [2]:
from vllm.model_executor.models import ModelRegistry

# 先保存原始的 is_embedding_model 方法，以便之后需要时可以恢复
original_is_embedding_model = ModelRegistry.is_embedding_model

# 定义一个新的方法，该方法不管传入什么都返回 True
def always_true_is_embedding_model(model_arch: str) -> bool:
    return True

# 将 ModelRegistry 类中的 is_embedding_model 方法替换为 always_true_is_embedding_model
ModelRegistry.is_embedding_model = always_true_is_embedding_model

# 现在调用 ModelRegistry.is_embedding_model 无论如何都会返回 True
print(ModelRegistry.is_embedding_model("any_model_architecture"))  # 输出 True

# 如果需要恢复原始方法，可以这样做：
# ModelRegistry.is_embedding_model = original_is_embedding_model

INFO 09-09 14:41:22 importing.py:10] Triton not installed; certain GPU-related functions will not be available.
True


In [14]:
from typing import Iterable, List, Optional, Tuple

import torch
from torch import nn

from vllm.attention import AttentionMetadata
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import PoolerOutput
from vllm.model_executor.models.utils import is_pp_missing_parameter, make_layers

# not same as mistral one
class MyQwen2EmbeddingModel(nn.Module):
    """A model that uses Llama with additional embedding functionalities.

   This class encapsulates the LlamaModel and provides an interface for
   embedding operations and customized pooling functions.

   Attributes:
       model: An instance of LlamaModel used for forward operations.
       _pooler: An instance of Pooler used for pooling operations.
   """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__()
        self.model = Qwen2Model(**kwargs)
        self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        return self.model.forward(input_ids, positions, kv_caches,
                                  attn_metadata, inputs_embeds)

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        return self._pooler(hidden_states, pooling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            # 这里改一下哦，如果有config
            if self.model.config.tie_word_embeddings and "lm_head.weight" in name: # model.config 这个要改一下
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)


In [15]:
from vllm import ModelRegistry

ModelRegistry.register_model("MyQwen2EmbeddingModel", MyQwen2EmbeddingModel)

In [6]:
!git clone https://huggingface.co/Qwen/Qwen2-0.5B-Instruct

Cloning into 'Qwen2-0.5B-Instruct'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 36 (delta 14), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (36/36), 3.60 MiB | 5.64 MiB/s, done.


In [7]:
!cat /content/Qwen2-0.5B-Instruct/config.json

{
  "architectures": [
    "MyQwen2EmbeddingModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.1",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}


! 注意我改了！

In [16]:
from vllm import LLM

# Create an LLM.
model = LLM(model="/content/Qwen2-0.5B-Instruct", enforce_eager=True, dtype="float16")


INFO 09-09 14:48:25 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='/content/Qwen2-0.5B-Instruct', speculative_config=None, tokenizer='/content/Qwen2-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/content/Qwen2-0.5B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_caching=False, use_asy

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 09-09 14:48:27 model_runner.py:926] Loading model weights took 0.9221 GB


In [17]:

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


# Generate embedding. The output is a list of EmbeddingRequestOutputs.

outputs = model.encode(prompts)
# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 4096 floats
    print(len(output.outputs.embedding))


Processed prompts: 100%|██████████| 4/4 [00:00<00:00, 104.10it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[-0.00820159912109375, 0.0137939453125, -0.01505279541015625, 0.0088043212890625, 0.0006413459777832031, 0.01136016845703125, 0.0008006095886230469, 0.048004150390625, 0.0033931732177734375, 0.0008649826049804688, -0.016143798828125, 0.0005412101745605469, -0.0127410888671875, -0.01198577880859375, 0.0004444122314453125, 0.010528564453125, -0.08929443359375, -0.0125274658203125, -0.02325439453125, 0.01087188720703125, -0.00885009765625, 0.00020551681518554688, -0.00714111328125, 0.0033435821533203125, -0.0169830322265625, -0.005542755126953125, 0.00638580322265625, -0.006439208984375, -0.0032939910888671875, -0.0160064697265625, 0.0015850067138671875, -0.0237884521484375, 0.006378173828125, -0.006793975830078125, -0.0014524459838867188, 0.0084991455078125, 0.0031070709228515625, -0.0023250579833984375, -0.0307464599609375, -0.027801513671875, -0.0799560546875, 4.410743713378906e-05, 0.01206207275390625, -0.0012359619140625, -0.0252532958984375, 0.0167388916015625, -0.0178375244140625, 


