In [1]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.4.3-cp310-cp310-manylinux1_x86_64.whl (131.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting ninja (from vllm)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi (from vllm)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting openai (from vllm)
  Downloading openai-1.32.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.2/325.2 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [1]:
from vllm.model_executor.models import ModelRegistry

# 先保存原始的 is_embedding_model 方法，以便之后需要时可以恢复
original_is_embedding_model = ModelRegistry.is_embedding_model

# 定义一个新的方法，该方法不管传入什么都返回 True
def always_true_is_embedding_model(model_arch: str) -> bool:
    return True

# 将 ModelRegistry 类中的 is_embedding_model 方法替换为 always_true_is_embedding_model
ModelRegistry.is_embedding_model = always_true_is_embedding_model

# 现在调用 ModelRegistry.is_embedding_model 无论如何都会返回 True
print(ModelRegistry.is_embedding_model("any_model_architecture"))  # 输出 True

# 如果需要恢复原始方法，可以这样做：
# ModelRegistry.is_embedding_model = original_is_embedding_model


True


In [2]:
from typing import Iterable, List, Optional, Tuple

import torch
from torch import nn

from vllm.attention import AttentionMetadata
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import PoolerOutput

# not same as mistral one
class MyLlamaEmbeddingModel(nn.Module):
    """A model that uses Llama with additional embedding functionalities.

   This class encapsulates the LlamaModel and provides an interface for
   embedding operations and customized pooling functions.

   Attributes:
       model: An instance of LlamaModel used for forward operations.
       _pooler: An instance of Pooler used for pooling operations.
   """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__()
        self.model = LlamaModel(**kwargs)
        self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        return self.model.forward(input_ids, positions, kv_caches,
                                  attn_metadata, inputs_embeds)

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        return self._pooler(hidden_states, pooling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # 建议这个直接抄你用的那个模型的，Mistral和LLaMA不太一样
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if ("rotary_emb.cos_cached" in name
                    or "rotary_emb.sin_cached" in name):
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                if name.endswith("kv_scale"):
                    remapped_kv_scale_name = name.replace(
                        ".kv_scale", ".attn.kv_scale")
                    if remapped_kv_scale_name not in params_dict:
                        # print_warning_once(
                        #     f"Found kv scale in the checkpoint (e.g. {name}), "
                        #     "but not found the expected name in the model "
                        #     f"(e.g. {remapped_kv_scale_name}). kv-scale is "
                        #     "not loaded.")
                        continue
                    else:
                        name = remapped_kv_scale_name
                if "lm_head" in name: # 看具体情况 你lm head 可能多了
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)


In [3]:
from vllm import ModelRegistry

ModelRegistry.register_model("MyLlamaEmbeddingModel", MyLlamaEmbeddingModel)

In [12]:
!git lfs install
!git clone https://huggingface.co/princeton-nlp/Sheared-LLaMA-1.3B

Git LFS initialized.
Cloning into 'Sheared-LLaMA-1.3B'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 58 (delta 28), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (58/58), 484.80 KiB | 2.69 MiB/s, done.
Filtering content: 100% (2/2), 1.01 GiB | 4.10 MiB/s, done.
Encountered 1 file(s) that may not have been copied correctly on Windows:
	pytorch_model.bin

See: `git lfs help smudge` for more details.


In [13]:
!cat ./Sheared-LLaMA-1.3B/config.json

{
  "_name_or_path": "princeton-nlp/Sheared-LLaMA-1.3B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "use_cache": true,
  "vocab_size": 32000
}


修改model arth的名字 (Change `architectures` name to)

`MyLlamaEmbeddingModel`

In [15]:
!cat ./Sheared-LLaMA-1.3B/config.json

{
  "_name_or_path": "princeton-nlp/Sheared-LLaMA-1.3B",
  "architectures": [
    "MyLlamaEmbeddingModel"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "use_cache": true,
  "vocab_size": 32000
}


In [4]:
from vllm import LLM



In [5]:

# Create an LLM.
model = LLM(model="./Sheared-LLaMA-1.3B", enforce_eager=True, dtype="float16")


INFO 06-07 06:39:53 config.py:1151] Downcasting torch.float32 to torch.float16.
INFO 06-07 06:39:53 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='./Sheared-LLaMA-1.3B', speculative_config=None, tokenizer='./Sheared-LLaMA-1.3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=./Sheared-LLaMA-1.3B)


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


INFO 06-07 06:39:53 selector.py:120] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 06-07 06:39:53 selector.py:51] Using XFormers backend.
INFO 06-07 06:39:57 selector.py:120] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 06-07 06:39:57 selector.py:51] Using XFormers backend.
INFO 06-07 06:40:22 model_runner.py:146] Loading model weights took 2.4211 GB


In [6]:
# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


# Generate embedding. The output is a list of EmbeddingRequestOutputs.

outputs = model.encode(prompts)
# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 4096 floats

Processed prompts: 100%|██████████| 4/4 [00:02<00:00,  1.65it/s, Generation Speed: 0.00 toks/s]

[-0.0036258697509765625, -0.0118408203125, -0.0080413818359375, -0.0016002655029296875, -0.0138092041015625, -0.00799560546875, 0.02264404296875, 0.00533294677734375, 0.01311492919921875, -0.01486968994140625, 0.0104217529296875, 0.0086517333984375, -0.0049896240234375, 0.00016760826110839844, -0.005245208740234375, 0.018585205078125, 0.008819580078125, -0.004444122314453125, -0.031158447265625, 0.030181884765625, -0.00745391845703125, -0.004215240478515625, 0.018798828125, -0.005565643310546875, -0.0010042190551757812, -0.0103759765625, -0.00562286376953125, 0.01485443115234375, 0.0284271240234375, -0.006092071533203125, -0.0185089111328125, -0.0027294158935546875, 0.017608642578125, 0.019683837890625, -0.004001617431640625, -0.0016317367553710938, -0.00569915771484375, 0.00502777099609375, 0.0003733634948730469, -0.006683349609375, -0.0010585784912109375, -0.0182952880859375, -0.00502777099609375, -0.0191497802734375, 0.0212249755859375, 0.0171966552734375, 0.01096343994140625, -0.01


