## vLLM 本地大模型部署和推理

In [2]:
# Qwen2-vLLM-Local.py
import os
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
 
# Set VLLM_TARGET_DEVICE to 'cuda' for this process.
os.environ['VLLM_TARGET_DEVICE'] = 'cuda'

# Directory that holds the downloaded model weights.
model_dir = './Qwen2___5-3B-Instruct'

# Load the tokenizer from the local directory only.
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)

# Conversation: a system instruction followed by the user's question.
messages = [
    dict(role='system', content='You are a helpful assistant.'),
    dict(role='user', content='天空为什么是蓝色的？'),
]
# Render the conversation with the model's chat template as plain text
# (tokenize=False) and append the generation prompt for the assistant turn.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Build the engine on CUDA; tensor_parallel_size=1 means no tensor parallelism.
llm = LLM(model=model_dir, tensor_parallel_size=1, device='cuda')

# Sampling settings; generation is capped at 512 new tokens.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=512,
)

# Run inference on a one-element batch of prompts.
outputs = llm.generate([text], sampling_params)

# Print each prompt together with its first generated completion.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f'Prompt提示词: {prompt!r}, 大模型推理输出: {generated_text!r}')

INFO 10-01 22:45:06 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='./Qwen2___5-3B-Instruct', speculative_config=None, tokenizer='./Qwen2___5-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen2___5-3B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, multi_step_stream_outputs=False

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-01 22:45:12 model_runner.py:1025] Loading model weights took 5.7915 GB
INFO 10-01 22:45:14 gpu_executor.py:122] # GPU blocks: 48781, # CPU blocks: 7281
INFO 10-01 22:45:17 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-01 22:45:17 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-01 22:45:33 model_runner.py:1456] Graph capturing finished in 16 secs.


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it, est. speed input: 17.07 toks/s, output: 101.72 toks/s]

Prompt提示词: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n天空为什么是蓝色的？<|im_end|>\n<|im_start|>assistant\n', 大模型推理输出: '这是由于大气散射的原因。天空呈现出蓝色，主要是因为太阳光中的短波光（如蓝光和紫光）更容易被大气中的气体分子（主要是氮气和氧气）散射。尽管所有颜色的光都被散射了，但蓝光由于波长较短，更容易被散射到各个方向，因此我们看到的天空就是蓝色的。\n\n在日出和日落时分，太阳接近地平线，光线需要穿过更多的大气层才能到达观察者的眼睛，这个过程中蓝光和紫光都被散射掉了，剩下的主要是红光、橙光和黄光，这就是为什么日出和日落时天空会呈现红色、橙色或黄色的原因。'





## 发布 API 服务和调用推理

本地部署推理只能在一台服务器上完成，我们也可以通过 vLLM 把本地大模型部署成兼容 OpenAI API 的服务：

In [None]:
# Start vLLM's OpenAI-style API server for the local model directory.
python -m vllm.entrypoints.openai.api_server --model ./Qwen2___5-3B-Instruct
# Variant of the same command with the port and bind host given explicitly.
python -m vllm.entrypoints.openai.api_server --model ./Qwen2___5-3B-Instruct --port 8000 --host 0.0.0.0

API 服务部署成功之后，可以通过 CURL 命令验证服务：

In [None]:
# Qwen2-vLLM-CURL.py
# POST a chat-completion request to the server on localhost:8000. The JSON
# body carries the model path, the chat messages and the sampling settings.
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "./Qwen2___5-3B-Instruct",
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "天空为什么是蓝色的？"}
  ],
  "temperature": 0.7,
  "top_p": 0.8,
  "repetition_penalty": 1.05,
  "max_tokens": 512
}'

或者，我们也可以通过 Python 客户端调用 API 访问服务：

若没有安装openai依赖包，需要提前安装一下：`pip install openai`

In [None]:
# Qwen2-vLLM-OpenAI.py
from openai import OpenAI
 
# Create an OpenAI client that targets the local server instead of api.openai.com.
# 'EMPTY' is a placeholder key (assumes the server was started without an API key).
client = OpenAI(
    api_key='EMPTY',
    base_url='http://localhost:8000/v1',
)

# Request a chat completion with the same sampling settings as the local and
# curl examples.
chat_response = client.chat.completions.create(
    model='./Qwen2___5-3B-Instruct',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': '天空为什么是蓝色的？'},
    ],
    temperature=0.7,
    top_p=0.8,
    max_tokens=512,
    # repetition_penalty is not a named argument of the OpenAI SDK, so it is
    # sent through extra_body, which is merged into the request JSON.
    extra_body={'repetition_penalty': 1.05},
)

# Print the full response object returned by the server.
print('Qwen2推理结果:', chat_response)

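我们还可以通过 WebUI 访问我们部署的 API 服务（注意：下面的示例默认请求 `/generate` 接口，该接口由 `python -m vllm.entrypoints.api_server` 启动的服务提供，而不是上文的 OpenAI 兼容服务）：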

In [None]:
# Qwen2-vLLM-WebUI.py
import argparse
import json
 
import gradio as gr
import requests
 
 
def http_bot(prompt):
    """Stream generated text for ``prompt`` from the configured model endpoint.

    Sends a streaming POST request to ``args.model_url`` (the module-level
    command-line arguments) and yields, for every non-empty chunk received,
    the first entry of the chunk's ``"text"`` list.

    Args:
        prompt: The text typed by the user.

    Yields:
        The first element of ``data["text"]`` for each JSON chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {"User-Agent": "vLLM Client"}
    pload = {
        "prompt": prompt,
        "stream": True,
        "max_tokens": 128,
    }
    # The context manager releases the streamed connection even when the
    # consumer stops iterating early or an exception is raised.
    with requests.post(args.model_url,
                       headers=headers,
                       json=pload,
                       stream=True) as response:
        # Fail with a clear HTTP error instead of trying to parse an error body.
        response.raise_for_status()

        # Chunks are separated by NUL bytes; each one is a UTF-8 JSON document.
        for chunk in response.iter_lines(chunk_size=8192,
                                         decode_unicode=False,
                                         delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode("utf-8"))
                yield data["text"][0]
 
 
def build_demo():
    """Build and return the Gradio Blocks UI for the text-completion demo.

    The page has a heading, an input textbox and an output textbox.
    Submitting the input textbox calls ``http_bot`` with its content and
    routes the result to the output textbox.
    """
    with gr.Blocks() as ui:
        gr.Markdown("# vLLM text completion demo\n")
        prompt_box = gr.Textbox(
            label="Input",
            placeholder="Enter text and press ENTER",
        )
        result_box = gr.Textbox(
            label="Output",
            placeholder="Generated result from the model",
        )
        # Wire the submit event: input textbox -> http_bot -> output textbox.
        prompt_box.submit(fn=http_bot, inputs=[prompt_box], outputs=[result_box])
    return ui
 
 
if __name__ == "__main__":
    # Command-line options: where the web UI listens (--host/--port) and which
    # model endpoint http_bot posts to (--model-url).
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default=None, type=str)
    parser.add_argument("--port", default=8001, type=int)
    parser.add_argument("--model-url", default="http://0.0.0.0:8000/generate", type=str)
    # `args` stays module-level because http_bot reads args.model_url.
    args = parser.parse_args()

    demo = build_demo()
    # Enable request queuing, then start the web server.
    demo.queue()
    demo.launch(share=True, server_name=args.host, server_port=args.port)

## GPU 多卡部署和推理大模型

我们可以通过`tensor_parallel_size`参数启用 GPU 多卡分布式并行推理能力，提高大模型推理的吞吐量。

In [None]:
# Qwen2-vLLM-Local.py
import os
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
 
# Set VLLM_TARGET_DEVICE to 'cuda' for this process.
os.environ['VLLM_TARGET_DEVICE'] = 'cuda'

# Directory that holds the downloaded model weights.
model_dir = './Qwen2___5-3B-Instruct'

# Load the tokenizer from the local directory only.
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)

# Conversation: a system instruction followed by the user's question.
messages = [
    dict(role='system', content='You are a helpful assistant.'),
    dict(role='user', content='天空为什么是蓝色的？'),
]
# Render the conversation with the model's chat template as plain text
# (tokenize=False) and append the generation prompt for the assistant turn.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Build the engine on CUDA with tensor parallelism across 4 GPUs.
llm = LLM(model=model_dir, tensor_parallel_size=4, device='cuda')

# Sampling settings; generation is capped at 512 new tokens.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=512,
)

# Run inference on a one-element batch of prompts.
outputs = llm.generate([text], sampling_params)

# Print each prompt together with its first generated completion.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f'Prompt提示词: {prompt!r}, 大模型推理输出: {generated_text!r}')

同样地，也可以通过 `--tensor-parallel-size` 参数部署并发布 API 服务：

In [None]:
# Start an API server for the local model with tensor parallel size 4.
# NOTE(review): this launches vllm.entrypoints.api_server, whereas the earlier
# deployment commands use vllm.entrypoints.openai.api_server. Confirm which
# server is intended here.
python -m vllm.entrypoints.api_server --model ./Qwen2___5-3B-Instruct --tensor-parallel-size 4