In [None]:
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Both locations live under the same Qwen2 workspace checkout.
_QWEN2_ROOT = '/home/ubuntu/public/wdq_workspace/Qwen2'
# Base model weights (also used as the tokenizer source).
mode_path = f'{_QWEN2_ROOT}/models/qwen/Qwen2-7B'
# LoRA adapter checkpoint loaded on top of the base model.
lora_path = f'{_QWEN2_ROOT}/examples/sft/output_qwen/checkpoint-1080'

In [None]:
def test(mode_path, lora_path, prompt, content, max_new_tokens=None):
    """Load the base model plus a LoRA adapter and print one chat completion.

    Args:
        mode_path: Location of the base model; passed to both the tokenizer
            and the model ``from_pretrained`` calls.
        lora_path: Location of the LoRA adapter passed to
            ``PeftModel.from_pretrained``.
        prompt: Text used as the ``system`` message.
        content: Text used as the ``user`` message.
        max_new_tokens: Optional cap on the number of generated tokens. When
            ``None`` (the default) it is not passed to ``generate`` at all.

    Returns:
        None. The decoded completion (prompt tokens stripped, special tokens
        skipped) is printed.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
    # Load the base model in half precision and switch it to eval mode
    model = AutoModelForCausalLM.from_pretrained(
        mode_path,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    ).eval()
    # Attach the LoRA weights
    model = PeftModel.from_pretrained(model, model_id=lora_path)

    chat = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]
    # The model is placed by device_map="auto", so follow the model's own
    # device instead of assuming a hard-coded 'cuda'.
    inputs = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Sampling restricted to the single most likely token (top_k=1).
    gen_kwargs = {"do_sample": True, "top_k": 1}
    if max_new_tokens is not None:
        gen_kwargs["max_new_tokens"] = max_new_tokens

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_length = inputs['input_ids'].shape[1]
    outputs = outputs[:, prompt_length:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# System message: sets the assistant persona.
prompt = "你是一个专业医生，以专业的知识回答用户"
# User message: the question sent to the model.
content = "请问您能提供一些关于腰椎间盘突出症病例的分析和治疗效果的建议吗？"

In [None]:
# Run one LoRA-adapted generation; the answer is printed by test().
test(
    mode_path=mode_path,
    lora_path=lora_path,
    prompt=prompt,
    content=content,
)

### vLLM推理

离线推理

In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


# MODEL_PATH was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Default it to the base-model path from the
# configuration cell above.
# NOTE(review): the LoRA adapter in `lora_path` is NOT applied on this code
# path -- point MODEL_PATH at a merged checkpoint if the fine-tuned model is
# what should be served.
MODEL_PATH = mode_path

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,  # penalty on repeated phrases: > 1 reduces repetition, 1 applies no penalty
    max_tokens=1024,
)

llm = LLM(
    MODEL_PATH,
    max_model_len=2048,  # lower this value to reduce memory usage
    gpu_memory_utilization=0.9,  # the larger the value, the more GPU memory is used
)

In [None]:
# Deploy the GPTQ-quantized model (disabled; uncomment to use instead of the LLM above).
# NOTE(review): QUA_MODEL_PATH is not defined in this notebook -- set it before enabling.
# llm = LLM(
#     QUA_MODEL_PATH,
#     quantization="gptq",
#     max_model_len=2048,  # lower this value to reduce memory usage
#     gpu_memory_utilization=0.9 
# )

In [None]:
# User question for the offline vLLM run.
prompt = "请问在治疗神经源性环咽肌失迟缓症的病例中，导尿管球囊扩张术的具体效果如何？"

system_message = {"role": "system", "content": "你是一个专业医生，以专业的知识回答用户。"}
user_message = {"role": "user", "content": prompt}
messages = [system_message, user_message]

# Render the conversation to a plain string (tokenize=False) that ends with the
# generation prompt, ready to be handed to llm.generate().
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
# Run offline generation for the single rendered chat prompt.
outputs = llm.generate([text], sampling_params)

# Loop-local names are kept distinct from the notebook-level `prompt` (the user
# question): the original loop overwrote it with the rendered template text,
# leaving stale state behind for any cell re-run afterwards.
for request_output in outputs:
    rendered_prompt = request_output.prompt
    completion_text = request_output.outputs[0].text
    print(f"Prompt: {rendered_prompt!r},\nGenerated text: {completion_text!r}")

API接口推理

In [None]:
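# Run this in a terminal before executing the next cell, which expects an
# OpenAI-compatible endpoint at http://localhost:8000/v1
# (presumably this script starts the vLLM API server -- confirm):
# ./run_vllm.sh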

In [None]:
from openai import OpenAI

# Client for the locally served OpenAI-compatible endpoint; the key is a placeholder value.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_messages = [
    {"role": "system", "content": "你是一个专业医生，以专业的知识回答用户。"},
    {"role": "user", "content": "请问在治疗神经源性环咽肌失迟缓症的病例中，导尿管球囊扩张术的具体效果如何？"},
]

# Same sampling settings as the offline run, minus the repetition penalty.
completion = client.chat.completions.create(
    model="qwen-1.5-7b-lora",
    messages=chat_messages,
    temperature=0.7,
    top_p=0.8,
    max_tokens=1024,
)

# Only the first choice's message text is shown.
response = completion.choices[0].message.content
print(response)