In [4]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

In [None]:


# Path to the base model checkpoint.
base_model = "./pretrained_models/Qwen/Qwen2.5-0.5B-Instruct"
# Path to the LoRA adapter weights.
lora_path = "./models weights/chat_model/Qwen2.5-0.5B-Instruct/lora/train_2025-03-10-15-22-21"

# Initialise the engine with LoRA support enabled.
llm = LLM(
    model=base_model,
    enable_lora=True,
    max_lora_rank=64,    # the only place max_lora_rank has to be set
    max_loras=2,         # number of LoRAs that may be loaded at the same time
    max_cpu_loras=2,     # number of LoRAs kept in CPU memory
    tensor_parallel_size=1,
    # Optional engine settings, currently disabled:
    # gpu_memory_utilization=0.8,
    # enforce_eager=True,
    # max_model_len=4096,
    # quantization="awq",
)

# Describe the adapter to apply at generation time.
# The arguments are passed positionally (name, integer id, adapter path): the
# `lora_local_path` keyword is deprecated in vLLM in favour of `lora_path`, and
# the positional form is the one shown in the vLLM LoRA documentation, so it
# does not depend on which keyword the installed version accepts.
lora_request = LoRARequest(
    "kurisu_lora",  # custom name used to tell different adapters apart
    1,              # integer id of this adapter
    lora_path,
)

# Sampling configuration used for every generation call.
sampling_params = SamplingParams(
    temperature=0.8,
    top_k=50,
    top_p=0.9,
    max_tokens=512,
    frequency_penalty=1.2,     # discourage repetition
    stop=["\n###", "</end>"],  # custom stop strings
    # skip_special_tokens=True,
)



In [3]:
# Load the character introduction as a list of lines (line endings are kept).
with open("./character info/角色介绍.txt", encoding="utf-8") as intro_file:
    intro = list(intro_file)

# Load the character settings the same way.
with open("./character info/角色设定.txt", encoding="utf-8") as chara_file:
    chara = list(chara_file)

In [None]:
import time
def _as_prompt_text(value):
    """Return ``value`` in a form that can be embedded in the prompt.

    A list or tuple (for example the result of ``file.readlines()``) is joined
    into a single string and stripped of surrounding whitespace, so the prompt
    contains the text itself rather than a Python list representation.
    Any other value is returned unchanged.
    """
    if isinstance(value, (list, tuple)):
        return "".join(str(part) for part in value).strip()
    return value


def _build_multiturn_prompt(system_prompt, history, query):
    """Build the prompt string for a conversation with prior turns.

    Args:
        system_prompt: Text placed in the leading ``system`` segment.
        history: Iterable of ``(user_message, assistant_message)`` pairs.
        query: The new user message.

    Returns:
        The full prompt, ending with an opened ``assistant`` segment for the
        model to complete.
    """
    segments = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    for user_msg, assistant_msg in history:
        segments.append(f"<|im_start|>user\n{user_msg}<|im_end|>\n")
        segments.append(f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n")
    segments.append(f"<|im_start|>user\n{query}<|im_end|>\n")
    segments.append("<|im_start|>assistant\n")
    return "".join(segments)


def chat_with_history(query, sampling_params, lora_request, introduction="", chara="", history=None):
    """Generate one assistant reply and record the turn in ``history``.

    Uses the module-level ``llm`` object, which must already exist when this
    function is called.

    Args:
        query: The user's message for this turn.
        sampling_params: Sampling configuration forwarded to ``llm.generate``.
        lora_request: LoRA request forwarded to ``llm.generate``.
        introduction: Character introduction, either a string or a list/tuple
            of lines.
        chara: Character settings, either a string or a list/tuple of lines.
        history: List of previous ``[query, answer]`` turns. It is extended in
            place. When omitted, a new empty list is created for this call.

    Returns:
        A ``(history, answer)`` tuple: the updated history list and the
        generated text.
    """
    # A fresh list per call: a mutable default would be shared by every call
    # that omits `history`, leaking turns between unrelated conversations.
    if history is None:
        history = []

    introduction = _as_prompt_text(introduction)
    chara = _as_prompt_text(chara)
    system_prompt = f"你输出的文本只能是台词，不能有其他描述。角色介绍:{introduction}。角色设定:{chara}\n"

    prompt = _build_multiturn_prompt(system_prompt, history, query)
    outputs = llm.generate(
        prompt,
        sampling_params,
        lora_request=lora_request,  # key step: apply the LoRA adapter
    )
    # Single prompt in, so take the first completion of the first result.
    answer = outputs[0].outputs[0].text
    history.append([query, answer])
    return history, answer

def chat(sampling_params, lora_request, introduction_path="./角色介绍.txt"):
    """Run an interactive console chat until the user types ``exit``.

    Each entered line is sent to ``chat_with_history`` together with the
    accumulated history, and the reply is printed. The character settings are
    taken from the module-level ``chara`` variable, which must already be
    defined when the first message is sent.

    Args:
        sampling_params: Sampling configuration forwarded to
            ``chat_with_history``.
        lora_request: LoRA request forwarded to ``chat_with_history``.
        introduction_path: Path of the UTF-8 text file holding the character
            introduction.

    Returns:
        The last generated answer, or ``None`` if the user exited before
        sending any message.
    """
    # NOTE(review): the default path differs from "./character info/角色介绍.txt"
    # used by the earlier loading cell — confirm which location is intended.
    with open(introduction_path, "r", encoding="utf-8") as f:
        introduction = f.readlines()

    history = []
    # Stays None when the user exits immediately; without this initial value
    # the final `return answer` would raise UnboundLocalError in that case.
    answer = None
    while True:
        query = input("请输入 (exit退出): ")
        if query.lower() == "exit":
            break
        history, answer = chat_with_history(query, sampling_params, lora_request, introduction, chara, history)
        print(answer)
        time.sleep(1)
    return answer
# Start the interactive session; this blocks on input() until "exit" is entered.
chat(sampling_params, lora_request)