In [6]:
import re
import json
import os
import requests
from threading import Thread


def load_prompts(path="prompts.json"):
    """Load evaluation prompt definitions and index them by category.

    Args:
        path: Path to a UTF-8 encoded JSON file containing a list of prompt
            objects, each of which has a "category" key.

    Returns:
        A dict mapping each prompt's "category" value to the complete prompt
        object. When several prompts share a category, the last one wins.
    """
    with open(path, "r", encoding="utf-8") as prompt_file:
        entries = json.load(prompt_file)
    return {entry["category"]: entry for entry in entries}

In [None]:
def get_eval_prompt(question, answer, question_type, reference, prompts):
    """Build the (system prompt, user prompt) pair used to grade one answer.

    Args:
        question: Text of the question that was asked.
        answer: The model response being graded.
        question_type: One of "factual", "multiple_choice", "instruction",
            "roleplay" or "open_ended"; any other value is graded with the
            "knowledge" prompt.
        reference: Reference answer. Only used for multiple-choice prompts,
            where a falsy value is substituted with an empty string.
        prompts: Dict of prompt definitions keyed by category; each entry
            provides "system_prompt" and "prompt_template".

    Returns:
        A tuple ``(system_prompt, prompt_text)``.
    """
    category_by_type = dict(
        factual="knowledge",
        multiple_choice="multiple_choice",
        instruction="instruction",
        roleplay="roleplay",
        open_ended="open_ended",
    )
    category = category_by_type.get(question_type, "knowledge")
    entry = prompts[category]
    system_text = entry["system_prompt"]

    # Only the multiple-choice template receives the reference answer.
    fields = {"question": question, "answer": answer}
    if category == "multiple_choice":
        fields["correct_answer"] = reference or ""
    return system_text, entry["prompt_template"].format(**fields)


def evaluate_answer(
    model, question, answer, question_type, reference, prompts, eval_model="gpt-4o"
):
    """Ask a judge model to grade one answer and extract its numeric rating.

    The judge is called through the DeerAPI chat-completions endpoint. Each
    successful exchange is appended to ``evaluations/{model}_eval_prompt.txt``.

    Args:
        model: Name of the model whose answer is being graded; only used to
            name the transcript file.
        question: Text of the question that was asked.
        answer: The response being graded.
        question_type: Question category, forwarded to ``get_eval_prompt``.
        reference: Reference answer, forwarded to ``get_eval_prompt``.
        prompts: Prompt definitions keyed by category.
        eval_model: Name of the judge model sent in the request payload.

    Returns:
        A tuple ``(score, text)``. ``score`` is the integer found inside
        ``[[...]]`` in the judge's reply, or ``None`` when no such rating is
        present or the evaluation failed. ``text`` is the judge's reply, or a
        failure message when the evaluation failed.
    """
    sys_prompt, eval_prompt = get_eval_prompt(
        question, answer, question_type, reference, prompts
    )

    # Credentials must not live in the notebook. The value is sent verbatim as
    # the Authorization header, exactly like the previously hard-coded key.
    api_key = os.environ.get("DEERAPI_API_KEY")
    if not api_key:
        return None, "评估失败: 未设置环境变量 DEERAPI_API_KEY"

    payload = json.dumps(
        {
            "model": eval_model,
            "messages": [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": eval_prompt},
            ],
        }
    )
    headers = {
        "Accept": "application/json",
        "Authorization": api_key,
        "User-Agent": "DeerAPI/1.0.0 (https://api.deerapi.com)",
        "Content-Type": "application/json",
    }
    try:
        resp = requests.post(
            "https://api.deerapi.com/v1/chat/completions",
            headers=headers,
            data=payload,
            timeout=60,
        )
        # Surface HTTP errors (401, 429, 5xx, ...) with a meaningful message
        # instead of an opaque KeyError on the missing "choices" field.
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"]

        # Make sure the transcript directory exists so a direct call does not
        # throw away a valid evaluation just because logging failed.
        os.makedirs("evaluations", exist_ok=True)
        with open(f"evaluations/{model}_eval_prompt.txt", "a", encoding="utf-8") as f:
            f.write(f"问题: \n{question}\n")
            f.write(f"回答: \n{answer}\n")
            f.write(f"评估提示: \n{eval_prompt}\n")
            f.write(f"评估结果: \n{content}\n\n")

        # The prompts ask the judge to emit its rating as [[N]].
        score_match = re.search(r"\[\[(\d+)\]\]", content)
        score = int(score_match.group(1)) if score_match else None
        return score, content
    except Exception as e:
        # Best effort: a failed evaluation is reported to the caller rather
        # than aborting the whole evaluation run.
        return None, f"评估失败: {e}"


def evaluate_model_result(model, prompts, eval_model="gpt-4.1"):
    """Grade every saved response of one model and persist the graded records.

    Reads ``result/{model}_results.json`` (one JSON object per non-blank
    line), grades each record with ``evaluate_answer``, adds
    "evaluation_score" and "evaluation_text" to the record, and writes all
    records to ``evaluations/{model}_evaluation.json``, one JSON object per
    line. Progress is printed after each record.

    Args:
        model: Name of the model whose results are graded; determines the
            input and output file names.
        prompts: Prompt definitions keyed by category.
        eval_model: Name of the judge model passed to ``evaluate_answer``.
    """
    os.makedirs("evaluations", exist_ok=True)

    # Parse the whole input up front so malformed data fails before any
    # evaluation request is made.
    records = []
    with open(f"result/{model}_results.json", "r", encoding="utf-8") as src:
        for raw_line in src:
            if raw_line.strip():
                records.append(json.loads(raw_line))

    graded = []
    for record in records:
        question = record.get("question")
        answer = record.get("response")
        question_type = record.get("question_type", "factual")

        # A list-valued reference contributes only its first element.
        reference = record.get("reference")
        if isinstance(reference, list):
            reference = reference[0]

        score, eval_text = evaluate_answer(
            model, question, answer, question_type, reference, prompts, eval_model
        )
        record["evaluation_score"] = score
        record["evaluation_text"] = eval_text
        graded.append(record)
        print(f"模型{model} 问题{record.get('question_id')} 评分: {score}")

    with open(f"evaluations/{model}_evaluation.json", "w", encoding="utf-8") as dst:
        dst.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in graded
        )
    print(f"{model} 评估完成，结果已保存到 evaluations/{model}_evaluation.json")

In [None]:
models = [
    "gpt-4o-mini",
    "gpt-3.5-turbo",
    "deepseek-chat",
    "claude-3-5-haiku-20241022",
    "gemini-2.5-flash-preview-04-17",
]

# Pre-flight check: every record of every model must carry a response before
# the evaluation run is started.
nice = True
for model in models:
    with open(f"result/{model}_results.json", "r", encoding="utf-8") as f:
        # Skip blank lines, the same way evaluate_model_result parses these
        # files; otherwise a trailing empty line raises JSONDecodeError here.
        results = [json.loads(line) for line in f if line.strip()]
    for result in results:
        answer = result.get("response")
        if answer is None:
            nice = False
            print(f"模型{model} 问题{result.get('question_id')} 没有回答")

if nice:
    print("所有模型都有回答，可以开始评估")

所有模型都有回答，可以开始评估


In [9]:
# Load the evaluation prompt definitions (a dict keyed by prompt category)
# from prompts.json and print the whole mapping for inspection.
prompts = load_prompts("prompts.json")
print(prompts)

{'knowledge': {'name': 'knowledge-v1', 'type': 'single', 'system_prompt': 'You are an expert evaluator with deep domain knowledge.', 'prompt_template': "[Instruction]\nYou are evaluating the response to a factual knowledge question. Please assess the accuracy, completeness, and clarity of the answer. Consider whether all key facts are correctly presented, whether the answer addresses all aspects of the question, and whether the explanation is clear and well-structured. After your evaluation, rate the response on a scale of 1-10 by providing your rating in this format: [[rating]].\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", 'description': 'Prompt for evaluating factual knowledge questions', 'category': 'knowledge', 'output_format': '[[rating]]'}, 'multiple_choice': {'name': 'multiple-choice-v1', 'type': 'single', 'system_prompt': 'You are an objective evaluator for multiple-choice questions.', 'prompt_template': "[Instructi

In [12]:
# Evaluate all models concurrently: one worker thread per model, each running
# evaluate_model_result with the shared prompt definitions.
threads = [
    Thread(target=evaluate_model_result, args=(name, prompts)) for name in models
]

# Start every worker before waiting on any of them so the runs overlap.
for thread in threads:
    thread.start()

# Block until all evaluations have finished.
for thread in threads:
    thread.join()

模型gemini-2.5-flash-preview-04-17 问题101 评分: 9
模型gpt-3.5-turbo 问题101 评分: 9
模型deepseek-chat 问题101 评分: 8
模型claude-3-5-haiku-20241022 问题101 评分: 9
模型gpt-3.5-turbo 问题102 评分: 10
模型deepseek-chat 问题102 评分: 10
模型gemini-2.5-flash-preview-04-17 问题102 评分: 9
模型claude-3-5-haiku-20241022 问题102 评分: 10
模型deepseek-chat 问题103 评分: 10
模型gpt-3.5-turbo 问题103 评分: 10
模型gemini-2.5-flash-preview-04-17 问题103 评分: 9
模型claude-3-5-haiku-20241022 问题103 评分: 9
模型deepseek-chat 问题104 评分: 9
模型deepseek-chat 问题105 评分: 9
模型gpt-3.5-turbo 问题104 评分: 9
模型claude-3-5-haiku-20241022 问题104 评分: 9
模型gpt-3.5-turbo 问题105 评分: 9
模型claude-3-5-haiku-20241022 问题105 评分: 10
模型claude-3-5-haiku-20241022 问题201 评分: 10
模型gemini-2.5-flash-preview-04-17 问题104 评分: 9
模型gpt-3.5-turbo 问题201 评分: 10
模型deepseek-chat 问题201 评分: 10
模型gemini-2.5-flash-preview-04-17 问题105 评分: 10
模型deepseek-chat 问题202 评分: 9
模型gemini-2.5-flash-preview-04-17 问题201 评分: 10
模型gpt-3.5-turbo 问题202 评分: 10
模型claude-3-5-haiku-20241022 问题202 评分: 8
模型deepseek-chat 问题203 评分: 10
模型claude-3-5-haik