In [3]:
import re
import json
import os
import requests
from threading import Thread


def load_prompts(path="prompts.json"):
    """Load prompt definitions from a JSON file and index them by category.

    The file content is iterated entry by entry and each entry is looked up
    by its "category" key. When several entries share a category, the last
    one in the file wins.

    Args:
        path: Location of the JSON prompt file.

    Returns:
        dict mapping each entry's "category" value to the whole entry.
    """
    with open(path, "r", encoding="utf-8") as fh:
        entries = json.load(fh)
    return {entry["category"]: entry for entry in entries}

In [None]:
def get_eval_prompt(question, answer, question_type, reference, prompts):
    """Build the (system prompt, user prompt) pair used to judge one answer.

    Args:
        question: The question text.
        answer: The answer under evaluation.
        question_type: One of factual, multiple_choice, instruction,
            roleplay, open_ended. Any other value is handled like factual.
        reference: Correct answer; only inserted into the multiple_choice
            template. A falsy value is replaced by an empty string.
        prompts: Mapping from prompt category to an entry that provides
            "system_prompt" and "prompt_template".

    Returns:
        Tuple ``(system_prompt, filled_prompt_text)``.
    """
    # Only "factual" is stored under a differently named category ("knowledge").
    category_by_type = {
        "factual": "knowledge",
        "multiple_choice": "multiple_choice",
        "instruction": "instruction",
        "roleplay": "roleplay",
        "open_ended": "open_ended",
    }
    category = category_by_type.get(question_type, "knowledge")
    entry = prompts[category]

    # Every template takes question/answer; multiple choice also needs the key.
    fields = {"question": question, "answer": answer}
    if category == "multiple_choice":
        fields["correct_answer"] = reference or ""
    return entry["system_prompt"], entry["prompt_template"].format(**fields)


def evaluate_answer(
    model, question, answer, question_type, reference, prompts, eval_model="gpt-4o"
):
    """Ask the judge model to grade one answer and extract its numeric rating.

    The prompt pair from ``get_eval_prompt`` is posted to the DeerAPI chat
    completions endpoint (60 second timeout). The exchange is appended to
    ``evaluations/{model}_eval_prompt.txt``; that log is best-effort and a
    failure to write it does not discard the evaluation.

    The API key is read from the ``DEERAPI_API_KEY`` environment variable so
    the real credential does not have to live in the notebook; the literal
    placeholder is only used when the variable is unset.

    Args:
        model: Name of the model whose answer is graded (used for the log file name).
        question: The question text.
        answer: The answer under evaluation.
        question_type: Question type forwarded to ``get_eval_prompt``.
        reference: Reference answer forwarded to ``get_eval_prompt``.
        prompts: Prompt mapping forwarded to ``get_eval_prompt``.
        eval_model: Name of the judge model sent in the request.

    Returns:
        ``(score, content)`` where ``content`` is the judge's reply and
        ``score`` is the integer inside the first ``[[N]]`` marker, or None
        when the reply has no such marker. On any request or parsing failure
        returns ``(None, "评估失败: <error>")``.
    """
    sys_prompt, eval_prompt = get_eval_prompt(
        question, answer, question_type, reference, prompts
    )
    payload = json.dumps(
        {
            "model": eval_model,
            "messages": [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": eval_prompt},
            ],
        }
    )
    headers = {
        "Accept": "application/json",
        # NOTE(review): the key is sent verbatim, without a "Bearer " prefix;
        # confirm this is what the API expects.
        "Authorization": os.environ.get(
            "DEERAPI_API_KEY",
            "sk-************************************************",
        ),
        "User-Agent": "DeerAPI/1.0.0 (https://api.deerapi.com)",
        "Content-Type": "application/json",
    }
    try:
        resp = requests.post(
            "https://api.deerapi.com/v1/chat/completions",
            headers=headers,
            data=payload,
            timeout=60,
        )
        # Surface HTTP errors (401, 429, 5xx) with their status instead of a
        # bare KeyError on the missing "choices" field.
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"]

        # The transcript log must not cost us a result we already have.
        try:
            os.makedirs("evaluations", exist_ok=True)
            with open(
                f"evaluations/{model}_eval_prompt.txt", "a", encoding="utf-8"
            ) as f:
                f.write(f"问题: \n{question}\n")
                f.write(f"回答: \n{answer}\n")
                f.write(f"评估提示: \n{eval_prompt}\n")
                f.write(f"评估结果: \n{content}\n\n")
        except OSError as log_err:
            print(f"评估日志写入失败: {log_err}")

        # The judge is instructed to answer with its rating as [[N]].
        score_match = re.search(r"\[\[(\d+)\]\]", content)
        score = int(score_match.group(1)) if score_match else None
        return score, content
    except Exception as e:
        return None, f"评估失败: {e}"


def evaluate_model_result(model, prompts, eval_model="gpt-4.1"):
    """Grade every saved answer of one model and write the graded records out.

    Reads ``result/{model}_results.json`` as JSON Lines (blank lines are
    skipped), grades each record with ``evaluate_answer``, stores the outcome
    under "evaluation_score" / "evaluation_text" on the record, and finally
    writes all records to ``evaluations/{model}_evaluation.json``, one JSON
    object per line.

    Args:
        model: Model name; selects both the input and the output file.
        prompts: Prompt mapping forwarded to ``evaluate_answer``.
        eval_model: Judge model name forwarded to ``evaluate_answer``.
    """
    os.makedirs("evaluations", exist_ok=True)
    with open(f"result/{model}_results.json", "r", encoding="utf-8") as f:
        results = [json.loads(line) for line in f if line.strip()]
    eval_results = []
    for item in results:
        question = item.get("question")
        answer = item.get("response")
        question_type = item.get("question_type", "factual")
        # A list-valued reference contributes its first element; an empty
        # list means "no reference" rather than an IndexError.
        reference = item.get("reference")
        if isinstance(reference, list):
            reference = reference[0] if reference else None
        score, eval_text = evaluate_answer(
            model, question, answer, question_type, reference, prompts, eval_model
        )
        item["evaluation_score"] = score
        item["evaluation_text"] = eval_text
        eval_results.append(item)
        print(f"模型{model} 问题{item.get('question_id')} 评分: {score}")
        # No delay between requests; add a sleep here if the API rate-limits.
    with open(f"evaluations/{model}_evaluation.json", "w", encoding="utf-8") as f:
        for item in eval_results:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"{model} 评估完成，结果已保存到 evaluations/{model}_evaluation.json")

In [4]:
# Models whose saved answers are evaluated by the rest of the notebook.
models = [
    "gpt-4o-mini",
    "gpt-3.5-turbo",
    "deepseek-chat",
    "claude-3-5-haiku-20241022",
    "gemini-2.5-flash-preview-04-17",
]

# Pre-flight check: every record of every model must carry a response before
# any evaluation requests are sent.
nice = True
for model in models:
    with open(f"result/{model}_results.json", "r", encoding="utf-8") as f:
        # Skip blank lines so a trailing newline does not break json.loads.
        results = [json.loads(line) for line in f if line.strip()]
    for result in results:
        answer = result.get("response")
        if answer is None:
            nice = False
            print(f"模型{model} 问题{result.get('question_id')} 没有回答")

if nice:
    print("所有模型都有回答，可以开始评估")

所有模型都有回答，可以开始评估


In [9]:
# Load the judge prompts (indexed by category) and echo them as a sanity check.
prompts = load_prompts(path="prompts.json")
print(prompts)

{'knowledge': {'name': 'knowledge-v1', 'type': 'single', 'system_prompt': 'You are an expert evaluator with deep domain knowledge.', 'prompt_template': "[Instruction]\nYou are evaluating the response to a factual knowledge question. Please assess the accuracy, completeness, and clarity of the answer. Consider whether all key facts are correctly presented, whether the answer addresses all aspects of the question, and whether the explanation is clear and well-structured. After your evaluation, rate the response on a scale of 1-10 by providing your rating in this format: [[rating]].\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", 'description': 'Prompt for evaluating factual knowledge questions', 'category': 'knowledge', 'output_format': '[[rating]]'}, 'multiple_choice': {'name': 'multiple-choice-v1', 'type': 'single', 'system_prompt': 'You are an objective evaluator for multiple-choice questions.', 'prompt_template': "[Instructi

In [12]:
# Evaluate all models in parallel, one worker thread per model.
threads = [
    Thread(target=evaluate_model_result, kwargs={"model": name, "prompts": prompts})
    for name in models
]

# Start every worker before waiting on any of them so the runs overlap.
for thread in threads:
    thread.start()

# Block until all evaluations have finished.
for thread in threads:
    thread.join()

模型gemini-2.5-flash-preview-04-17 问题101 评分: 9
模型gpt-3.5-turbo 问题101 评分: 9
模型deepseek-chat 问题101 评分: 8
模型claude-3-5-haiku-20241022 问题101 评分: 9
模型gpt-3.5-turbo 问题102 评分: 10
模型deepseek-chat 问题102 评分: 10
模型gemini-2.5-flash-preview-04-17 问题102 评分: 9
模型claude-3-5-haiku-20241022 问题102 评分: 10
模型deepseek-chat 问题103 评分: 10
模型gpt-3.5-turbo 问题103 评分: 10
模型gemini-2.5-flash-preview-04-17 问题103 评分: 9
模型claude-3-5-haiku-20241022 问题103 评分: 9
模型deepseek-chat 问题104 评分: 9
模型deepseek-chat 问题105 评分: 9
模型gpt-3.5-turbo 问题104 评分: 9
模型claude-3-5-haiku-20241022 问题104 评分: 9
模型gpt-3.5-turbo 问题105 评分: 9
模型claude-3-5-haiku-20241022 问题105 评分: 10
模型claude-3-5-haiku-20241022 问题201 评分: 10
模型gemini-2.5-flash-preview-04-17 问题104 评分: 9
模型gpt-3.5-turbo 问题201 评分: 10
模型deepseek-chat 问题201 评分: 10
模型gemini-2.5-flash-preview-04-17 问题105 评分: 10
模型deepseek-chat 问题202 评分: 9
模型gemini-2.5-flash-preview-04-17 问题201 评分: 10
模型gpt-3.5-turbo 问题202 评分: 10
模型claude-3-5-haiku-20241022 问题202 评分: 8
模型deepseek-chat 问题203 评分: 10
模型claude-3-5-haik

以下统计所有模型的分数

In [12]:
# Dump every graded record per model, followed by the model's mean score.
for model in models:
    print(f"模型{model}评估结果：")
    with open(f"evaluations/{model}_evaluation.json", "r", encoding="utf-8") as f:
        results = [json.loads(line) for line in f if line.strip()]
    tot = 0
    scored = 0
    for result in results:
        category = result.get("category")
        # Named qtype (not type) so the builtin is not shadowed in later cells.
        qtype = result.get("question_type")
        score = result.get("evaluation_score")
        print(category, qtype, score, sep='\t')
        # Failed evaluations are stored with a None score; leave them out of
        # the mean instead of crashing on int + None.
        if score is not None:
            tot += score
            scored += 1
    if scored:
        avg = tot / scored
        print(f"模型{model}平均分：{avg:.2f}\n")
    else:
        print(f"模型{model}平均分：无有效评分\n")

模型gpt-4o-mini评估结果：
computer_history	factual	8
computer_history	multiple_choice	8
computer_history	instruction	9
computer_history	factual	8
computer_history	multiple_choice	10
discrete_math	multiple_choice	10
discrete_math	factual	10
discrete_math	instruction	10
discrete_math	multiple_choice	9
discrete_math	factual	9
programming	instruction	6
programming	factual	9
programming	multiple_choice	10
programming	instruction	10
programming	multiple_choice	9
artificial_intelligence	factual	9
artificial_intelligence	instruction	10
artificial_intelligence	multiple_choice	10
artificial_intelligence	instruction	9
artificial_intelligence	multiple_choice	8
computer_systems	factual	9
computer_systems	multiple_choice	7
computer_systems	instruction	9
computer_systems	factual	9
computer_systems	multiple_choice	9
模型gpt-4o-mini平均分：8.96

模型gpt-3.5-turbo评估结果：
computer_history	factual	9
computer_history	multiple_choice	10
computer_history	instruction	10
computer_history	factual	9
computer_history	multiple_cho

In [10]:
from collections import defaultdict

def _format_score_summary(label, scores):
    """Render one tab-separated statistics line for a non-empty list of scores."""
    avg = sum(scores) / len(scores)
    ge8 = sum(1 for s in scores if s >= 8)
    ratio = ge8 / len(scores) * 100
    return (
        f"{label}\t题数:{len(scores)}\t平均分:{avg:.2f}\t最高分:{max(scores)}"
        f"\t最低分:{min(scores)}\t≥8分比例:{ratio:.1f}%"
    )


def stat_model_scores(model):
    """Print score statistics for one model, per question type and overall.

    Reads ``evaluations/{model}_evaluation.json`` as JSON Lines (blank lines
    are skipped). Records whose "evaluation_score" is None are ignored. For
    each question type and for the total it prints the count, mean, maximum,
    minimum and the share of scores that are at least 8.

    Args:
        model: Model name; selects the evaluation file to read.
    """
    print(f"模型 {model} 详细统计：")
    with open(f"evaluations/{model}_evaluation.json", "r", encoding="utf-8") as f:
        results = [json.loads(line) for line in f if line.strip()]
    type_scores = defaultdict(list)
    all_scores = []
    for result in results:
        score = result.get("evaluation_score")
        if score is not None:
            type_scores[result.get("question_type")].append(score)
            all_scores.append(score)
    # Per question type, in order of first appearance in the file.
    for qtype, scores in type_scores.items():
        print(_format_score_summary(qtype, scores))
    # Overall; with no scored record there is nothing to average.
    if all_scores:
        print(_format_score_summary("总计", all_scores) + "\n")
    else:
        print("总计\t题数:0\t无有效评分\n")

# Print the per-question-type statistics for every model in the list.
for model in models:
    stat_model_scores(model)

模型 gpt-4o-mini 详细统计：
factual	题数:8	平均分:8.88	最高分:10	最低分:8	≥8分比例:100.0%
multiple_choice	题数:10	平均分:9.00	最高分:10	最低分:7	≥8分比例:90.0%
instruction	题数:7	平均分:9.00	最高分:10	最低分:6	≥8分比例:85.7%
总计	题数:25	平均分:8.96	最高分:10	最低分:6	≥8分比例:92.0%

模型 gpt-3.5-turbo 详细统计：
factual	题数:8	平均分:9.25	最高分:10	最低分:9	≥8分比例:100.0%
multiple_choice	题数:10	平均分:9.20	最高分:10	最低分:7	≥8分比例:90.0%
instruction	题数:7	平均分:9.00	最高分:10	最低分:7	≥8分比例:85.7%
总计	题数:25	平均分:9.16	最高分:10	最低分:7	≥8分比例:92.0%

模型 deepseek-chat 详细统计：
factual	题数:8	平均分:8.88	最高分:10	最低分:8	≥8分比例:100.0%
multiple_choice	题数:10	平均分:8.80	最高分:10	最低分:3	≥8分比例:90.0%
instruction	题数:7	平均分:9.00	最高分:10	最低分:6	≥8分比例:85.7%
总计	题数:25	平均分:8.88	最高分:10	最低分:3	≥8分比例:92.0%

模型 claude-3-5-haiku-20241022 详细统计：
factual	题数:8	平均分:8.88	最高分:10	最低分:8	≥8分比例:100.0%
multiple_choice	题数:10	平均分:9.50	最高分:10	最低分:6	≥8分比例:90.0%
instruction	题数:7	平均分:9.00	最高分:10	最低分:6	≥8分比例:85.7%
总计	题数:25	平均分:9.16	最高分:10	最低分:6	≥8分比例:92.0%

模型 gemini-2.5-flash-preview-04-17 详细统计：
factual	题数:8	平均分:9.00	最高分:10	最低分:8	≥8分比例:100.0%
multiple_choic

In [11]:
# Field (category) key -> display name. The insertion order here is the
# column order of each table row printed by stat_model_fields.
category_map = {
    "computer_history": "计算机史",
    "discrete_math": "离散数学",
    "programming": "程序设计",
    "artificial_intelligence": "人工智能",
    "computer_systems": "计算机系统"
}

def stat_model_fields(model):
    """Print one markdown table row with the model's mean score per field.

    Reads ``evaluations/{model}_evaluation.json`` as JSON Lines (blank lines
    are skipped) and averages "evaluation_score" per "category", ignoring
    records whose score is None. The row starts with the model name and has
    one cell per key of ``category_map``; a field without any score is shown
    as 0.00.

    Only the table row is printed, so consecutive calls produce contiguous
    rows that form a valid markdown table under a header line.

    Args:
        model: Model name; selects the evaluation file to read.
    """
    with open(f"evaluations/{model}_evaluation.json", "r", encoding="utf-8") as f:
        results = [json.loads(line) for line in f if line.strip()]
    field_scores = defaultdict(list)
    for result in results:
        score = result.get("evaluation_score")
        if score is not None:
            field_scores[result.get("category")].append(score)
    # Build the markdown row: model name, then one mean per known field.
    row = [model]
    for cat_key in category_map:
        scores = field_scores.get(cat_key, [])
        avg = sum(scores) / len(scores) if scores else 0
        row.append(f"{avg:.2f}")
    print("| " + " | ".join(row) + " |")

# Table header and separator are derived from category_map so the columns
# always line up with the cells of each row.
print("| 模型/领域 | " + " | ".join(category_map.values()) + " |")
print("|" + "---|" * (len(category_map) + 1))
for model in models:
    stat_model_fields(model)

| 模型/领域 | 计算机史 | 离散数学 | 程序设计 | 人工智能 | 计算机系统 |
|---|---|---|---|---|---|
模型 gpt-4o-mini 分领域表现：
| gpt-4o-mini | 8.60 | 9.60 | 8.80 | 9.20 | 8.60 |
模型 gpt-3.5-turbo 分领域表现：
| gpt-3.5-turbo | 9.40 | 9.20 | 9.20 | 9.60 | 8.40 |
模型 deepseek-chat 分领域表现：
| deepseek-chat | 9.20 | 9.40 | 8.80 | 9.20 | 7.80 |
模型 claude-3-5-haiku-20241022 分领域表现：
| claude-3-5-haiku-20241022 | 9.40 | 9.60 | 9.00 | 9.40 | 8.40 |
模型 gemini-2.5-flash-preview-04-17 分领域表现：
| gemini-2.5-flash-preview-04-17 | 9.20 | 9.60 | 9.40 | 9.20 | 8.80 |
