In [2]:
import pandas as pd
import json
import random

# Path to the course spreadsheet.
file_path = "data.xlsx"  # replace with the actual file path

# Load the course table. The loop below reads "curriculumName" plus every
# column named in the templates.
df = pd.read_excel(file_path)

# Question templates, one tuple per question type:
#   (category, question template, source column, expected behavior, answer template)
# "这门课" in a question template is the placeholder that gets replaced by the
# course name. The column value "无具体答案" is a sentinel meaning the question
# has no concrete answer in the table.
templates = [
    ("单跳属性", "这门课多少学分？", "credit", "精确属性检索", "{course}课程的学分为 {value} 学分。"),
    ("单跳属性", "这门课总学时是多少？", "period", "精确属性检索", "{course}课程的总学时为 {value} 学时。"),
    ("单跳属性", "这门课的考核方式是什么？", "examCategory", "精确属性检索", "{course}课程的考核方式为 {value}。"),

    ("二跳推理", "这门课的教材有哪些？", "basicTextbook", "链路推理", "{course}课程的教材包括：{value}。"),
    ("二跳推理", "这门课的参考书有哪些？", "bibliography", "链路推理", "{course}课程的参考书包括：{value}。"),

    ("模糊表达", "介绍下这门课程的内容？", "curriculumDescription", "模糊表达识别", "{course}课程的主要内容是：{value}"),

    ("章节内容", "这门课的知识目标是什么？", "curriculumTarget", "章节内容提取", "{course}课程的知识目标是：{value}"),

    ("无效或泛问", "这课难不难？", "无具体答案", "泛问处理", "无具体答案")
]

# Build the test set: every template is instantiated once per spreadsheet row.
test_data = []
for _, row in df.iterrows():
    course_name = row["curriculumName"]

    for category, question_template, column, behavior, answer_template in templates:
        if column == "无具体答案":
            # Sentinel column: there is nothing to look up in the row.
            expected_answer = "无具体答案"
        else:
            # Absent columns, NaN cells and empty strings all become "未知".
            value = row.get(column, "未知")
            if pd.isna(value) or value == "":
                value = "未知"

            expected_answer = answer_template.format(course=course_name, value=value)

        # Build the question for EVERY template. Previously this statement sat
        # inside the else-branch, so the no-answer template silently reused the
        # question left over from the preceding template. A template without the
        # "这门课" placeholder (the vague one) is emitted verbatim.
        question = question_template.replace("这门课", str(course_name))

        test_data.append({
            "category": category,
            "question": question,
            "expected_behavior": behavior,
            "expected_answer": expected_answer
        })

# Save the test set as UTF-8 JSON, keeping the Chinese text unescaped.
output_path = "test_dataset.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)

print(f"测试数据集已生成并保存为：{output_path}")


测试数据集已生成并保存为：test_dataset.json


In [3]:
import pandas as pd
import json

# Path to the spreadsheet with one row per knowledge point.
file_path = "/media/zhjk/rmx/medical/tcm_graph/KGLLM/data/course_chapter.xlsx"

# Load the table. The code below reads the columns "curriculum", "unit",
# "knowledgePointName" and "knowledgePointConcept".
df = pd.read_excel(file_path)

# Question templates, one tuple per question type:
#   (category, question template, source column, expected behavior, answer template)
templates = [
    ("知识点概念", "{name}的概念是什么？", "knowledgePointConcept", "知识点概念查询", "{name}的概念是：{value}"),
    ("章节内容", "{curriculum}课程中的章节有哪些？", "unit", "章节内容查询", "{curriculum}课程中的章节包括：{value}"),
    ("知识点列表", "{curriculum}课程中的知识点有哪些？", "knowledgePointName", "知识点列表示例", "{curriculum}课程中的知识点包括：{value}"),
]

# Collect, once, the distinct units and knowledge points of every course.
# Dicts are used as insertion-ordered sets so the answers keep spreadsheet
# order. Missing cells are skipped and every value is stringified, so the
# later ", ".join(...) cannot fail on NaN or numeric cells.
course_values = {"unit": {}, "knowledgePointName": {}}
for _, row in df.iterrows():
    course_key = str(row["curriculum"])
    for source_column, per_course in course_values.items():
        seen = per_course.setdefault(course_key, {})
        cell = row[source_column]
        if not pd.isna(cell):
            seen[str(cell)] = None

# Build the test set.
test_data = []
seen_courses = set()
for _, row in df.iterrows():
    curriculum = str(row["curriculum"])
    # Course-level questions have the same text and answer on every row of a
    # course, so they are emitted only the first time the course is seen.
    first_row_of_course = curriculum not in seen_courses
    seen_courses.add(curriculum)

    for category, question_template, column, behavior, answer_template in templates:
        if column == "knowledgePointConcept":
            # One question per knowledge point; a row without a name cannot
            # produce a meaningful question.
            if pd.isna(row["knowledgePointName"]):
                continue
            knowledge_point = str(row["knowledgePointName"])
            concept = row["knowledgePointConcept"]
            # Same placeholder the course test set uses for missing values.
            value = "未知" if pd.isna(concept) else str(concept)
            question = question_template.format(name=knowledge_point)
            expected_answer = answer_template.format(name=knowledge_point, value=value)

        elif column in course_values:
            if not first_row_of_course:
                continue
            # The question asks for ALL chapters / knowledge points of the
            # course, so the answer is the aggregated list, not one row's cell.
            value = ", ".join(course_values[column][curriculum])
            question = question_template.format(curriculum=curriculum)
            expected_answer = answer_template.format(curriculum=curriculum, value=value)

        else:
            # Fail loudly rather than append a stale question/answer pair.
            raise ValueError(f"unsupported template column: {column}")

        test_data.append({
            "category": category,
            "question": question,
            "expected_behavior": behavior,
            "expected_answer": expected_answer
        })

# Save the test set as UTF-8 JSON, keeping the Chinese text unescaped.
output_path = "knowledge_point_test_dataset.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)

print(f"测试数据集已生成并保存为：{output_path}")


测试数据集已生成并保存为：knowledge_point_test_dataset.json


In [None]:
import json
import os

# Path of the test set to evaluate. This is the file name the knowledge-point
# generator cell writes, relative to the notebook's working directory, so the
# two cells agree without depending on a machine-specific absolute path.
test_data_path = "knowledge_point_test_dataset.json"

# Stand-in for the system under test.
def generate_response(question):
    """Return a canned reply in place of a real system answer.

    The ``question`` argument is accepted but not used: every call yields the
    same fixed placeholder string. Replace the body with a call to the real
    system interface when one is available.
    """
    placeholder_reply = "模拟回答：无法提供真实回答。"
    return placeholder_reply

# Evaluation routine.
def evaluate_test_data(test_data_path, respond=None):
    """Run every question of a JSON test set through a responder and report metrics.

    Args:
        test_data_path: Path to a UTF-8 JSON file holding a list of objects with
            the keys "question", "expected_answer" and "category".
        respond: Optional callable mapping a question string to an answer
            string. Defaults to the module-level ``generate_response``.

    Returns:
        A dict with the counters "total", "correct", "matched", "missed" and the
        ratios "accuracy", "match_rate", "miss_rate". All ratios are 0.0 for an
        empty test set.

    Notes:
        "matched" counts answers equal to the expected answer. "missed" counts
        answers containing the placeholder text "无法提供真实回答"; "correct"
        counts every other answer, i.e. it means "answered", not "verified
        right". "accuracy" is derived from "correct".
    """
    if respond is None:
        respond = generate_response

    # Load the test set.
    with open(test_data_path, "r", encoding="utf-8") as f:
        test_data = json.load(f)

    total_questions = len(test_data)
    correct_answers = 0
    matched_answers = 0
    missed_answers = 0

    for item in test_data:
        question = item["question"]
        expected_answer = item["expected_answer"]
        category = item["category"]

        actual_answer = respond(question)
        is_match = actual_answer == expected_answer

        if is_match:
            matched_answers += 1

        # The placeholder text marks a question the system could not answer.
        if "无法提供真实回答" in actual_answer:
            missed_answers += 1
        else:
            correct_answers += 1

        print(f"【类别】: {category}")
        print(f"【问题】: {question}")
        print(f"【期望回答】: {expected_answer}")
        print(f"【实际回答】: {actual_answer}")
        print(f"【匹配】: {'✅' if is_match else '❌'}\n")

    def _rate(count):
        # An empty test set would otherwise raise ZeroDivisionError.
        return count / total_questions if total_questions else 0.0

    accuracy = _rate(correct_answers)
    match_rate = _rate(matched_answers)
    miss_rate = _rate(missed_answers)

    # Print the summary.
    print("\n=== 测评结果 ===")
    print(f"总问题数量: {total_questions}")
    print(f"正确回答数量: {correct_answers}")
    print(f"完全匹配数量: {matched_answers}")
    print(f"未命中数量: {missed_answers}")
    print(f"准确率 (Accuracy): {accuracy:.2f}")
    print(f"匹配率 (Match Rate): {match_rate:.2f}")
    print(f"未命中率 (Miss Rate): {miss_rate:.2f}")

    return {
        "total": total_questions,
        "correct": correct_answers,
        "matched": matched_answers,
        "missed": missed_answers,
        "accuracy": accuracy,
        "match_rate": match_rate,
        "miss_rate": miss_rate,
    }

# Run the evaluation, but only when the test set file is actually present.
if __name__ == "__main__":
    dataset_missing = not os.path.exists(test_data_path)
    if dataset_missing:
        # Report the missing file instead of letting open() raise.
        print(f"测试数据集文件未找到：{test_data_path}")
    else:
        evaluate_test_data(test_data_path)
