In [None]:
import json

# Input file paths
all_terms_file = 'Benchmark/license_data/license_llm/term/result/merged_terms.json'
manual_check_file = 'Benchmark/license_data/license_llm/term/result/terms_manul.json'

# Output file path
output_file = 'Benchmark/license_data/license_llm/term/result/terms_dataset.json'

# Sentinel string the input files use when a model produced no answer.
_NO_RESPONSE = "None"

# Order in which a manual-check entry is searched for the reviewed answer.
_MANUAL_RESPONSE_KEYS = ('deepseek_response', 'mistral_large_response')


def _load_json(path):
    """Read and parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def build_manual_check_index(manual_check_data):
    """Index manual-check entries by ``(license_name, term)``.

    Args:
        manual_check_data: Iterable of dicts, each with a ``license_name`` and
            a ``term_info`` dict that carries at least a ``term`` key.

    Returns:
        Dict mapping ``(license_name, term)`` to the entry's ``term_info``.
        If a key occurs more than once, the last entry wins.
    """
    return {
        (entry['license_name'], entry['term_info']['term']): entry['term_info']
        for entry in manual_check_data
    }


def _pick_manual_response(manual_result):
    """Return the first usable response dict in a manual-check entry.

    A plain ``a or b`` is not enough here: the "no answer" sentinel is the
    truthy string "None", so it would be selected and then fail on
    ``['marker']``. Only non-empty dicts count as usable.

    Returns:
        The chosen response dict, or ``None`` when no key holds one.
    """
    for key in _MANUAL_RESPONSE_KEYS:
        candidate = manual_result.get(key)
        if isinstance(candidate, dict) and candidate:
            return candidate
    return None


def _select_response(license_name, term_info, manual_check_dict):
    """Choose the response that should represent one term.

    Rules:
      * neither model answered -> nothing is recorded (silently);
      * any answer that is present is not a dict -> the term is reported on
        stdout and skipped;
      * exactly one model answered -> that answer is used;
      * both answered with the same marker -> the DeepSeek answer is used;
      * both answered with different markers -> the manual-check entry for
        ``(license_name, term)`` decides; a missing or unusable entry is
        reported on stdout and the term is skipped.

    Returns:
        The selected response dict, or ``None`` when the term is skipped.
    """
    term = term_info['term']
    deepseek_response = term_info['deepseek_response']
    mistral_large_response = term_info['mistral_large_response']

    available = [
        response
        for response in (deepseek_response, mistral_large_response)
        if response != _NO_RESPONSE
    ]
    if not available:
        return None

    # Validate every answer that is present, including the single-model case,
    # so a malformed entry is reported instead of raising TypeError.
    if not all(isinstance(response, dict) for response in available):
        print(license_name, term)
        return None

    if len(available) == 1:
        return available[0]

    # Both models answered: identical markers need no arbitration.
    if deepseek_response['marker'] == mistral_large_response['marker']:
        return deepseek_response

    # The models disagree: defer to the manual check.
    manual_check_key = (license_name, term)
    if manual_check_key not in manual_check_dict:
        print(f"No manual check found for {license_name} - {term}")
        return None

    manual_response = _pick_manual_response(manual_check_dict[manual_check_key])
    if manual_response is None:
        print(f"No valid response found in manual check for {license_name} - {term}")
    return manual_response


def build_terms_dataset(all_terms_data, manual_check_dict):
    """Merge per-model term annotations into one record per term.

    Args:
        all_terms_data: Iterable of dicts with ``license_name`` and ``terms``;
            each term dict has ``term``, ``deepseek_response`` and
            ``mistral_large_response``.
        manual_check_dict: Mapping produced by ``build_manual_check_index``.

    Returns:
        List with one ``{"license_name": ..., "terms": [...]}`` dict per input
        license, in input order. Each kept term is a dict with ``term``,
        ``marker`` and ``explanation``. A license whose terms were all skipped
        still appears, with an empty ``terms`` list.
    """
    final_output = []
    for license_info in all_terms_data:
        license_name = license_info['license_name']
        records = []
        for term_info in license_info['terms']:
            response = _select_response(license_name, term_info, manual_check_dict)
            if response is None:
                continue
            records.append({
                "term": term_info['term'],
                "marker": response['marker'],
                "explanation": response['explanation']
            })
        final_output.append({
            "license_name": license_name,
            "terms": records
        })
    return final_output


# True both for `python file.py` and inside a notebook kernel, so running the
# cell still performs the full pipeline and defines the same top-level names.
if __name__ == "__main__":
    # Load the model outputs for every term and the manual review file.
    all_terms_data = _load_json(all_terms_file)
    manual_check_data = _load_json(manual_check_file)

    # Lookup table for fast access to the manual review results.
    manual_check_dict = build_manual_check_index(manual_check_data)

    final_output = build_terms_dataset(all_terms_data, manual_check_dict)

    # Write the merged dataset to the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, indent=4)

    print("Processing complete. Final output file generated.")


Processing complete. Final output file generated.
