# In [None]:
import json
import re

# Raw response logs, one file per model. The merge step further down reads
# each file line by line and parses every line as a JSON record.
mistral_re = "Benchmark/license_data/license_llm/term/logging/2024-10-28.log"
deepseek_re = "Benchmark/license_data/license_llm/term/logging/2024-10-29-deepseek.log"

# Destination file for the merged per-license term results.
merged_terms = "Benchmark/license_data/license_llm/term/result/merged_terms.json"

def parse_json_string(json_string):
    """Parse a JSON payload that may be wrapped in a Markdown code fence.

    Surrounding whitespace is stripped first, then an opening fence
    (``` optionally followed by a case-insensitive ``json`` tag) and a
    closing fence (```) are removed before the text is handed to
    ``json.loads``.

    Args:
        json_string: The raw text to parse. Non-string values are returned
            unchanged so that already-decoded data passes through.

    Returns:
        The decoded JSON value, or the cleaned (fence-stripped, trimmed)
        text when it is not valid JSON.
    """
    if not isinstance(json_string, str):
        return json_string

    # Strip before matching so a fence followed by trailing whitespace is
    # still recognised as being at the end of the text.
    cleaned = json_string.strip()
    # Only touch fences at the very start/end; a "```json" sequence inside
    # the payload itself is left alone.
    cleaned = re.sub(r'^```(?:json)?', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'```$', '', cleaned).strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the cleaned text so the caller still gets the content.
        return cleaned

# Load the DeepSeek and Mistral Large response logs (one JSON object per line).
def _read_json_lines(path):
    """Return the JSON records stored one per line in the file at *path*."""
    # Explicit UTF-8 keeps decoding independent of the platform default
    # (assumes the logs were written as UTF-8 -- TODO confirm).
    with open(path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing empty line) rather than letting
        # json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]


deepseek_data = _read_json_lines(deepseek_re)
mistral_data = _read_json_lines(mistral_re)

# Merge both logs into one record per license, with one slot per term.
merged_data = {}
for entry in deepseek_data + mistral_data:
    license_name = entry["license_name"]
    term = entry["license_terms"]
    # An entry carries exactly one model's response; pick whichever key it has.
    response_key = "deepseek_response" if "deepseek_response" in entry else "mistral_large_response"
    response_value = parse_json_string(entry[response_key])

    # Create the per-license and per-term structures on first sight.
    license_record = merged_data.setdefault(
        license_name, {"license_name": license_name, "terms": {}}
    )
    # The string "None" is the placeholder for a model that has no response
    # recorded for this term.
    term_record = license_record["terms"].setdefault(
        term,
        {"term": term, "deepseek_response": "None", "mistral_large_response": "None"},
    )

    # Store this model's response; a later entry for the same license/term
    # and model overwrites an earlier one.
    term_record[response_key] = response_value

# Convert to the target layout: a list of licenses, each with a list of terms.
final_data = []
for license_info in merged_data.values():
    # Replaces the term dict with a list in place, so merged_data is changed too.
    license_info["terms"] = list(license_info["terms"].values())
    final_data.append(license_info)

# Write the merged result to the output JSON file.
with open(merged_terms, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, indent=4)

