In [None]:
import json
from collections import defaultdict


# Locations of the merged per-license LLM answers (input) and of the
# disagreement report produced by this script (output).
MERGED_TERMS_PATH = 'Benchmark/license_data/license_llm/term/result/merged_terms.json'
INCONSISTENT_TERMS_PATH = 'Benchmark/license_data/license_llm/term/result/inconsistent_terms.json'

# Marker comparison rules. Lookups try both orders of a pair (see
# classify_marker_pair), so listing a pair in one direction is sufficient.
similar_pairs = {("CAN", "MUST"), ("CANNOT", "MUST NOT"), ("MUST", "CAN"), ("MUST NOT", "CANNOT")}
strong_oppose_pairs = {
    ("CAN", "CANNOT"),
    ("MUST", "MUST NOT"),
    ("CAN", "MUST NOT"),
    ("MUST NOT", "CAN"),
    # This pair was missing, so a CANNOT-vs-MUST disagreement was left
    # uncategorized even though its mirror image (CAN vs MUST NOT) was
    # counted as strong opposition.
    ("CANNOT", "MUST"),
}


def extract_marker(response):
    """Return the "marker" value of a model response, or None.

    A response that is not a dict (for example a missing answer or a raw
    string) has no marker and yields None.
    """
    return response.get("marker") if isinstance(response, dict) else None


def classify_marker_pair(first_marker, second_marker):
    """Classify a pair of differing markers, ignoring their order.

    Returns:
        "similar" if the pair is listed in similar_pairs,
        "strong_oppose" if it is listed in strong_oppose_pairs,
        None if it appears in neither rule set.
    """
    pair = (first_marker, second_marker)
    if pair in similar_pairs or pair[::-1] in similar_pairs:
        return "similar"
    if pair in strong_oppose_pairs or pair[::-1] in strong_oppose_pairs:
        return "strong_oppose"
    return None


def collect_inconsistent_terms(data):
    """Find the terms on which the deepseek and mistral_large markers differ.

    Args:
        data: iterable of license dicts, each with a "license_name" and a
            "terms" list; every term dict has a "term" name and may carry
            "deepseek_response" / "mistral_large_response" entries.

    Returns:
        A tuple (inconsistent_terms, term_count, similar, strong_oppose):
        the list of {"license_name", "term_info"} records for disagreeing
        terms, a defaultdict counting disagreements per term name, and the
        number of disagreements classified as similar / strongly opposed.
    """
    inconsistent = []
    per_term = defaultdict(int)
    verdict_totals = {"similar": 0, "strong_oppose": 0}

    # Walk every license and every term it defines.
    for license_info in data:
        license_name = license_info["license_name"]

        for term_info in license_info["terms"]:
            term_name = term_info["term"]

            # .get() treats an absent response like an unusable one instead
            # of aborting the whole run with a KeyError.
            deepseek_marker = extract_marker(term_info.get("deepseek_response"))
            mistral_marker = extract_marker(term_info.get("mistral_large_response"))

            # Only terms where both models gave a marker can disagree.
            if not (deepseek_marker and mistral_marker):
                continue
            if deepseek_marker == mistral_marker:
                continue

            inconsistent.append({
                "license_name": license_name,
                "term_info": term_info,
            })

            verdict = classify_marker_pair(deepseek_marker, mistral_marker)
            if verdict is not None:
                verdict_totals[verdict] += 1

            per_term[term_name] += 1

    return inconsistent, per_term, verdict_totals["similar"], verdict_totals["strong_oppose"]


# Script section. It stays at module level (not inside a function) so the
# results remain available as globals, e.g. to later notebook cells.
if __name__ == "__main__":
    # Read the merged answers; JSON is UTF-8, so do not depend on the locale.
    with open(MERGED_TERMS_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    inconsistent_terms, term_count, similar, strong_oppose = collect_inconsistent_terms(data)

    # Report the disagreement statistics.
    print(f"Total inconsistent terms: {len(inconsistent_terms)}")
    print(f"similar: {similar}, strong_oppose: {strong_oppose}")

    for key, value in term_count.items():
        print(f"{key}: {value}")

    with open(INCONSISTENT_TERMS_PATH, 'w', encoding='utf-8') as f:
        json.dump(inconsistent_terms, f, indent=4)


Total inconsistent terms: 379
similar: 135, strong_oppose: 60
Commercial Use: 15
Use Trademark: 130
Disclose Source: 51
Include License: 29
Use Patent Claims: 36
Include Copyright: 9
Sublicense: 5
Distribute: 18
Give Credit: 15
Rename: 18
Modify: 18
Hold Liable: 5
Relicense: 18
Statically Link: 1
Contact Author: 5
Include Install Instructions: 2
Private Use: 4
