# In [None]:
import re
import json
import os
import csv

# --- Path configuration -------------------------------------------------
# Inputs: the CSV of term patterns, the JSON index of licenses to process,
# and the directory holding one JSON file per license.
terms_pattern = "Benchmark/license_data/license_info/terms_pattern.csv"
license_filename = "Benchmark/license_data/license_extract/has_spdx_spdx.json"
license_dir_path = "Benchmark/license_data/license_json"

# Output: a single JSON file with the extracted terms of every license.
output_json_path = "Benchmark/license_data/license_extract/license_terms_spdx3.json"

# Mapping of term name -> regex pattern string; filled in from the CSV.
terms_patterns = {}

# Read the CSV file and fill `terms_patterns` with one regex per term.
# Each row must provide a "Term" column (the term name) and a "Pattern"
# column (regex alternatives separated by "|"); the pattern is wrapped in a
# single capturing group. A missing column header still raises KeyError.
# newline='' is what the csv module requires so that it can handle line
# endings (including ones embedded in quoted fields) itself.
with open(terms_pattern, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # DictReader fills the missing fields of a short row with None.
        term = (row["Term"] or "").strip()
        pattern = (row["Pattern"] or "").strip()

        # Skip incomplete rows: an empty pattern would become "()", which
        # matches every piece of text and would flag the whole license.
        if not term or not pattern:
            continue

        # "|" is already regex alternation, so the pattern is used as-is.
        terms_patterns[term] = fr"({pattern})"
        
        
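# # Earlier hard-coded version of terms_patterns (term keyword -> regex pattern),
# # kept for reference; the live mapping is now read from the CSV file above.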
# terms_patterns = {
#     "Distribute": r"(distribute|distribution|redistribute)",
#     "Modify": r"(modify|change|modification|derivative works)",
#     "Commercial Use": r"(commercial use|sell|offer of sale|resale|use for commercial purpose)",
#     "Hold Liable": r"(liability|liable|without any warranty|responsibility)",
#     "Include Copyright": r"(copyright|retain copyright)",
#     "Include License": r"(include license|license text|copy this License|permission notice)"
# }


# Make sure the directory of the output file exists before doing any work.
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

# Load the JSON index that lists the licenses to process.
with open(license_filename, 'r', encoding='utf-8') as f:
    license_sum = json.load(f)

# Compile every term pattern once, case-insensitively, instead of handing
# the pattern string to re.search for every sentence of every license.
compiled_patterns = {
    term: re.compile(pattern, re.IGNORECASE)
    for term, pattern in terms_patterns.items()
}

extracted_sum = []

# Process each license listed in the index.
for item in license_sum:
    # Index entries are named "<name>.license..."; the text lives in "<name>.json".
    filename = item['license_summary']['license_information']['filename'].split('.license')[0]
    license_file_path = os.path.join(license_dir_path, filename + '.json')

    # Read the license body and split it on "." — the units scanned below
    # are therefore sentence-like fragments, not physical lines.
    with open(license_file_path, 'r', encoding='utf-8') as f:
        license_all = json.load(f)
    license_text = license_all['license_body'].split('.')

    # For every term, record the 1-based indices of the matching fragments
    # ("lines") and their stripped text, each followed by ". \n\n" ("content").
    terms = {}
    for term, compiled in compiled_patterns.items():
        hits = [
            (number, fragment)
            for number, fragment in enumerate(license_text, start=1)
            if compiled.search(fragment)
        ]
        terms[term] = {
            "lines": [number for number, _ in hits],
            "content": "".join(fragment.strip() + ". \n\n" for _, fragment in hits),
        }

    extracted_terms = {
        "license_name": filename + '.license',
        "terms": terms,
    }
    extracted_sum.append(extracted_terms)

print(extracted_sum)

# Save the extracted terms of all licenses into a single JSON file.
with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(extracted_sum, json_file, ensure_ascii=False, indent=4)

print(f"Terms have been extracted and saved to {output_json_path}")
