In [None]:
import requests
import json
import re
import os
import pandas as pd
import logging
import time
from openai import OpenAI


def Prompt_Gen(case):
    """Build the license-term classification prompt for one case.

    Args:
        case: Mapping that provides the keys ``license_name``,
            ``license_info`` (itself a mapping with a ``category`` key),
            ``license_terms``, ``terms_description``, ``content_lines`` and
            ``license_content``. Each value is interpolated into the prompt
            with its default string formatting.

    Returns:
        The complete prompt text as a single string.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    # Pull the fields out first (same order as they appear in the template)
    # so the template itself only references short local names.
    license_name = case['license_name']
    category = case['license_info']['category']
    term_name = case['license_terms']
    term_description = case['terms_description']
    relevant_lines = case['content_lines']
    excerpts = case['license_content']

    prompt = f"""## Instruction:

### Task Description:
You are assigned to determine and interpret the presence and meaning of a specific licensing term within a portion of an open-source software license. Instead of reviewing the entire license, you will be provided with only those sections that are directly relevant to the specified term, along with some contextual license information. Your goal is to analyze these excerpts and identify whether they address the term and, if so, the term's implications for users according to predefined categories.

To classify the term's meaning, you may only use the following Simplified Markers: CAN, CANNOT, MUST, MUST NOT, OPTIONAL, UNCLEAR, and NOT SPECIFIED. Your analysis should strictly adhere to this set of markers without any additional terminology to ensure consistency. If the meaning is ambiguous, or if the term is absent, you must still choose one of the above markers and provide a supporting explanation using the provided license content only.

### Simplified Markers for License Terms

#### Empowering Clauses: For terms that grant permissions (e.g., Distribute, Modify, Commercial Use, Relicense, Hold Liable, Use Patent Claims, Sublicense, Use Trademark), use the following markers:
- CAN: If the license explicitly permits the action.
- MUST NOT: If the license explicitly prohibits the action.

#### Responsibility Clauses: For terms that impose obligations (e.g., Include Copyright, Disclose Source, Give Credit, Rename, Contact Author), use these markers:
- MUST: If the license mandates the action or condition.
- OPTIONAL: If the license suggests the action but does not require it.

#### Ambiguous Cases: If the term is mentioned but lacks clear implications regarding permissions or obligations, or if it is entirely absent, use:
- NOT SPECIFIED: Indicating that the license does not explicitly address the term or its implications.

## Input:
The following structured information will be provided to help determine the status of the term in the license:

- License Information:
  - License Name: {license_name}.
  - License Category: {category}.

- Specific Term to Evaluate:
  - Term Name: {term_name}.
  - Term Description: {term_description}.

- License Content for Analysis:
  - Relevant Lines: {relevant_lines}.
  - Content Excerpts: {excerpts}.


## Output:

### Expected Output Format:
Generate a JSON object with the following keys:
- term: The name of the evaluated term.
- marker: One of the following values: "CAN", "MUST", "MUST NOT", "OPTIONAL", or "NOT SPECIFIED".
- explanation: A brief statement containing the line number and a relevant phrase or keyword from the license content that supports the marker selection.

### Example Output:
```json
{{
  "term": "Modify",
  "marker": "CAN",
  "explanation": "12: 'may modify and distribute derivative works.'"
}}
```

### Notes:
- Ensure that the marker strictly adheres to one of the specified values.
- The explanation should focus on providing a clear, concise justification for the marker using specific references from the license content.

"""
    return prompt


def Modeling(prompt):
    """Send ``prompt`` to the DeepSeek chat model and return the reply text.

    The API key is read from the ``DEEPSEEK_API_KEY`` environment variable
    instead of being embedded in the source, so the secret is never committed
    alongside the code.
    NOTE(review): the key that used to be hard-coded here should be treated
    as leaked and revoked.

    Args:
        prompt: Text sent as the user message of the chat completion.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        RuntimeError: If ``DEEPSEEK_API_KEY`` is unset or empty.
    """
    # Fail early with an actionable message rather than sending an
    # unauthenticated request.
    api_key = os.environ.get("DEEPSEEK_API_KEY")
    if not api_key:
        raise RuntimeError(
            "DEEPSEEK_API_KEY environment variable is not set; "
            "export it before running this script."
        )

    client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a license analysis expert. Follow instructions precisely to interpret the license terms."},
            {"role": "user", "content": prompt}
        ],
        stream=False
    )

    # Dump the raw response object (ids, token usage, ...) for debugging.
    print(response)

    return response.choices[0].message.content

def process_a_case(license_case):
    """Classify one (license, term) case and append the result to the log.

    Builds the prompt with ``Prompt_Gen``; when the UTF-8 encoded prompt is
    larger than 128000 bytes the case is dropped without writing anything.
    Otherwise the model is queried through ``Modeling`` (unless the case has
    an empty ``license_content``, in which case the literal string ``'None'``
    is recorded as the response) and one JSON line is appended to the file
    named by the module-level ``log_path``.

    Side effects:
        Increments the module-level ``case_num`` for every model call,
        prints progress to stdout, and appends to ``log_path``.

    Args:
        license_case: Mapping with at least ``license_name``,
            ``license_terms`` and ``license_content``, plus whatever
            ``Prompt_Gen`` reads.
    """
    global case_num

    name = license_case['license_name']
    term = license_case['license_terms']
    print(f"Processing license: {name}. Term: {term}     $$$ PLEASE WAITING $$$")

    prompt = Prompt_Gen(license_case)

    # Oversized prompts are skipped entirely (nothing is logged for them).
    prompt_size = len(prompt.encode('utf-8'))
    if prompt_size > 128000:
        return

    if license_case['license_content'] == '':
        # Nothing to analyse: record a placeholder instead of calling the model.
        response = 'None'
    else:
        # Ask the LLM for its verdict; only real model calls are counted.
        response = Modeling(prompt)
        case_num += 1

    record = json.dumps(
        {
            "license_name": name,
            'license_terms': term,
            "response": response,
        },
        ensure_ascii=False,
    )

    print(record)

    # One JSON object per line so the log can be re-read line by line.
    with open(log_path, 'a+', encoding='utf-8') as log_file:
        log_file.write(record + '\n')


def is_license_processed(license_name, license_terms, log_path):
    """Check whether a (license, term) pair already has an entry in the log.

    The log is read as one JSON object per line. Only lines that, after
    stripping whitespace, start with ``{`` and end with ``}`` are parsed;
    every other line is ignored. Lines that look like JSON but fail to parse
    are reported on stdout and skipped.

    Args:
        license_name: Value to match against the entry's ``license_name``.
        license_terms: Value to match against the entry's ``license_terms``.
        log_path: Path of the log file to scan.

    Returns:
        True if a matching entry exists; False if none matches or the log
        file does not exist.
    """
    if not os.path.exists(log_path):
        return False

    with open(log_path, 'r', encoding="utf-8") as log_file:
        # Read the file line by line, parsing each single-line JSON object.
        for line in log_file:
            line = line.strip()

            # Anything that is not a complete one-line JSON object is ignored.
            if not (line.startswith("{") and line.endswith("}")):
                continue

            try:
                json_data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Failed to decode JSON: {e}")
                continue

            if json_data.get("license_name") == license_name and json_data.get("license_terms") == license_terms:
                return True

    return False


# Number of model calls made so far in this run (incremented by process_a_case).
case_num = 0

if __name__ == '__main__':
    # Upper bound on model calls per run; the script stops once it is reached.
    max_cases = 100

    # log_path is read as a module-level global by process_a_case.
    log_path = "/home/keqiang/Benchmark/license_data/license_llm/prompt/logging/2024-11-5-context.log"
    license_terms_spdx = '/home/keqiang/Benchmark/license_data/license_extract/license_terms_spdx.json'
    license_info_spdx = '/home/keqiang/Benchmark/license_data/license_extract/has_spdx_spdx.json'
    # Named terms_csv so it is not shadowed by the per-license `terms` dict below.
    terms_csv = '/home/keqiang/Benchmark/license_data/license_info/terms.csv'
    license_json = '/home/keqiang/Benchmark/license_data/license_json'

    with open(license_terms_spdx, mode='r', encoding="utf-8") as f:
        license_terms_data = json.load(f)

    with open(license_info_spdx, mode='r', encoding="utf-8") as f:
        license_info_data = json.load(f)

    df = pd.read_csv(terms_csv)

    for license_items in license_terms_data:
        license_name = license_items['license_name']
        terms = license_items['terms']

        # The per-license JSON file is named after the part before ".license".
        name = license_name.split('.license')[0]
        license_path = os.path.join(license_json, name + '.json')
        with open(license_path, mode='r', encoding="utf-8") as f:
            license_json_data = json.load(f)
        license_body = license_json_data['license_body']

        # The summary depends only on the license, so look it up once per
        # license rather than once per term.
        license_summary = next(
            (item["license_summary"] for item in license_info_data
             if item["license_summary"]["license_information"]["filename"] == license_name),
            None,
        )
        if license_summary is None:
            # Without this guard the subscript below fails with an opaque
            # "'NoneType' object is not subscriptable".
            print(f"Skipping {license_name}, no license information found.")
            continue
        license_info = license_summary['license_information']

        for license_terms, details in terms.items():
            # Skip pairs that already have an entry in the log file.
            if is_license_processed(license_name, license_terms, log_path):
                print(f"Skipping {license_name}, {license_terms}, already processed.")
                continue

            content_lines = details["lines"]
            license_content = details["content"]

            # `.values` yields an array; take the first match so the prompt
            # shows the description text itself rather than "['...']".
            # Falls back to an empty string when the term is not in the CSV.
            descriptions = df.loc[df['Term'] == license_terms, 'Description'].values
            terms_description = descriptions[0] if len(descriptions) else ''

            # Bundle the scan result and license information into one dict.
            license_case = {
                'license_info': license_info,               # file-level information about the license
                'license_name': license_name,               # license name, including the ".license" suffix
                'license_terms': license_terms,             # the specific term to classify
                'content_lines': content_lines,             # where the term is located in the license text
                'license_content': license_content,         # license sentences related to the term
                'terms_description': terms_description,     # explanation of the term
                'license_body': license_body
            }

            # `exit()` is an interactive-shell helper from the site module;
            # raise SystemExit directly so this also works without it.
            if case_num >= max_cases:
                raise SystemExit(0)

            process_a_case(license_case)




Processing license: bsd-zero.license. Term: Distribute     $$$ PLEASE WAITING $$$
ChatCompletion(id='7124211f-9a9b-4c79-9208-d0454c70aed5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n{\n  "term": "Distribute",\n  "marker": "CAN",\n  "explanation": "1: \'Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted.\'"\n}\n```', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1730821961, model='deepseek-chat', object='chat.completion', service_tier=None, system_fingerprint='fp_1c141eb703', usage=CompletionUsage(completion_tokens=59, prompt_tokens=764, total_tokens=823, completion_tokens_details=None, prompt_tokens_details=None, prompt_cache_hit_tokens=0, prompt_cache_miss_tokens=764))
{"license_name": "bsd-zero.license", "license_terms": "Distribute", "response": "```json\n{\n  \"term\": \"Distribute\",\n  \"marker\": \"C

KeyboardInterrupt: 

: 