In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
import re

# 假设你有一个自定义的大模型接口
from llm import get_llm_answers

model_name = "deepseek-chat"

###############################
# 1) 第一步：解析类 & 函数行号
###############################
def get_structure_prompt(code_text: str, program_language: str) -> str:
    """
    让大模型输出文件内的类、函数的嵌套结构和行号范围。
    """
    # 把代码行打包成 JSON，方便 LLM 在回答时索引
    lines = code_text.splitlines()
    lines_json = [
        {"line": i + 1, "code": line} for i, line in enumerate(lines)
    ]

    prompt = f"""
You are given a piece of {program_language} code. 
Your task: find all the nested classes and methods in the code, 
including their exact start_line and end_line.

Return a JSON with this structure:

{{
  "name": "example_script", 
  "type": "CFG",
  "start_line": 1,
  "end_line": {len(lines)},
  "functions": [
    {{
      "name": "function_name",
      "type": "function",
      "start_line": 10,
      "end_line": 20,
      "functions": [],
      "classes": []
    }}
  ],
  "classes": [
    {{
      "name": "class_name",
      "type": "class",
      "start_line": 30,
      "end_line": 50,
      "functions": [
        {{
          "name": "method_name",
          "type": "function",
          "start_line": 35,
          "end_line": 40,
          "functions": [],
          "classes": []
        }}
      ],
      "classes": []
    }}
  ]
}}

IMPORTANT:
1) If a function is nested in a class, put it in the class's "functions".
2) If a class is nested in another class, put it in "classes".
3) start_line/end_line must reflect the actual lines in the original code.
4) Do not omit or reorder fields. Use the exact structure above.

Code lines:
{json.dumps(lines_json, indent=2)}
"""
    return prompt


def find_structure(code_text: str, program_language="python"):
    """
    调用大模型获取文件中的类、函数及其行范围
    """
    prompt = get_structure_prompt(code_text, program_language)
    response = get_llm_answers(prompt, model_name=model_name, require_json=True)
    structure = json.loads(response)
    return structure


###############################
# 2) 第二步：为某段代码生成“树状”CFG
###############################
def get_tree_cfg_prompt(code: str, program_language: str, block_name: str) -> str:
    """
    让大模型基于一段完整的代码生成“树状”CFG，保留完整代码行，不做简化。
    successors 里直接嵌套 block 对象（非 ID）。
    """
    lines = code.splitlines()
    lines_json = [
        {"line": i + 1, "code": line} for i, line in enumerate(lines)
    ]
    # 示例 JSON 的字符串，直接内嵌到 Prompt 中
    example_json = """{
  "name": "my_block",
  "type": "CFG",
  "blocks": [
    {
      "id": 1,
      "start_line": 1,
      "end_line": 10,
      "successors": [
        {
          "id": 2,
          "start_line": 11,
          "end_line": 20,
          "successors": []
        }
      ]
    }
  ],
  "functions": [
    {
      "name": "my_function",
      "type": "CFG",
      "blocks": [
        {
          "id": 1,
          "start_line": 1,
          "end_line": 10,
          "successors": []
        }
      ],
      "functions": [],
      "classes": []
    }
  ],
  "classes": []
}"""

    prompt = f"""
You are given a piece of {program_language} code, and you must produce a CFG in **tree style**:
- The top-level JSON must be:
  {{
    "name": "{block_name}",
    "type": "CFG",
    "blocks": [...],
    "functions": [...],
    "classes": [...]
  }}
- "blocks" is an array of basic blocks. Each block is:
  {{
    "id": <int>,
    "start_line": <int>,
    "end_line": <int>,
    "successors": [ <nested block objects> ]
  }}
  i.e., the "successors" array must be the actual nested blocks, not integer IDs.
- "functions" is an array of nested function CFGs, each of which has the same structure 
  (with "blocks", "functions", "classes").
- "classes" likewise.

DO NOT remove or simplify any lines from the code. The label must keep the code exactly. 
If there's an 'if' or loop or branch, you can split it into multiple blocks, 
but the "successors" must be nested sub-blocks in tree form, 
and not references by ID.

Attention to these situations which might produce multiple blocks:
1) if/else/elif/else
2) for/while/else
3) try/except/else/finally
4) with/else
5) switch/case/default

Example output (shortened):
{example_json}

Now, here is the code you should analyze:
------------------------------------
{json.dumps(lines_json, indent=2)}
------------------------------------

Follow the structure carefully. Output only JSON.
"""
    return prompt

def get_code_by_line_range(code_block, code, block_type: str = "block"):
    """
    从 code 中提取 [start_line, end_line] 这一段代码，
    但会把本 block 内部子级的 class、function 的行号都排除掉，
    最终得到 label。
    """
    code_lines = code.splitlines()
    start_line = code_block["start_line"]
    # 因为 range() 的特性，end_line+1 可以覆盖到 end_line 那行
    end_line = code_block["end_line"]

    if block_type == "class" or block_type == "function":
        start_line += 1

    # 先构建所有行的集合
    line_set = set(range(start_line, end_line + 1))

    # 减去所有子函数行号范围
    for func in code_block.get("functions", []):
        func_start_line = func.get("start_line", 0)
        func_end_line = func.get("end_line", 0)
        line_set.difference_update(range(func_start_line, func_end_line))

    # 减去所有子类行号范围
    for cls in code_block.get("classes", []):
        cls_start_line = cls.get("start_line", 0)
        cls_end_line = cls.get("end_line", 0)
        line_set.difference_update(range(cls_start_line, cls_end_line+1))

    # 将剩下的行号排序
    ordered_lines = sorted(list(line_set))
    # 拼接
    sum_code = "\n".join([code_lines[i - 1] for i in ordered_lines])

    sum_code = re.sub(r'\n+', '\n', sum_code).strip()

    # 存到 code_block 里
    code_block["label"] = sum_code

def build_tree_cfg_for_block(block_info: dict, code_text: str, program_language: str, block_type: str = "block") -> dict:
    # 先对这个 block 做“行号剥离”操作
    get_code_by_line_range(block_info, code_text, block_type)
    full_code_segment = block_info["label"]

    # 让大模型生成 CFG
    block_name = block_info.get("name", "anonymous_block")
    prompt = get_tree_cfg_prompt(full_code_segment, program_language, block_name)
    response_text = get_llm_answers(prompt, model_name=model_name, require_json=True)
    partial_cfg = json.loads(response_text)

    # 递归处理 partial_cfg["blocks"] 及其所有 successors
    def update_block_tree(block: dict, local_code: str):
        get_code_by_line_range(block, local_code)
        for succ in block.get("successors", []):
            update_block_tree(succ, local_code)

    for b in partial_cfg.get("blocks", []):
        update_block_tree(b, full_code_segment)

    # 继续递归处理子函数、子类
    final_functions = []
    for func_info in block_info.get("functions", []):
        sub_cfg = build_tree_cfg_for_block(func_info, code_text, program_language, "function")
        final_functions.append(sub_cfg)

    final_classes = []
    for cls_info in block_info.get("classes", []):
        sub_cfg = build_tree_cfg_for_block(cls_info, code_text, program_language, "class")
        final_classes.append(sub_cfg)

    partial_cfg["functions"] = final_functions
    partial_cfg["classes"] = final_classes

    return partial_cfg

#########################
# 3) 处理单个文件
#########################
def process_file(file_path: str, program_language="python") -> dict:
    """
    - 读取文件内容
    - 第1步：解析嵌套结构(类 / 函数及其行号)
    - 第2步：将顶层解析成 CFG，再递归处理子函数、子类，得到树状结果
    """
    with open(file_path, "r", encoding="utf-8") as f:
        code_text = f.read()

    # (A) 获取文件整体结构
    structure = find_structure(code_text, program_language)
    # structure 里是最顶层 { name, type, start_line, end_line, functions, classes }

    # (B) 构建树状 CFG
    # 注意：最顶层也算一个 block_info
    top_cfg = build_tree_cfg_for_block(structure, code_text, program_language)

    return top_cfg


#########################
# 4) 批量处理
#########################
def main():
    # 输出路径
    output_dir = "tree_cfg_output"
    os.makedirs(output_dir, exist_ok=True)

    # 假设源代码都在 source_code/ 下
    source_dir = "source_code"
    files = os.listdir(source_dir)

    # 这里仅示例处理前 50 个 .py 文件
    py_files = [f for f in files if f.endswith(".py")][:5]

    def process_single_file(filename):
        input_path = os.path.join(source_dir, filename)
        output_path = os.path.join(output_dir, filename.replace(".py", ".json"))
        # if os.path.exists(output_path):
        #     print(f"Skipping {filename}, already processed.")
        #     return

        print(f"Processing {filename} ...")

        final_cfg = process_file(input_path, "python")
        # 写入 JSON
        with open(output_path, "w", encoding="utf-8") as wf:
            json.dump(final_cfg, wf, indent=2)

    process_single_file("60.py")

    # 并行处理
    # with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    #     executor.map(process_single_file, py_files)

if __name__ == "__main__":
    main()

Processing 60.py ...


In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import re
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor

# 假设你有一个自定义的大模型接口
from llm import get_llm_answers

model_name = "gpt-4o"
###############################
# 1) 第一步：解析类 & 函数行号
###############################
def get_structure_prompt(code_text: str, program_language: str) -> str:
    lines = code_text.splitlines()
    lines_json = [{"line": i + 1, "code": line} for i, line in enumerate(lines)]

    prompt = f"""
You are given a piece of {program_language} code. 
Your task: find all the nested classes and methods in the code, 
including their exact start_line and end_line.

Return a JSON with this structure:

{{
  "name": "example_script", 
  "type": "CFG",
  "start_line": 1,
  "end_line": {len(lines)},
  "functions": [
    {{
      "name": "function_name",
      "type": "function",
      "start_line": 10,
      "end_line": 20,
      "functions": [],
      "classes": []
    }}
  ],
  "classes": [
    {{
      "name": "class_name",
      "type": "class",
      "start_line": 30,
      "end_line": 50,
      "functions": [
        {{
          "name": "method_name",
          "type": "function",
          "start_line": 35,
          "end_line": 40,
          "functions": [],
          "classes": []
        }}
      ],
      "classes": []
    }}
  ]
}}

IMPORTANT:
1) If a function is nested in a class, put it in the class's "functions".
2) If a class is nested in another class, put it in "classes".
3) start_line/end_line must reflect the actual lines in the original code (1-based).
4) Do not omit or reorder fields. Use the exact structure above.

Code lines:
{json.dumps(lines_json, indent=2)}
"""
    return prompt


def find_structure(code_text: str, program_language="python"):
    prompt = get_structure_prompt(code_text, program_language)
    response = get_llm_answers(prompt, model_name=model_name, require_json=True)
    structure = json.loads(response)
    return structure


###############################
# 2) 第二步：为某段代码生成“树状”CFG
###############################
def get_tree_cfg_prompt(code: str, program_language: str, block_name: str) -> str:
    """
    让大模型基于一段完整的代码生成“树状”CFG，保留完整代码行，不做简化。
    successors 里直接嵌套 block 对象（非 ID）。
    """
    lines = code.splitlines()
    lines_json = [{"line": i + 1, "code": line} for i, line in enumerate(lines)]

    example_json = """{
  "name": "my_block",
  "type": "CFG",
  "blocks": [
    {
      "id": 1,
      "start_line": 1,
      "end_line": 10,
      "successors": [
        {
          "id": 2,
          "start_line": 11,
          "end_line": 20,
          "successors": []
        }
      ]
    }
  ],
  "functions": [
    {
      "name": "my_function",
      "type": "CFG",
      "blocks": [
        {
          "id": 1,
          "start_line": 1,
          "end_line": 10,
          "successors": []
        }
      ],
      "functions": [],
      "classes": []
    }
  ],
  "classes": []
}"""

    prompt = f"""
You are given a piece of {program_language} code, and you must produce a CFG in **tree style**:
- The top-level JSON must be:
  {{
    "name": "{block_name}",
    "type": "CFG", 
    "blocks": [...],
    "functions": [...],
    "classes": [...]
  }}
- "blocks" is an array of basic blocks. Each block is:
  {{
    "id": <int>,
    "start_line": <int>, 
    "end_line": <int>,
    "successors": [ <nested block objects> ]
  }}
- "functions" is an array of nested function CFGs, each with the same structure.
- "classes" likewise.

Keep the code lines exactly as is.
When splitting code into blocks, follow these rules:
1) For if statements:
   - The if condition in the block A
   - The if body in the successors of the block A
   - The elif/else bodies in the successors of the block A
2) For loops:
   - The loop header (for/while) in the block A
   - The loop body in the successors of the block A
   - The else clause in the successors of the block A
3) For try/except:
   - The try in the block A
   - The try body in the successors of the block A
   - The except clauses in the successors of the block A
   - The else in the successors of the block A
   - The finally in the successors of the block A
4) For with statements:
   - The with line in the block A
   - The with body in the successors of the block A
5) For switch/case:
   - The switch in the block A
   - The each case/default in the successors of the block A

6) For yield statements and so on:
   - The yield statement in the block A
   - The yield body in the successors of the block A

You must split the code into blocks exactly as described above.

Example output (shortened):
{example_json}

Now, here is the code you should analyze:
------------------------------------
{json.dumps(lines_json, indent=2)}
------------------------------------

Follow the structure carefully. Output only JSON.
"""
    return prompt


##############################
# 剥离子函数/子类范围的函数
##############################
def get_code_by_line_range(block_info: dict, code: str, block_type: str = "block"):
    """
    从 code 中提取 [start_line..end_line] 行；移除所有子 class/function 的行；
    并将连续空行合并为1行，然后存到 block_info["label"]。
    """
    code_lines = code.splitlines()

    start_line = block_info["start_line"]
    end_line = block_info["end_line"]

    if block_type in ("class", "function"):
        # 有时希望类或函数定义的第一行(如 'class XXX:' 或 'def yyy:')
        # 不包含在 label 里，可加一些偏移
        # 或者你想保留也行，看需求
        start_line += 0

    # 构建本块的行号集合
    line_set = set(range(start_line, end_line + 1))

    # 删去所有子函数行号范围
    for func in block_info.get("functions", []):
        fs = set(range(func.get("start_line", 0), func.get("end_line", 0) + 1))
        line_set.difference_update(fs)

    # 删去所有子类行号范围
    for cls in block_info.get("classes", []):
        cs = set(range(cls.get("start_line", 0), cls.get("end_line", 0) + 1))
        line_set.difference_update(cs)

    # 排序并拼成字符串
    ordered_lines = sorted(list(line_set))
    sum_code = "\n".join([code_lines[i - 1] for i in ordered_lines])
    # 合并多余空行
    sum_code = re.sub(r"\n+", "\n", sum_code).strip()

    block_info["label"] = sum_code


##############################
# 更新 block 及其 successors
##############################
def update_block_tree(block: dict, local_code: str) -> set:
    """
    递归地处理一个 block:
    1) 先获取该 block 的行号集合 = [start_line..end_line].
    2) 对所有 successors 做递归调用, 收集它们用到的行号 all_succ_lines.
    3) 从本 block 的行号集合中剔除 all_succ_lines, 得到 exclusive_lines.
    4) 拼接成 label, 存入 block["label"].
    5) 返回【本 block 及其 successors】共同使用的行号 (供上层再剔除).
    """
    start_line = block.get("start_line", 0)
    end_line = block.get("end_line", 0)
    parent_lines = set(range(start_line, end_line + 1))

    # 先收集所有后继的行号
    all_succ_lines = set()
    for succ in block.get("successors", []):
        child_lines = update_block_tree(succ, local_code)
        all_succ_lines |= child_lines

    # 现在 parent_lines - all_succ_lines 才是“父block专属”的行号
    exclusive_lines = parent_lines - all_succ_lines

    # 拼接 label
    code_lines = local_code.splitlines()
    # 排序
    ordered = sorted(list(exclusive_lines))
    label = "\n".join(code_lines[i - 1] for i in ordered)
    # 压缩空行
    label = re.sub(r"\n+", "\n", label).strip()
    block["label"] = label

    # 返回 “父 + 所有后继” 总共使用的行号
    return parent_lines | all_succ_lines


##############################
# 3) 构造树状 CFG
##############################
def build_tree_cfg_for_block(block_info: dict, code_text: str, program_language: str, block_type: str = "block") -> dict:
    """
    先用 get_code_by_line_range 剥离子class/function。然后调用大模型生成 CFG 结构。
    再对 “partial_cfg[blocks]” 做 update_block_tree，以排除后继块使用的行号。
    最后再递归处理子函数/子类，并写入 partial_cfg["functions"], partial_cfg["classes"].
    """
    # 剥离子class/function
    get_code_by_line_range(block_info, code_text, block_type)
    local_segment = block_info["label"]

    block_name = block_info.get("name", "anonymous_block")
    prompt = get_tree_cfg_prompt(local_segment, program_language, block_name)
    response_text = get_llm_answers(prompt, model_name=model_name, require_json=True)
    partial_cfg = json.loads(response_text)

    # 第一步：对 partial_cfg["blocks"] 进行“父子行号去重”
    for b in partial_cfg.get("blocks", []):
        update_block_tree(b, local_segment)

    # 第二步：继续递归处理子函数、子类
    final_functions = []
    for func_info in block_info.get("functions", []):
        sub_cfg = build_tree_cfg_for_block(func_info, code_text, program_language, "function")
        final_functions.append(sub_cfg)

    final_classes = []
    for cls_info in block_info.get("classes", []):
        sub_cfg = build_tree_cfg_for_block(cls_info, code_text, program_language, "class")
        final_classes.append(sub_cfg)

    partial_cfg["functions"] = final_functions
    partial_cfg["classes"] = final_classes

    return partial_cfg


#########################
# 处理单个文件
#########################
def process_file(file_path: str, program_language="python") -> dict:
    with open(file_path, "r", encoding="utf-8") as f:
        code_text = f.read()

    # 获取文件整体结构
    structure = find_structure(code_text, program_language)
    # 构建树状 CFG
    top_cfg = build_tree_cfg_for_block(structure, code_text, program_language)
    return top_cfg


#########################
# 批量处理
#########################
def main():
    output_dir = f"{model_name}_tree_cfg_output"
    os.makedirs(output_dir, exist_ok=True)

    source_dir = "source_code"
    files = os.listdir(source_dir)
    py_files = [f for f in files if f.endswith(".py")][:]

    def process_single_file(filename):
        input_path = os.path.join(source_dir, filename)
        output_path = os.path.join(output_dir, filename.replace(".py", ".json"))
        print(f"Processing {filename} ...")

        final_cfg = process_file(input_path, "python")
        def add_label_after_end_line(cfg_dict):
            # 处理当前层级的blocks
            for block in cfg_dict.get("blocks", []):
                if "label" in block:
                    # 保存并删除label
                    code = block.pop("label")
                    
                    # 重新按顺序构建字典
                    new_block = {}
                    for k, v in block.items():
                        new_block[k] = v
                        if k == "end_line":
                            new_block["label"] = code
                    
                    # 用重排后的字典替换原block
                    block.clear()
                    block.update(new_block)
                
                # 递归处理successors
                for successor in block.get("successors", []):
                    add_label_after_end_line({"blocks": [successor]})
            
            # 递归处理functions
            for func in cfg_dict.get("functions", []):
                add_label_after_end_line(func)
                
            # 递归处理classes  
            for cls in cfg_dict.get("classes", []):
                add_label_after_end_line(cls)
                
        add_label_after_end_line(final_cfg)
        with open(output_path, "w", encoding="utf-8") as wf:
            json.dump(final_cfg, wf, indent=2)

    from concurrent.futures import ThreadPoolExecutor
    import multiprocessing

    # 获取CPU核心数
    num_workers = multiprocessing.cpu_count()
    
    # 使用线程池并行处理文件
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        executor.map(process_single_file, py_files)

    # for fn in py_files:
    #     process_single_file("60.py")


if __name__ == "__main__":
    main()

Processing 11.py ...
Processing 60.py ...
Processing 180.py ...
Processing 195.py ...
Processing 202.py ...
Processing 208.py ...
Processing 176.py ...
Processing 151.py ...
Processing 98.py ...
Processing 13.py ...
Processing 71.py ...
Processing 120.py ...
Processing 12.py ...
Processing 167.py ...
Processing 186.py ...
Processing 62.py ...
Processing 14.py ...
Processing 6.py ...
Processing 59.py ...
Processing 107.py ...
Processing 163.py ...
Processing 9.py ...
Processing 129.py ...
Processing 139.py ...
Processing 54.py ...
Processing 184.py ...
Processing 174.py ...
Processing 138.py ...
Processing 148.py ...
Processing 201.py ...
Processing 75.py ...
Processing 55.py ...
Processing 159.py ...
Processing 160.py ...
Processing 123.py ...
Processing 29.py ...
Processing 100.py ...
Processing 45.py ...
Processing 116.py ...
Processing 25.py ...
Processing 89.py ...
Processing 28.py ...
Processing 86.py ...
Processing 99.py ...
Processing 19.py ...
Processing 90.py ...
Processing 13

In [16]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import difflib
import re
from typing import Dict, Any, List, Tuple, Set


############################################
# 1) 收集子图(顶层/class/function)
############################################

def collect_subgraphs(cfg: dict, prefix: str="") -> Dict[str, dict]:
    """
    递归收集子图:
    - 每一层(顶层, class, function)变成一个subgraph
    - subgraph包含: {
         "prefix": prefix,
         "blocks": [  # block结构保持原状
            {
              "label": "...",
              "successors": [...],  # 仍然是block对象list
              ...
            },
            ...
         ]
      }
    - 并进一步递归function/classes
    返回: { prefix -> subgraph }
    """

    result = {}

    # 当前层 subgraph
    sub = {
        "prefix": prefix,
        "blocks": cfg.get("blocks", [])
    }
    result[prefix] = sub

    # 递归 functions
    for f in cfg.get("functions", []):
        fname = f.get("name","")
        subpfx = prefix + "." + fname if prefix else fname
        result.update( collect_subgraphs(f, subpfx) )

    # 递归 classes
    for c in cfg.get("classes", []):
        cname = c.get("name","")
        subpfx = prefix + "." + cname if prefix else cname
        result.update( collect_subgraphs(c, subpfx) )

    return result


############################################
# 2) 工具: label 相似度 & blocks 匹配
############################################

def label_similarity(a: str, b: str) -> float:
    """ difflib 计算字符串相似度 """
    return difflib.SequenceMatcher(None, a, b).ratio()

def match_blocks_by_label(blocksA: List[dict], blocksB: List[dict], threshold: float=0.6):
    """
    用 label 相似度做贪心匹配:
    - blocksA每个block, 在blocksB里找到相似度最高者,若>threshold,则匹配
    - blocksB若被匹配一次后就不可再用
    返回:
      matched_pairs: List of (idxA, idxB)
      matched_count
      totalA = len(blocksA)
      totalB = len(blocksB)
    """
    matched_pairs = []
    usedB = set()

    for i, ba in enumerate(blocksA):
        best_sim = 0
        best_j = -1
        for j, bb in enumerate(blocksB):
            if j in usedB:
                continue
            sim = label_similarity(ba.get("label",""), bb.get("label",""))
            if sim>best_sim:
                best_sim = sim
                best_j = j
        if best_sim>=threshold and best_j!=-1:
            matched_pairs.append( (i,best_j) )
            usedB.add(best_j)

    return matched_pairs, len(blocksA), len(blocksB)


############################################
# 3) 在已匹配的blocks对里 对比 edges
############################################

def collect_edges(blocks: List[dict]) -> List[Tuple[int,int]]:
    """
    收集 edges: (i->j) 其中 i,j 是block的索引
    - block i 的 successors 里面若出现 block j, 就 (i->j)
    - 但这里 successors 是 block对象(带label等),
      我们要知道 j 是 blocks 里的哪一个, 用 label匹配?
      => 做一个 label->index(多对多?), 但若 label 不唯一就有歧义.
    这里简化: 遍历 i 的 successors, 在 blocks 里做 label相等者.
    """

    edges = []
    # 预先把 blocks label -> possible indices
    # 如果label不唯一，则是一对多
    label_map = {}
    for i, b in enumerate(blocks):
        lbl = b.get("label","")
        label_map.setdefault(lbl, []).append(i)

    for i, blockA in enumerate(blocks):
        succ_list = blockA.get("successors", [])
        # succ_list是一堆block
        for sblock in succ_list:
            slabel = sblock.get("label","")
            # 在 label_map 里找 slabel
            if slabel in label_map:
                # 可能对应多个candidate, 全都视为后继?
                for j in label_map[slabel]:
                    edges.append( (i,j) )
            else:
                # 不匹配任何,忽略
                pass

    return edges


def compare_edges(
    blocksA: List[dict], 
    blocksB: List[dict], 
    matched_pairs: List[Tuple[int,int]]
) -> Dict[str,float]:
    """
    通过 matched_pairs (block i in A -> block j in B),
    收集 edgesA, edgesB, 并做映射:
      - 在 A: (i->x), 若 i->x & i,x 都匹配到 (j->y)
      - 那就构成 (j->y) in B, check if B真有 (j->y).
    计算 edge-level precision/recall/f1.
    """

    def safe_div(a,b): return a/b if b else 1

    # 先收集 A/B 的 edges( i->x )  / ( j->y ) indices
    edgesA = set( collect_edges(blocksA) )  # set of (i,x)
    edgesB = set( collect_edges(blocksB) )  # set of (j,y)

    # 建立 matched_A->B
    mapAtoB = dict(matched_pairs)  # i->j
    # 同理, matched_BtoA = dict( (j,i) for i,j in matched_pairs )

    # 对于 A 的一条 (i->x), 若 i,x 都在 matched_pairs, -> (j->y)
    # 如果 (j->y) in edgesB,则此edge算 matched
    matched_edges = 0
    for (i,x) in edgesA:
        if i in mapAtoB and x in mapAtoB:
            j = mapAtoB[i]
            y = mapAtoB[x]
            if (j,y) in edgesB:
                matched_edges+=1

    # p/r
    # #edges in A = len(edgesA), #edges in B= len(edgesB)
    p = safe_div(matched_edges, len(edgesA))
    r = safe_div(matched_edges, len(edgesB))
    f1 = 2*p*r/(p+r) if (p+r)>0 else 0
    return {
      "edge_precision": p,
      "edge_recall": r,
      "edge_f1": f1
    }


############################################
# 4) compare_single_subgraph: blocks + edges
############################################

def compare_single_subgraph(
    subA: dict, 
    subB: dict, 
    threshold: float = 0.6
) -> Dict[str,float]:
    """
    对同一个 prefix 的子图:
      1) match blocks by label
      2) compute block-level p/r/f1
      3) compute edge-level p/r/f1
    返回: { block_precision, block_recall, block_f1, edge_precision, edge_recall, edge_f1}
    """

    blocksA = subA["blocks"]
    blocksB = subB["blocks"]

    matched_pairs, totalA, totalB = match_blocks_by_label(blocksA, blocksB, threshold)
    matched_count = len(matched_pairs)

    def safe_div(a,b): return a/b if b else 1.0
    block_precision = safe_div(matched_count, totalA)
    block_recall = safe_div(matched_count, totalB)
    block_f1 = 2*block_precision*block_recall/(block_precision+block_recall) if (block_precision+block_recall)>0 else 0

    # edges
    edge_result = compare_edges(blocksA, blocksB, matched_pairs)

    return {
      "block_precision": block_precision,
      "block_recall": block_recall,
      "block_f1": block_f1,
      "edge_precision": edge_result["edge_precision"],
      "edge_recall": edge_result["edge_recall"],
      "edge_f1": edge_result["edge_f1"],
    }


############################################
# 5) compare_all_subgraphs: 挨个 prefix
############################################

def compare_all_subgraphs(cfg_llm: dict, cfg_static: dict, threshold: float=0.6) -> Dict[str,Any]:
    """
    1) collect_subgraphs for LLM & static
    2) for each prefix in LLM, see if exists in static
       - if yes, compare_single_subgraph
       - if no, record extra
    3) record missing subgraphs
    4) average the metrics
    """

    subs_llm = collect_subgraphs(cfg_llm)
    subs_stc = collect_subgraphs(cfg_static)

    matched_results = []
    extra_subgraphs = []
    missing_subgraphs = []

    for prefix, subA in subs_llm.items():
        if prefix in subs_stc:
            subB = subs_stc[prefix]
            metrics = compare_single_subgraph(subA, subB, threshold=threshold)
            matched_results.append( (prefix, metrics) )
        else:
            extra_subgraphs.append(prefix)

    for prefix in subs_stc:
        if prefix not in subs_llm:
            missing_subgraphs.append(prefix)

    # 统计平均
    if matched_results:
        block_p = sum(x[1]["block_precision"] for x in matched_results)/len(matched_results)
        block_r = sum(x[1]["block_recall"] for x in matched_results)/len(matched_results)
        block_f1= sum(x[1]["block_f1"] for x in matched_results)/len(matched_results)
        edge_p  = sum(x[1]["edge_precision"] for x in matched_results)/len(matched_results)
        edge_r  = sum(x[1]["edge_recall"] for x in matched_results)/len(matched_results)
        edge_f1 = sum(x[1]["edge_f1"] for x in matched_results)/len(matched_results)
    else:
        block_p=block_r=block_f1=1
        edge_p=edge_r=edge_f1=1

    return {
      "matched_subgraphs_count": len(matched_results),
      "extra_subgraphs_in_llm": extra_subgraphs,
      "missing_subgraphs_in_static": missing_subgraphs,

      "avg_block_precision_subgraphs": block_p,
      "avg_block_recall_subgraphs": block_r,
      "avg_block_f1_subgraphs": block_f1,

      "avg_edge_precision_subgraphs": edge_p,
      "avg_edge_recall_subgraphs": edge_r,
      "avg_edge_f1_subgraphs": edge_f1
    }


############################################
# 6) 可选: compare_structure (类/函数命名)
############################################

def collect_defs(cfg: dict, prefix="") -> (Set[str], Set[str]): # type: ignore
    classes = set()
    funcs   = set()
    # classes
    for c in cfg.get("classes",[]):
        cname = c.get("name","")
        cfull = f"{prefix}.{cname}" if prefix else cname
        classes.add(cfull)
        sc, sf = collect_defs(c, cfull)
        classes |= sc
        funcs   |= sf
    # functions
    for f in cfg.get("functions",[]):
        fname = f["name"]
        ffull = f"{prefix}.{fname}" if prefix else fname
        funcs.add(ffull)
        sc, sf = collect_defs(f, ffull)
        classes |= sc
        funcs   |= sf
    return classes, funcs

def compare_structure(cfg_llm: dict, cfg_static: dict) -> Dict[str,float]:
    def safe_div(a,b): return a/b if b else 1.0

    llmC,llmF = collect_defs(cfg_llm)
    stcC,stcF = collect_defs(cfg_static)

    matchedC = llmC & stcC
    matchedF = llmF & stcF

    cp = safe_div(len(matchedC), len(llmC))
    cr = safe_div(len(matchedC), len(stcC))
    cf1= 2*cp*cr/(cp+cr) if (cp+cr)>0 else 0

    fp = safe_div(len(matchedF), len(llmF))
    fr = safe_div(len(matchedF), len(stcF))
    ff1= 2*fp*fr/(fp+fr) if (fp+fr)>0 else 0

    return {
      "class_precision": cp,
      "class_recall": cr,
      "class_f1": cf1,
      "function_precision": fp,
      "function_recall": fr,
      "function_f1": ff1
    }


############################################
# 7) 主 compare_cfgs: 汇总子图 & 结构
############################################

def compare_cfgs(cfg_llm: dict, cfg_static: dict, threshold: float=0.6) -> Dict[str,Any]:
    """
    1) compare structure (可选)
    2) compare_all_subgraphs: blocks + edges
    3) combine final
    """
    structure = compare_structure(cfg_llm, cfg_static)

    subgraph_metrics = compare_all_subgraphs(cfg_llm, cfg_static, threshold=threshold)

    # 一个简单overall:
    # structure f1
    struct_f1 = (structure["class_f1"] + structure["function_f1"]) / 2
    # subgraph block/edge f1
    block_f1 = subgraph_metrics["avg_block_f1_subgraphs"]
    edge_f1  = subgraph_metrics["avg_edge_f1_subgraphs"]
    # overall
    overall = (struct_f1 + block_f1 + edge_f1)/3

    return {
      "structure_metrics": structure,
      "subgraph_metrics": subgraph_metrics,
      "overall_score": overall
    }


############################################
# 8) main
############################################

def main():
    # 示例: 你可以改成 argparse
    total_results = {
        "overall_scores": [],
        "structure_metrics": {
            "class_precision": [],
            "class_recall": [],
            "class_f1": [],
            "function_precision": [],
            "function_recall": [],
            "function_f1": []
        },
        "block_metrics": {
            "precision": [],
            "recall": [],
            "f1": []
        },
        "edge_metrics": {
            "precision": [],
            "recall": [],
            "f1": []
        },
        "subgraph_stats": {
            "matched_counts": [],
            "extra_counts": [],
            "missing_counts": []
        }
    }
    
    for i in range(200):
        llm_cfg_path = f"gpt-4o_tree_cfg_output/{i}.json"
        static_cfg_path = f"static_cfg/{i}.json"

        if not os.path.exists(llm_cfg_path) or not os.path.exists(static_cfg_path):
            continue

        with open(llm_cfg_path, "r", encoding="utf-8") as f:
            cfg_llm = json.load(f)
        with open(static_cfg_path, "r", encoding="utf-8") as f:
            cfg_static = json.load(f)

        results = compare_cfgs(cfg_llm, cfg_static, threshold=0.6)
        
        # 收集所有指标
        total_results["overall_scores"].append(results["overall_score"])
        
        # 结构指标
        for metric in ["class_precision", "class_recall", "class_f1", 
                      "function_precision", "function_recall", "function_f1"]:
            total_results["structure_metrics"][metric].append(
                results["structure_metrics"][metric])
        
        # 基本块指标
        total_results["block_metrics"]["precision"].append(
            results["subgraph_metrics"]["avg_block_precision_subgraphs"])
        total_results["block_metrics"]["recall"].append(
            results["subgraph_metrics"]["avg_block_recall_subgraphs"])
        total_results["block_metrics"]["f1"].append(
            results["subgraph_metrics"]["avg_block_f1_subgraphs"])
            
        # 边指标
        total_results["edge_metrics"]["precision"].append(
            results["subgraph_metrics"]["avg_edge_precision_subgraphs"])
        total_results["edge_metrics"]["recall"].append(
            results["subgraph_metrics"]["avg_edge_recall_subgraphs"])
        total_results["edge_metrics"]["f1"].append(
            results["subgraph_metrics"]["avg_edge_f1_subgraphs"])
            
        # 子图统计
        total_results["subgraph_stats"]["matched_counts"].append(
            results["subgraph_metrics"]["matched_subgraphs_count"])
        total_results["subgraph_stats"]["extra_counts"].append(
            len(results["subgraph_metrics"]["extra_subgraphs_in_llm"]))
        total_results["subgraph_stats"]["missing_counts"].append(
            len(results["subgraph_metrics"]["missing_subgraphs_in_static"]))
    
    def avg(lst):
        return sum(lst)/len(lst) if lst else 0
    
    # 打印详细的评估结果
    print("\n=== 总体评估结果 ===")
    print(f"平均整体得分: {avg(total_results['overall_scores']):.3f}")
    
    print("\n=== 结构评估 ===")
    print(f"类评估:")
    print(f"  - Precision: {avg(total_results['structure_metrics']['class_precision']):.3f}")
    print(f"  - Recall: {avg(total_results['structure_metrics']['class_recall']):.3f}")
    print(f"  - F1: {avg(total_results['structure_metrics']['class_f1']):.3f}")
    print(f"函数评估:")
    print(f"  - Precision: {avg(total_results['structure_metrics']['function_precision']):.3f}")
    print(f"  - Recall: {avg(total_results['structure_metrics']['function_recall']):.3f}")
    print(f"  - F1: {avg(total_results['structure_metrics']['function_f1']):.3f}")
    
    print("\n=== 基本块评估 ===")
    print(f"Precision: {avg(total_results['block_metrics']['precision']):.3f}")
    print(f"Recall: {avg(total_results['block_metrics']['recall']):.3f}")
    print(f"F1: {avg(total_results['block_metrics']['f1']):.3f}")
    
    print("\n=== 边评估 ===")
    print(f"Precision: {avg(total_results['edge_metrics']['precision']):.3f}")
    print(f"Recall: {avg(total_results['edge_metrics']['recall']):.3f}")
    print(f"F1: {avg(total_results['edge_metrics']['f1']):.3f}")
    
    print("\n=== 子图统计 ===")
    print(f"平均匹配子图数: {avg(total_results['subgraph_stats']['matched_counts']):.1f}")
    print(f"平均额外子图数: {avg(total_results['subgraph_stats']['extra_counts']):.1f}")
    print(f"平均缺失子图数: {avg(total_results['subgraph_stats']['missing_counts']):.1f}")


if __name__ == "__main__":
    main()


=== 总体评估结果 ===
平均整体得分: 0.800

=== 结构评估 ===
类评估:
  - Precision: 0.997
  - Recall: 0.997
  - F1: 0.997
函数评估:
  - Precision: 0.997
  - Recall: 0.993
  - F1: 0.995

=== 基本块评估 ===
Precision: 0.619
Recall: 0.431
F1: 0.407

=== 边评估 ===
Precision: 0.997
Recall: 1.000
F1: 0.997

=== 子图统计 ===
平均匹配子图数: 9.8
平均额外子图数: 0.0
平均缺失子图数: 0.1


In [1]:
def get_evaluate_prompt(llm_cfg, static_cfg):
    prompt = f"""
You are an expert in static analysis of Python code. I will provide you with two JSON representations of Control Flow Graphs (CFGs). One CFG is generated by a language model (LLM-based CFG), and the other CFG is generated by a static analyzer (Static-based CFG). I want you to compare them across multiple dimensions to see how closely they match. Specifically, please analyze:
	1.	Classes and Functions Structure
	•	Compare which classes and functions appear in each CFG.
	•	Check if they are nested consistently (e.g., whether a function is nested inside the same class in both CFGs).
	•	Point out any classes/functions missing or extra in one CFG vs. the other.
	2.	Blocks
	•	Look at each block in the CFG, focusing on how these blocks are formed. Compare their labels or relevant content.
	•	Are the blocks subdivided in the same places (e.g., same if/else boundaries) or does one CFG merge multiple statements into a single block while the other splits them?
	•	Discuss any significant differences in how blocks are structured, labeled, or combined.
	3.	Edges / Successors
	•	Check the connections or transitions between blocks (successors).
	•	Identify if the same control-flow edges appear in both CFGs. For instance, do they agree on the branching after an if-statement, or do they differ?
	•	Point out missing edges or extra edges.
	4.	Any Additional Observations
	•	Note any unusual merges/splits of blocks, or differences in how special constructs (try-except, for/while, etc.) are represented.
	•	If line numbers exist, mention whether they differ significantly.
	5.	Summary and Recommendations
	•	Provide an overall assessment: Are the CFGs mostly matching, or are there major discrepancies?
	•	Suggest how accurate the LLM-based CFG is relative to the static-based CFG, potentially giving a numerical score or rating if you can.

Instructions:
	•	Read both CFGs carefully.
    •	Treat static CFG as the ground truth.
	•	Summarize your findings for each dimension in a clear, structured way.
	•	Finally, give a single consolidated assessment or score about how well the LLM CFG aligns with the static CFG across all these aspects.

Output Format:
	•	You may respond in plain English, or use a structured JSON (or bullet points) that highlights each dimension's differences, plus an overall conclusion.
	•	If needed, give specific examples of discrepancies or matches, referencing the class/function/block labels where appropriate.

I will now provide the two CFG JSON objects. Please follow the instructions above to perform a multi-dimensional comparison.

static CFG:
{static_cfg}

LLM CFG:
{llm_cfg}

Your output should be in JSON format as follows: """ + """
{
	"overall_scores": number(0-100),
    "structure_similarity": number(0-1),
    "block_similarity": number(0-1),
    "edge_similarity": number(0-1)
}
""" 
    return prompt

from llm import get_llm_answers
import json
import os
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
import tqdm

def process_single_file(i):
    
    # 创建输出目录
    os.makedirs("results/gpt-4o/evaluate", exist_ok=True)
    
    # 保存结果到对应文件
    output_path = f"results/gpt-4o/evaluate/{i}.json"
    llm_cfg_path = f"gpt-4o_tree_cfg_output/{i}.json"
    static_cfg_path = f"static_cfg/{i}.json"

    if os.path.exists(output_path):
        return json.load(open(output_path, "r", encoding="utf-8"))

    if not os.path.exists(llm_cfg_path) or not os.path.exists(static_cfg_path):
        return
    with open(llm_cfg_path, "r", encoding="utf-8") as f:
        cfg_llm = json.load(f)
    with open(static_cfg_path, "r", encoding="utf-8") as f:
        cfg_static = json.load(f)

    prompt = get_evaluate_prompt(cfg_llm, cfg_static)

    try:
        llm_answers = json.loads(get_llm_answers(prompt, model_name="gpt-4o", require_json=True))
    except Exception as e:
        return {
            "overall_scores": 0,
            "structure_similarity": 0,
            "block_similarity": 0,
            "edge_similarity": 0
        }
    
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(llm_answers, f, indent=2)
    
    return llm_answers

def main():
    # 使用ProcessPoolExecutor代替ThreadPoolExecutor以实现真正的并行
    from concurrent.futures import ThreadPoolExecutor
    import multiprocessing

    # 获取CPU核心数
    num_workers = multiprocessing.cpu_count()
    
    # 使用线程池并行处理文件
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        tasks = [i for i in range(200)]
        results = list(tqdm.tqdm(
            executor.map(process_single_file, tasks),
            total=len(tasks),
            desc="处理文件"
        ))
        results = [r for r in results if r is not None]
    
    # 计算总体分数
    total_scores = {
        "overall_scores": [],
        "structure_similarity": [],
        "block_similarity": [], 
        "edge_similarity": []
    }
    
    for result in results:
        for key in total_scores:
            if key in result:
                total_scores[key].append(result[key])
    
    # 计算平均分数
    avg_scores = {
        key: np.mean(scores) for key, scores in total_scores.items() if scores
    }
    
    print("总体评估分数:")
    for key, score in avg_scores.items():
        print(f"{key}: {score:.2f}")

if __name__ == "__main__":
    main()


处理文件:  62%|██████▏   | 123/200 [00:12<00:07, 10.18it/s]


JSONDecodeError: Extra data: line 8 column 1 (char 121)