### Easy Generate CFG

#### LLM

直接让 LLM 一次性生成完整 CFG 的效果不太好，还是需要我们拆成几步、一步步进行处理！

Step 1：先找出文件中的嵌套类和方法

In [1]:
from concurrent.futures import as_completed
from multiprocessing import cpu_count
from llm import get_llm_answers
import json

def get_step1_prompt(code_text: str, program_language: str):
    """
    Build the step-1 prompt that asks the LLM to locate nested classes and methods.

    Args:
        code_text: Source text to analyse. It is split with ``splitlines()`` and
            every line is paired with its 1-based line number.
        program_language: Language name spliced into the first sentence of the prompt.

    Returns:
        The prompt string: task description, expected JSON schema, the numbered
        source lines serialised as indented JSON, and a closing reminder about
        keeping nested items at the correct level.
    """
    numbered_lines = [
        {"line": number, "code": text}
        for number, text in enumerate(code_text.splitlines(), start=1)
    ]

    opening = "\nYou are given a piece of "

    # Plain (non-f) string: the JSON schema contains literal braces.
    task_and_schema = """ code. Your goal is to find all the nested classes and methods in the code.

Please return the result in JSON format, your output should be the following format:

```json
{
    "name": "example_script",  // Name of the script or function
    "type": "CFG",
    "start_line": number,
    "end_line": number,
    "functions": [
      {
        "name": "function_name",
        "type": "function",
        "start_line": number,
        "end_line": number,
        "functions": [],         // Nested functions
        "classes": []            // Nested classes
      }
    ],
    "classes": [
      {
        "name": "class_name",
        "type": "class",
        "start_line": number,
        "end_line": number,
        "functions": [           // Methods of the class
          {
            "name": "method_name",
            "type": "function",
            "start_line": number,
            "end_line": number,
            "functions": [],     // Nested functions
            "classes": []        // Nested classes
          }
        ]
      }
    ]
}
```
The code lines are:
"""

    # Written as explicit literals so the trailing space after "functions list."
    # (present in the prompt text) stays visible and is not lost to editors.
    reminder = (
        "\nIMPORTANT: Make sure that the nested classes and methods are in the correct level. "
        "For example, if a function is nested in another class, the function should be "
        "in the nested class's functions list. \n"
        "Besides, if a class is nested in another class, the class should be "
        "in the nested class's classes list.\n"
    )

    return "".join([
        opening,
        program_language,
        task_and_schema,
        json.dumps(numbered_lines, indent=2),
        reminder,
    ])

def find_nested_classes_and_methods(code_text: str, program_language):
    """
    Ask the LLM for the nested class/method structure of a source file.

    Builds the step-1 prompt for ``code_text``, sends it to ``get_llm_answers``
    (model ``gpt-4o``, JSON output requested) and returns the reply parsed with
    ``json.loads``. A reply that is not valid JSON raises from ``json.loads``.
    """
    raw_answer = get_llm_answers(
        get_step1_prompt(code_text, program_language),
        model_name="gpt-4o",
        require_json=True,
    )
    return json.loads(raw_answer)

def process_file_with_chain_of_thought(code_text: str, program_language: str):
    """
    Run the first pipeline step on already-loaded source text.

    The function does no file I/O itself: it delegates to
    ``find_nested_classes_and_methods`` and returns that result unchanged
    (the nested class/function structure reported by the LLM).
    """
    # Step 1: locate the nested classes and methods in the file.
    return find_nested_classes_and_methods(code_text, program_language)

from difflib import SequenceMatcher


def get_code_by_line_range(code_block, code):
    """
    Store in ``code_block["simplified_code"]`` the block's own source lines.

    "Own" lines are the lines from ``start_line`` to ``end_line`` (1-based,
    both inclusive) minus every line that belongs to a directly nested
    function or class listed under ``"functions"`` / ``"classes"``.

    Args:
        code_block: Dict with ``start_line`` and ``end_line`` and optionally
            ``functions`` / ``classes`` lists of child dicts. Mutated in place.
        code: Full source text the line numbers refer to.

    Returns:
        None. The result is written to ``code_block["simplified_code"]``.

    Raises:
        KeyError: If ``code_block`` has no ``start_line`` or ``end_line``.
    """
    code_lines = code.splitlines()

    # Line numbers come from an LLM and may fall outside the file. Clamp them
    # so that line 0 does not wrap around to the last line via index -1 and a
    # too-large end_line does not raise IndexError.
    first_line = max(code_block["start_line"], 1)
    last_line = min(code_block["end_line"], len(code_lines))
    own_lines = set(range(first_line, last_line + 1))

    # Remove the lines of nested definitions. end_line is inclusive here too,
    # otherwise the final line of every nested function/class would leak into
    # the parent's simplified code.
    for group in ("functions", "classes"):
        for child in code_block.get(group, []):
            child_start = child.get("start_line", 0)
            child_end = child.get("end_line", 0)
            own_lines.difference_update(range(child_start, child_end + 1))

    # Emit the remaining lines in source order.
    code_block["simplified_code"] = "\n".join(
        code_lines[number - 1] for number in sorted(own_lines)
    )

def recursive_get_code_by_line_range(code_block, code):
    """
    Apply ``get_code_by_line_range`` to ``code_block`` and to all its descendants.

    The block itself is handled first, then every entry under ``"functions"``
    and afterwards every entry under ``"classes"``, each one recursively.
    """
    get_code_by_line_range(code_block, code)
    for group in ("functions", "classes"):
        for child in code_block.get(group, []):
            recursive_get_code_by_line_range(child, code)


def print_simplified_code(code_block: dict, indent=0):
    """
    Recursively print the ``simplified_code`` of a block and of its children.

    Nested classes are printed before nested functions; each nesting level
    increases ``indent`` by two spaces. The padding is prepended to the whole
    printed string, so only the first output line of each print carries it.
    """
    pad = " " * indent
    print(pad + "简化后的代码:")
    print(pad + code_block.get("simplified_code", "").strip())

    # (child list key, heading prefix) pairs, in output order: classes, then functions.
    sections = (("classes", "\n类 "), ("functions", "\n函数 "))
    for key, heading in sections:
        for child in code_block.get(key, []):
            print(pad + f"{heading}{child.get('name', '')}:")
            print_simplified_code(child, indent + 2)

def get_code_cfg_prompt(code, program_language):
    """
    Build the prompt that asks the LLM for a Control Flow Graph of ``code``.

    Args:
        code: Source snippet to analyse; inserted verbatim into a fenced block.
        program_language: Language name. It is used in the task description and
            as the info string of the code fence that wraps ``code``.

    Returns:
        The prompt string: task statement, block/branch rules, the required
        nested JSON format with an example, and finally the code to analyse.
    """
    intro = (
        f"\nYou will be given a piece of **{program_language}** code. "
        "Your goal is to generate a **Control Flow Graph (CFG)** for this code "
        "and output the result as **JSON**. Here are the specific requirements:\n"
    )

    # Static rules. Kept as a plain (non-f) string so the JSON example can use
    # ordinary braces instead of doubled ones.
    rules = """
1. **Definition of Basic Blocks**:
   - A basic block can contain one or more “continuous and unbranched” statements.
   - Whenever you encounter a statement that causes a flow jump or branch (e.g., `if-else`, `for-while`, `try-except-finally`, `with-as`, `match-case`, `break-continue-return`, etc.), you should start a new basic block.

2. **JSON Output Structure**:
   - Your output must strictly follow this JSON format, with no additional text or explanation:
```json
{
  "blocks": [
    {
      "id": 1,
      "label": "if a > 2:",
      "successors": [
        {
          "id": 2,
          "label": "    print(a)",
          "successors": [
            {
              "id": 4,
              "label": "print(\\"done\\")",
              "successors": []
            }
          ]
        },
        {
          "id": 3,
          "label": "    print(1)",
          "successors": [
            {
              "id": 4,
              "label": "print(\\"done\\")",
              "successors": []
            }
          ]
        }
      ]
    }
  ]
}
```
   - **id**: an integer starting from 1, incrementing by 1 for each block.
   - **label**: the exact code inside this block, unchanged.
   - **successors**: a list of nested blocks that may execute after this block. Each item in this list is itself a block with the same structure: "id", "label", and "successors".

3. **Branch Structures**:
   - **if-else**: for `if condition: ... else: ...`, both the `if` body and the `else` body should be separate blocks. The `if` block’s `"successors"` should include both branches as nested block objects.
   - **for-while**: the loop body and the statement(s) following the loop should be in different blocks, with correct flow back to the loop condition if it continues, or forward to the next block if it terminates. Again, these should appear as nested block objects in `"successors"`.
   - **try-except-finally**: each `try`, `except`, and `finally` block should be identified separately, showing normal and exceptional flows by nesting them in `"successors"`.
   - **with-as**: the code inside the `with` statement and the code after the `with` block should be separate blocks, reflected as nested structures.
   - **match-case**: treat each `case` body as a separate nested block in `"successors"`.
   - **break-continue-return**: these statements jump to outside of the loop, back to the loop condition, or end the function. If the function ends, the `"successors"` can be an empty list.

4. **Final Output**:
   - Ensure your output is valid JSON (only one root object, containing `"blocks"`).
   - Do not add extra text or explanation—only the JSON object itself.

---
"""

    # The fence is tagged with the actual language and contains nothing but the
    # code itself: block labels must be "the exact code", so any extra line
    # placed inside the fence could end up in a label.
    task = (
        f"**Below is the {program_language} code you need to analyze. "
        "Generate the final JSON output strictly following the above rules:**\n"
        "\n"
        f"```{program_language}\n"
        f"{code}\n"
        "```\n"
    )

    return intro + rules + task


def get_single_block_cfg(code_block, program_languge):
    """
    Request a CFG for one code block and store it on the block.

    The block's ``simplified_code`` is turned into a CFG prompt, sent to
    ``get_llm_answers`` (model ``gpt-4o``, JSON output requested), and the
    ``"blocks"`` entry of the parsed reply is written to ``code_block["blocks"]``.
    Returns None; ``code_block`` is mutated in place.
    """
    cfg_prompt = get_code_cfg_prompt(code_block["simplified_code"], program_languge)
    llm_reply = get_llm_answers(cfg_prompt, model_name="gpt-4o", require_json=True)
    code_block["blocks"] = json.loads(llm_reply)["blocks"]
    
def recursive_get_each_block_cfg(code_block, program_language):
    """
    Fetch the CFG for ``code_block`` and then for every nested class and function.

    The block itself is processed first, followed by all entries under
    ``"classes"`` and then all entries under ``"functions"``, recursively.
    """
    get_single_block_cfg(code_block, program_language)
    for group in ("classes", "functions"):
        for child in code_block.get(group, []):
            recursive_get_each_block_cfg(child, program_language)

import os
from tqdm import tqdm
def main():
    """
    Generate an LLM-based CFG JSON file for each dataset source file.

    For every ``../../dataset/ts/<i>.ts`` with ``i`` in ``range(200)`` whose
    target ``llm_cfg/<i>.json`` does not exist yet, the pipeline extracts the
    nested structure, attaches the simplified code to every node, requests
    the CFG blocks for every node, removes duplicate blocks and writes the
    result as JSON. Files are processed concurrently in a thread pool.

    Failures of individual files do not stop the run. They are collected and
    printed once all files have been attempted.
    """
    from concurrent.futures import ThreadPoolExecutor

    def remove_duplicate_blocks(code_block):
        """
        Drop blocks on the same level that repeat a (start_line, end_line) pair.

        Only the first block of each pair is kept. Blocks lacking either key
        are always kept. Nested classes and functions are processed recursively.
        """
        if "blocks" in code_block:
            seen = set()
            unique_blocks = []
            for block in code_block["blocks"]:
                if "start_line" in block and "end_line" in block:
                    key = (block["start_line"], block["end_line"])
                    if key in seen:
                        continue
                    seen.add(key)
                unique_blocks.append(block)
            code_block["blocks"] = unique_blocks

        for group in ("classes", "functions"):
            for sub_block in code_block.get(group, []):
                remove_duplicate_blocks(sub_block)

    def process_single_file(file, target_file):
        """Run the whole pipeline for one source file unless its target exists."""
        if os.path.exists(target_file):
            return
        print("Processing " + file)
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(file, "r", encoding="utf-8") as f:
            code = f.read()
        step1_result = process_file_with_chain_of_thought(code, "ts")
        recursive_get_code_by_line_range(step1_result, code)
        recursive_get_each_block_cfg(step1_result, "ts")
        remove_duplicate_blocks(step1_result)

        # Serialise before opening the target so a serialisation error cannot
        # leave behind an empty file that later runs would skip as "done".
        serialized = json.dumps(step1_result, indent=2)
        with open(target_file, "w", encoding="utf-8") as f:
            f.write(serialized)

    target_dir = "llm_cfg"
    source_code_dir = "../../dataset/ts"
    os.makedirs(target_dir, exist_ok=True)

    files = [
        (f"{source_code_dir}/{i}.ts", f"{target_dir}/{i}.json")
        for i in range(200)
    ]
    failures = []
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        future_to_file = {
            executor.submit(process_single_file, source, target): source
            for source, target in files
        }
        for future in tqdm(as_completed(future_to_file), total=len(files), desc="处理CFG文件"):
            # result() re-raises whatever the worker raised; without this call
            # every failure would vanish silently.
            try:
                future.result()
            except Exception as exc:  # top-level boundary: record and keep going
                failures.append((future_to_file[future], exc))

    for failed_file, exc in failures:
        print(f"Failed to process {failed_file}: {exc!r}")

# Run the batch job: writes one llm_cfg/<i>.json per dataset file.
main()


Processing ../../dataset/ts/0.ts
Processing ../../dataset/ts/1.ts
Processing ../../dataset/ts/2.ts
Processing ../../dataset/ts/3.ts
Processing ../../dataset/ts/4.ts
Processing ../../dataset/ts/5.ts
Processing ../../dataset/ts/6.ts
Processing ../../dataset/ts/7.ts
Processing ../../dataset/ts/8.ts
Processing ../../dataset/ts/9.ts
Processing ../../dataset/ts/10.ts
Processing ../../dataset/ts/11.ts
Processing ../../dataset/ts/12.ts
Processing ../../dataset/ts/13.ts
Processing ../../dataset/ts/14.ts
Processing ../../dataset/ts/15.ts
Processing ../../dataset/ts/16.ts
Processing ../../dataset/ts/17.ts
Processing ../../dataset/ts/18.ts
Processing ../../dataset/ts/19.ts
Processing ../../dataset/ts/20.ts
Processing ../../dataset/ts/22.ts
Processing ../../dataset/ts/23.ts
Processing ../../dataset/ts/24.ts
Processing ../../dataset/ts/25.ts
Processing ../../dataset/ts/21.ts
Processing ../../dataset/ts/27.ts
Processing ../../dataset/ts/26.ts
Processing ../../dataset/ts/28.ts
Processing ../../dataset

处理CFG文件:   0%|          | 1/200 [00:10<33:51, 10.21s/it]

Processing ../../dataset/ts/112.ts


处理CFG文件:   2%|▏         | 3/200 [00:11<08:26,  2.57s/it]

Processing ../../dataset/ts/113.ts
Processing ../../dataset/ts/114.ts


处理CFG文件:   2%|▏         | 4/200 [00:11<05:29,  1.68s/it]

Processing ../../dataset/ts/115.ts


处理CFG文件:   2%|▎         | 5/200 [00:12<04:42,  1.45s/it]

Processing ../../dataset/ts/116.ts


处理CFG文件:   3%|▎         | 6/200 [00:13<04:19,  1.34s/it]

Processing ../../dataset/ts/117.ts


处理CFG文件:   4%|▎         | 7/200 [00:13<03:11,  1.01it/s]

Processing ../../dataset/ts/118.ts


处理CFG文件:   4%|▍         | 8/200 [00:14<03:10,  1.01it/s]

Processing ../../dataset/ts/119.ts


处理CFG文件:   4%|▍         | 9/200 [00:15<02:51,  1.11it/s]

Processing ../../dataset/ts/120.ts


处理CFG文件:   5%|▌         | 10/200 [00:16<03:03,  1.04it/s]

Processing ../../dataset/ts/121.ts


处理CFG文件:   6%|▌         | 12/200 [00:17<01:48,  1.74it/s]

Processing ../../dataset/ts/122.ts
Processing ../../dataset/ts/123.ts
Processing ../../dataset/ts/124.ts


处理CFG文件:   7%|▋         | 14/200 [00:17<01:09,  2.68it/s]

Processing ../../dataset/ts/125.ts
Processing ../../dataset/ts/126.ts


处理CFG文件:   8%|▊         | 16/200 [00:17<00:53,  3.46it/s]

Processing ../../dataset/ts/127.ts


处理CFG文件:   8%|▊         | 17/200 [00:18<00:56,  3.26it/s]

Processing ../../dataset/ts/128.ts


处理CFG文件:  45%|████▌     | 90/200 [00:18<00:01, 69.15it/s]

Processing ../../dataset/ts/129.ts
Processing ../../dataset/ts/130.ts
Processing ../../dataset/ts/131.ts
Processing ../../dataset/ts/132.ts
Processing ../../dataset/ts/133.ts
Processing ../../dataset/ts/134.ts
Processing ../../dataset/ts/135.ts
Processing ../../dataset/ts/136.ts
Processing ../../dataset/ts/137.ts
Processing ../../dataset/ts/138.ts
Processing ../../dataset/ts/139.ts
Processing ../../dataset/ts/140.ts
Processing ../../dataset/ts/141.ts
Processing ../../dataset/ts/142.ts
Processing ../../dataset/ts/143.ts
Processing ../../dataset/ts/144.ts
Processing ../../dataset/ts/145.ts
Processing ../../dataset/ts/146.ts
Processing ../../dataset/ts/147.ts
Processing ../../dataset/ts/148.ts
Processing ../../dataset/ts/149.ts
Processing ../../dataset/ts/150.ts
Processing ../../dataset/ts/151.ts
Processing ../../dataset/ts/152.ts
Processing ../../dataset/ts/153.ts
Processing ../../dataset/ts/154.ts
Processing ../../dataset/ts/155.ts
Processing ../../dataset/ts/156.ts
Processing ../../dat

处理CFG文件: 100%|██████████| 200/200 [14:51<00:00,  4.46s/it] 


### LLM 生成的基本块可能可以合并

In [2]:
import os
import json

def process_cfg(cfg):
    """
    Clean up a CFG stored in *nested successors* form, in place.

    Steps applied to ``cfg["blocks"]`` (when present):
      1. Keep only blocks reachable from the root, assumed to be ``blocks[0]``.
         Kept blocks are rebuilt with just ``id``, ``label`` and ``successors``.
      2. Merge linear chains: a block with exactly one successor absorbs that
         successor (labels joined with a newline) and keeps absorbing until it
         reaches a branch, a leaf or a loop header.
      3. Repeat the whole procedure for every entry of ``cfg["functions"]``
         and ``cfg["classes"]``.

    Predecessor counts are not checked. In the nested representation a join
    block appears once under each predecessor, so each copy is merged into
    its own parent.

    Args:
        cfg: Dict that may contain ``blocks``, ``functions`` and ``classes``.

    Returns:
        The same ``cfg`` object, modified in place.

    Raises:
        KeyError: If a visited block lacks ``id`` or ``label``.
    """

    # === 1. Drop unreachable blocks; blocks[0] is assumed to be the root ===
    def filter_connected_blocks(blocks):
        """Return the blocks (nested form) whose id is reachable from blocks[0]."""
        reachable_ids = set()

        # Walk the entire tree below the root. There is deliberately no
        # "already seen" shortcut: the same id may occur several times with
        # different subtrees, and skipping later occurrences would prune
        # successors that exist only under them.
        pending = list(blocks[:1])
        while pending:
            block = pending.pop()
            reachable_ids.add(block["id"])
            pending.extend(block.get("successors", []))

        def filter_nested(block_list):
            """Rebuild the nested structure without unreachable blocks."""
            return [
                {
                    "id": b["id"],
                    "label": b["label"],
                    "successors": filter_nested(b.get("successors", [])),
                }
                for b in block_list
                if b["id"] in reachable_ids
            ]

        return filter_nested(blocks)

    # === 2. Loop-header detection (simple keyword check) ===
    def is_loop_header(block):
        """Return True when the stripped label starts with 'for ' or 'while '."""
        return block["label"].strip().startswith(("for ", "while "))

    # === 3. Merge linear chains ===
    def merge_blocks_in_place(block):
        """
        Merge ``block`` with its linear successors and process what follows.

        Merging stops at a branch (more than one successor), at a leaf, and
        whenever the current block or its single successor is a loop header.
        """
        while True:
            successors = block.get("successors", [])
            if len(successors) != 1:
                break
            only_successor = successors[0]
            if is_loop_header(block) or is_loop_header(only_successor):
                break
            # Absorb the successor, then loop again so that a whole chain
            # a -> b -> c collapses into one block, not just its first pair.
            block["label"] = block["label"] + "\n" + only_successor["label"]
            block["successors"] = only_successor.get("successors", [])

        remaining = block.get("successors", [])
        if remaining:
            block["successors"] = [merge_blocks_in_place(s) for s in remaining]
        return block

    # === 4. Process the top-level blocks ===
    if "blocks" in cfg:
        cfg["blocks"] = filter_connected_blocks(cfg["blocks"])

    if "blocks" in cfg and cfg["blocks"]:
        cfg["blocks"] = [merge_blocks_in_place(b) for b in cfg["blocks"]]

    # === 5. Recurse into nested functions and classes ===
    for group in ("functions", "classes"):
        for nested_cfg in cfg.get(group, []):
            process_cfg(nested_cfg)

    return cfg

#=============================
# 下面是示例读取并处理文件的逻辑
#=============================
import os
import json

# Post-process every regular file in llm_cfg/ with process_cfg and write the
# result under the same file name into merged_llm_cfg/.
for file in os.listdir("llm_cfg"):
    path = os.path.join("llm_cfg", file)
    if os.path.isfile(path):
        print("Processing", file)

        # A file that cannot be parsed as JSON is reported and skipped.
        with open(path, "r", encoding="utf-8") as f:
            try:
                llm_cfg = json.load(f)
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
                continue

        try:
            process_cfg(llm_cfg)
        except Exception as e:
            # Nothing is written for a file whose CFG could not be processed.
            print(f"Error processing {file}: {str(e)}")
        else:
            os.makedirs("merged_llm_cfg", exist_ok=True)
            output_path = os.path.join("merged_llm_cfg", file)
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(llm_cfg, f, indent=2, ensure_ascii=False)


Processing 95.json
Processing 110.json
Processing 94.json
Processing 38.json
Processing 21.json
Processing 121.json
Processing 72.json
Processing 67.json
Processing 4.json
Processing 74.json
Processing 116.json
Processing 40.json
Processing 14.json
Processing 7.json
Processing 17.json
Processing 107.json
Processing 89.json
Processing 80.json
Processing 98.json
Processing 106.json
Processing 96.json
Processing 123.json
Processing 28.json
Processing 45.json
Processing 13.json
Processing 2.json
Processing 90.json
Processing 34.json
Processing 37.json
Processing 82.json
Processing 105.json
Processing 122.json
Processing 24.json
Processing 73.json
Processing 79.json
Processing 118.json
Processing 87.json
Processing 54.json
Processing 59.json
Processing 85.json
Processing 97.json
Processing 71.json
Processing 8.json
Processing 10.json
Processing 26.json
Processing 33.json
Processing 88.json
Processing 104.json
Processing 39.json
Processing 102.json
Processing 3.json
Processing 42.json
Proces