### Easy Generate CFG

#### LLM

效果不太好，还是需要我们一步步进行处理！

step1 先将文件的嵌套类，方法给找到

In [1]:
import json
import os

from concurrent.futures import as_completed, ThreadPoolExecutor
from multiprocessing import cpu_count
from functools import partial

# 这里假设你有一个自己封装的 get_llm_answers 函数
# 请根据实际情况导入
from llm import get_llm_answers

from tqdm import tqdm


def get_step1_prompt(code_text: str, program_language: str):
    """
    生成第一步的 Prompt，用于让大模型识别所有类和函数（以及嵌套关系）。
    注意，这里去掉了原先的三重反引号。
    """
    code_lines = code_text.splitlines()
    code_lines_json = [{
        "line": i + 1,
        "code": line
    } for i, line in enumerate(code_lines)]

    prompt = f"""
You are given a piece of {program_language} code. Your goal is to find all the nested classes and methods in the code.

Please return the result in JSON format, your output should be the following format:

{{
    "name": "example_script",  // Name of the script or function
    "type": "CFG",
    "start_line": number,
    "end_line": number,
    "functions": [
      {{
        "name": "function_name",
        "type": "function",
        "start_line": number,
        "end_line": number,
        "functions": [],         // Nested functions
        "classes": []            // Nested classes
      }}
    ],
    "classes": [
      {{
        "name": "class_name",
        "type": "class",
        "start_line": number,
        "end_line": number,
        "functions": [           // Methods of the class
          {{
            "name": "method_name",
            "type": "function",
            "start_line": number,
            "end_line": number,
            "functions": [],     // Nested functions
            "classes": []        // Nested classes
          }}
        ]
      }}
    ]
}}

The code lines are:
{json.dumps(code_lines_json, indent=2)}

IMPORTANT: Make sure that the nested classes and methods are in the correct level. For example, if a function is nested in another class, the function should be in the nested class's functions list.
Besides, if a class is nested in another class, the class should be in the nested class's classes list.
"""
    return prompt.strip()


def find_nested_classes_and_methods(code_text: str, program_language: str):
    """
    调用 LLM，让其识别文件中的嵌套类、函数，并返回 JSON 结构。
    """
    prompt = get_step1_prompt(code_text, program_language)
    response = get_llm_answers(prompt, model_name="gpt-4o", require_json=True)
    nested_classes_and_methods = json.loads(response)
    return nested_classes_and_methods


def process_file_with_chain_of_thought(code_text: str, program_language: str):
    """
    读取代码文本 -> 让大模型找出所有嵌套类/函数 -> 返回 JSON 结构
    """
    step1_result = find_nested_classes_and_methods(code_text, program_language)
    return step1_result


def get_code_by_line_range(code_block, code):
    """
    取出 code_block 代表的行范围(start_line~end_line)，并且排除其内部嵌套类/函数的所有行。
    最终保留一组 { "lineno", "line" } 数组，方便后续做 CFG 时保留原始行号。
    """
    code_lines = code.splitlines()
    start_line = code_block["start_line"]
    # 这里根据实际情况决定 end_line 是否 +1
    end_line = code_block["end_line"]

    # 先把区间内每行加入集合
    line_set = set(range(start_line, end_line + 1))

    # 从集合中排除掉所有嵌套类/函数的行
    for func in code_block.get("functions", []):
        func_start_line = func.get("start_line", 0)
        func_end_line = func.get("end_line", 0)
        line_set.difference_update(range(func_start_line, func_end_line + 1))

    for cls in code_block.get("classes", []):
        cls_start_line = cls.get("start_line", 0)
        cls_end_line = cls.get("end_line", 0)
        line_set.difference_update(range(cls_start_line, cls_end_line + 1))

    # 剩余行号排序后，保存 { lineno, line } 到 simplified_code
    ordered_lines = sorted(line_set)
    simplified_code_array = []
    for lineno in ordered_lines:
        if 1 <= lineno <= len(code_lines):
            line_content = code_lines[lineno - 1]
        else:
            line_content = ""
        simplified_code_array.append({
            "lineno": lineno,
            "line": line_content
        })

    # 将数组存进 code_block 中
    code_block["simplified_code"] = simplified_code_array


def recursive_get_code_by_line_range(code_block, code):
    """
    递归地为当前块及其所有子类、子函数，计算并存储 simplified_code（带原始行号）。
    """
    get_code_by_line_range(code_block, code)
    for func in code_block.get("functions", []):
        recursive_get_code_by_line_range(func, code)
    for cls in code_block.get("classes", []):
        recursive_get_code_by_line_range(cls, code)


def print_simplified_code(code_block: dict, indent=0):
    """
    递归打印 simplified_code 的内容（仅用于调试或查看），保留行号和内容。
    """
    prefix = " " * indent
    simplified_lines = code_block.get("simplified_code", [])
    print(prefix + "简化后的代码 (行号 -> 内容):")
    for item in simplified_lines:
        print(prefix + f"{item['lineno']:4d}: {item['line']}")

    # 递归处理嵌套的类
    for class_block in code_block.get("classes", []):
        print(prefix + f"\n类 {class_block.get('name', '')}:")
        print_simplified_code(class_block, indent + 2)

    # 递归处理嵌套的函数
    for function_block in code_block.get("functions", []):
        print(prefix + f"\n函数 {function_block.get('name', '')}:")
        print_simplified_code(function_block, indent + 2)


def get_code_cfg_prompt(line_array, program_language):
    """
    给 LLM 的 Prompt，要求其基于该 line_array 生成 CFG 并返回 JSON。
    line_array 的格式形如:
      [
        {"lineno": 10, "line": "def foo():"},
        {"lineno": 11, "line": "..."},
        ...
      ]

    去掉了三重反引号。
    """
    code_as_json = json.dumps(line_array, indent=2)
    prompt = f"""
You will be given a piece of {program_language} code in the form of a JSON array. Each element has two fields:
  - "lineno": the original line number in the code
  - "line": the actual code text on that line

Your goal is to generate a Control Flow Graph (CFG) for this code and output the result as JSON. Here are the specific requirements:

1. Input Format:
   The code is presented as a JSON array of objects, each with "lineno" (int) and "line" (string). For example:

{code_as_json}

(This is the code you need to analyze.)

2. Definition of Basic Blocks:
   - A basic block can contain one or more “continuous and unbranched” statements.
   - Whenever you encounter a statement that causes a flow jump or branch (e.g., if-else, for-while, try-except-finally, with-as, match-case, break-continue-return, etc.), you should start a new basic block.

3. JSON Output Structure:
   Your output must strictly follow this JSON format, with no additional text or explanation:

"blocks": [
  {{
    "id": 1,
    "start_line": 1,
    "end_line": 1,
    "label": "... code of block ...",
    "successors": [
      {{
        "id": 2,
        "start_line": 2,
        "end_line": 3,
        "label": "... code of block ...",
        "successors": [...]
      }}
    ]
  }}
]

   - id: an integer starting from 1, incrementing by 1 for each block.
   - start_line: the first line number (from the input) that belongs to this block.
   - end_line: the last line number (from the input) that belongs to this block.
   - label: the exact code snippet (all lines) inside this block, unchanged from the input lines.
   - successors: a list of nested blocks that may execute after this block. Each item in this list is itself a block with the same structure: "id", "start_line", "end_line", "label", and "successors".

4. Branch Structures:
   - if-else: for if condition: ... else: ..., both the if body and the else body should be separate blocks. The if block’s "successors" should include both branches as nested block objects.
   - for-while: the loop body and the statement(s) following the loop should be in different blocks, with correct flow back to the loop condition if it continues, or forward to the next block if it terminates.
   - try-except-finally: each try, except, and finally block should be identified separately, showing normal and exceptional flows in successors.
   - with-as: the code inside the with statement and the code after the with block should be separate blocks.
   - match-case: treat each case body as a separate nested block in successors.
   - break-continue-return: these statements jump to outside of the loop, back to the loop condition, or end the function. If the function ends, successors can be an empty list.

5. Final Output:
   - Ensure your output is valid JSON (only one root object, containing "blocks").
   - Do not add extra text or explanation—only the JSON object itself.
   - Each block's start_line and end_line must map correctly back to the lineno values in the input JSON array.

Your task: Parse the input line-array, identify all basic blocks with correct start_line, end_line, and label, then produce a single JSON object with the structure above.
"""
    return prompt.strip()


def get_single_block_cfg(code_block, program_language):
    """
    调用 LLM 获取当前 code_block 的 CFG。这里的 simplified_code 是行号和文本的数组。
    """
    line_array = code_block.get("simplified_code", [])
    if not line_array:
        code_block["blocks"] = []
        return

    prompt = get_code_cfg_prompt(line_array, program_language)
    response = get_llm_answers(prompt, model_name="gpt-4o", require_json=True)
    blocks_json = json.loads(response)
    code_block["blocks"] = blocks_json.get("blocks", [])


def recursive_get_each_block_cfg(code_block, program_language):
    """
    递归获取每个代码块（文件级、类级、函数级）的 CFG。
    """
    get_single_block_cfg(code_block, program_language)
    for cls in code_block.get("classes", []):
        recursive_get_each_block_cfg(cls, program_language)
    for func in code_block.get("functions", []):
        recursive_get_each_block_cfg(func, program_language)


def main():
    """
    演示入口。根据实际需求修改 source_code_dir 和文件范围等。
    这里仅示例对 python 文件进行处理，并将结果输出到 JSON。
    """
    source_code_dir = "../../dataset/cangjie"
    target_dir = "llm_cfg_with_line_no"
    os.makedirs(target_dir, exist_ok=True)

    # 示例：我们只处理 0.py ~ 199.py 这 200 个文件
    files = []
    for i in range(200):
        py_file = os.path.join(source_code_dir, f"{i}.cj")
        out_file = os.path.join(target_dir, f"{i}.json")
        files.append((py_file, out_file))

    def process_single_file(source_file, target_file):
        if not os.path.exists(source_file):
            return
        if os.path.exists(target_file):
            # 如果目标文件已存在，可以选择跳过，或覆盖，按需决定
            return

        print("Processing", source_file)
        with open(source_file, 'r', encoding='utf-8') as f:
            code = f.read()

        # Step 1: 让大模型找出所有类 / 函数（包含嵌套）
        step1_result = process_file_with_chain_of_thought(code, "cangjie")

        # Step 2: 给每个类 / 函数（及顶层）提取 simplified code（排除嵌套代码行，但保留原始行号）
        recursive_get_code_by_line_range(step1_result, code)

        # Step 3: 对每个简化后的代码块，调用 LLM 生成 CFG
        recursive_get_each_block_cfg(step1_result, "cangjie")

        # 可选：去重逻辑，避免出现重复的 blocks
        def remove_duplicate_blocks(code_block):
            """
            删除同一层级中 (start_line, end_line) 相同的重复块，仅保留最前面一个
            """
            if "blocks" in code_block:
                seen = set()
                unique_blocks = []
                for blk in code_block["blocks"]:
                    s_line = blk.get("start_line", -1)
                    e_line = blk.get("end_line", -1)
                    key = (s_line, e_line)
                    if key not in seen:
                        seen.add(key)
                        unique_blocks.append(blk)
                code_block["blocks"] = unique_blocks

            for sub_cls in code_block.get("classes", []):
                remove_duplicate_blocks(sub_cls)
            for sub_func in code_block.get("functions", []):
                remove_duplicate_blocks(sub_func)

        remove_duplicate_blocks(step1_result)

        # 输出到 JSON
        with open(target_file, "w", encoding="utf-8") as fout:
            json.dump(step1_result, fout, indent=2, ensure_ascii=False)

    # 多线程处理所有文件（可单线程执行以更好查看输出）
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        futures = [executor.submit(process_single_file, src, tgt) for src, tgt in files]
        for _ in tqdm(as_completed(futures), total=len(files), desc="处理CFG文件"):
            pass

    # process_single_file(*files[0])


if __name__ == "__main__":
    main()


Processing ../../dataset/cangjie/0.cj
Processing ../../dataset/cangjie/1.cj
Processing ../../dataset/cangjie/2.cj
Processing ../../dataset/cangjie/3.cj
Processing ../../dataset/cangjie/5.cj
Processing ../../dataset/cangjie/4.cj
Processing ../../dataset/cangjie/7.cj
Processing ../../dataset/cangjie/6.cj
Processing ../../dataset/cangjie/8.cj
Processing ../../dataset/cangjie/9.cj
Processing ../../dataset/cangjie/10.cj
Processing ../../dataset/cangjie/11.cj
Processing ../../dataset/cangjie/13.cj
Processing ../../dataset/cangjie/14.cj
Processing ../../dataset/cangjie/12.cj
Processing ../../dataset/cangjie/15.cj
Processing ../../dataset/cangjie/16.cj
Processing ../../dataset/cangjie/17.cj
Processing ../../dataset/cangjie/18.cj
Processing ../../dataset/cangjie/21.cj
Processing ../../dataset/cangjie/19.cj
Processing ../../dataset/cangjie/22.cj
Processing ../../dataset/cangjie/20.cj
Processing ../../dataset/cangjie/24.cj
Processing ../../dataset/cangjie/23.cj
Processing ../../dataset/cangjie/25

处理CFG文件:   0%|          | 1/200 [00:13<44:46, 13.50s/it]

Processing ../../dataset/cangjie/112.cj


处理CFG文件:   1%|          | 2/200 [00:14<20:52,  6.32s/it]

Processing ../../dataset/cangjie/113.cj


处理CFG文件:   2%|▏         | 3/200 [00:15<12:14,  3.73s/it]

Processing ../../dataset/cangjie/114.cj
Processing ../../dataset/cangjie/115.cj


处理CFG文件:   2%|▎         | 5/200 [00:16<05:49,  1.79s/it]

Processing ../../dataset/cangjie/116.cj
Processing ../../dataset/cangjie/117.cj


处理CFG文件:   4%|▎         | 7/200 [00:16<03:38,  1.13s/it]

Processing ../../dataset/cangjie/118.cj


处理CFG文件:   4%|▍         | 8/200 [00:17<03:07,  1.02it/s]

Processing ../../dataset/cangjie/119.cj


处理CFG文件:   4%|▍         | 9/200 [00:17<02:35,  1.23it/s]

Processing ../../dataset/cangjie/120.cj


处理CFG文件:   6%|▌         | 11/200 [00:19<02:22,  1.33it/s]

Processing ../../dataset/cangjie/121.cj
Processing ../../dataset/cangjie/122.cj
Processing ../../dataset/cangjie/123.cj


处理CFG文件:   6%|▋         | 13/200 [00:20<02:05,  1.49it/s]

Processing ../../dataset/cangjie/124.cj


处理CFG文件:   7%|▋         | 14/200 [00:20<01:57,  1.58it/s]

Processing ../../dataset/cangjie/125.cj


处理CFG文件:   8%|▊         | 15/200 [00:21<01:46,  1.73it/s]

Processing ../../dataset/cangjie/126.cj


处理CFG文件:   8%|▊         | 16/200 [00:21<01:42,  1.80it/s]

Processing ../../dataset/cangjie/127.cj
Processing ../../dataset/cangjie/128.cj
Processing ../../dataset/cangjie/129.cj


处理CFG文件:  10%|█         | 20/200 [00:22<00:47,  3.75it/s]

Processing ../../dataset/cangjie/130.cj
Processing ../../dataset/cangjie/131.cj


处理CFG文件:  10%|█         | 21/200 [00:22<00:57,  3.10it/s]

Processing ../../dataset/cangjie/132.cj


处理CFG文件:  11%|█         | 22/200 [00:23<01:13,  2.42it/s]

Processing ../../dataset/cangjie/133.cj


处理CFG文件:  12%|█▏        | 23/200 [00:23<01:05,  2.70it/s]

Processing ../../dataset/cangjie/134.cj


处理CFG文件:  12%|█▎        | 25/200 [00:24<01:05,  2.69it/s]

Processing ../../dataset/cangjie/135.cj
Processing ../../dataset/cangjie/136.cj


处理CFG文件:  13%|█▎        | 26/200 [00:25<01:39,  1.75it/s]

Processing ../../dataset/cangjie/137.cj
Processing ../../dataset/cangjie/138.cj


处理CFG文件:  14%|█▍        | 28/200 [00:27<02:30,  1.14it/s]

Processing ../../dataset/cangjie/139.cj
Processing ../../dataset/cangjie/140.cj


处理CFG文件:  16%|█▌        | 31/200 [00:28<01:38,  1.71it/s]

Processing ../../dataset/cangjie/141.cj
Processing ../../dataset/cangjie/142.cj


处理CFG文件:  16%|█▌        | 32/200 [00:29<01:45,  1.59it/s]

Processing ../../dataset/cangjie/143.cj


处理CFG文件:  16%|█▋        | 33/200 [00:30<01:39,  1.67it/s]

Processing ../../dataset/cangjie/144.cj


处理CFG文件:  17%|█▋        | 34/200 [00:31<02:03,  1.35it/s]

Processing ../../dataset/cangjie/145.cj
Processing ../../dataset/cangjie/146.cj


处理CFG文件:  18%|█▊        | 36/200 [00:31<01:26,  1.91it/s]

Processing ../../dataset/cangjie/147.cj
Processing ../../dataset/cangjie/148.cj


处理CFG文件:  19%|█▉        | 38/200 [00:32<01:04,  2.52it/s]

Processing ../../dataset/cangjie/149.cj


处理CFG文件:  20%|█▉        | 39/200 [00:33<01:17,  2.08it/s]

Processing ../../dataset/cangjie/150.cj


处理CFG文件:  20%|██        | 40/200 [00:37<03:40,  1.38s/it]

Processing ../../dataset/cangjie/151.cj


处理CFG文件:  20%|██        | 41/200 [00:37<03:09,  1.19s/it]

Processing ../../dataset/cangjie/152.cj


处理CFG文件:  21%|██        | 42/200 [00:38<02:38,  1.00s/it]

Processing ../../dataset/cangjie/153.cj
Processing ../../dataset/cangjie/154.cj
Processing ../../dataset/cangjie/155.cj
Processing ../../dataset/cangjie/156.cj
Processing ../../dataset/cangjie/157.cj
Processing ../../dataset/cangjie/158.cj


处理CFG文件:  24%|██▍       | 48/200 [00:38<00:51,  2.95it/s]

Processing ../../dataset/cangjie/159.cj


处理CFG文件:  24%|██▍       | 49/200 [00:40<01:15,  2.00it/s]

Processing ../../dataset/cangjie/160.cj


处理CFG文件:  25%|██▌       | 50/200 [00:40<01:19,  1.90it/s]

Processing ../../dataset/cangjie/161.cj
Processing ../../dataset/cangjie/162.cj


处理CFG文件:  26%|██▌       | 52/200 [00:41<01:01,  2.39it/s]

Processing ../../dataset/cangjie/163.cj


处理CFG文件:  26%|██▋       | 53/200 [00:41<01:08,  2.14it/s]

Processing ../../dataset/cangjie/164.cj


处理CFG文件:  27%|██▋       | 54/200 [00:43<01:38,  1.48it/s]

Processing ../../dataset/cangjie/165.cj


处理CFG文件:  28%|██▊       | 55/200 [00:43<01:36,  1.51it/s]

Processing ../../dataset/cangjie/166.cj


处理CFG文件:  28%|██▊       | 56/200 [00:44<01:31,  1.57it/s]

Processing ../../dataset/cangjie/167.cj


处理CFG文件:  28%|██▊       | 57/200 [00:44<01:16,  1.88it/s]

Processing ../../dataset/cangjie/168.cj


处理CFG文件:  30%|██▉       | 59/200 [00:45<00:59,  2.37it/s]

Processing ../../dataset/cangjie/169.cj
Processing ../../dataset/cangjie/170.cj


处理CFG文件:  30%|███       | 60/200 [00:45<00:46,  3.00it/s]

Processing ../../dataset/cangjie/171.cj


处理CFG文件:  32%|███▏      | 63/200 [00:46<00:32,  4.19it/s]

Processing ../../dataset/cangjie/172.cj
Processing ../../dataset/cangjie/173.cj
Processing ../../dataset/cangjie/174.cj
Processing ../../dataset/cangjie/175.cj


处理CFG文件:  32%|███▎      | 65/200 [00:47<00:50,  2.67it/s]

Processing ../../dataset/cangjie/176.cj


处理CFG文件:  33%|███▎      | 66/200 [00:48<01:25,  1.57it/s]

Processing ../../dataset/cangjie/177.cj


处理CFG文件:  34%|███▍      | 69/200 [00:50<01:02,  2.09it/s]

Processing ../../dataset/cangjie/178.cj
Processing ../../dataset/cangjie/179.cj
Processing ../../dataset/cangjie/180.cj


处理CFG文件:  35%|███▌      | 70/200 [00:50<01:04,  2.02it/s]

Processing ../../dataset/cangjie/181.cj
Processing ../../dataset/cangjie/182.cj


处理CFG文件:  36%|███▌      | 72/200 [00:51<00:53,  2.38it/s]

Processing ../../dataset/cangjie/183.cj
Processing ../../dataset/cangjie/184.cj


处理CFG文件:  37%|███▋      | 74/200 [00:51<00:39,  3.19it/s]

Processing ../../dataset/cangjie/185.cj


处理CFG文件:  38%|███▊      | 77/200 [00:52<00:37,  3.24it/s]

Processing ../../dataset/cangjie/186.cj
Processing ../../dataset/cangjie/187.cj
Processing ../../dataset/cangjie/188.cj


处理CFG文件:  39%|███▉      | 78/200 [00:53<00:49,  2.45it/s]

Processing ../../dataset/cangjie/189.cj


处理CFG文件:  40%|███▉      | 79/200 [00:53<00:45,  2.66it/s]

Processing ../../dataset/cangjie/190.cj


处理CFG文件:  40%|████      | 80/200 [00:54<00:47,  2.53it/s]

Processing ../../dataset/cangjie/191.cj


处理CFG文件:  40%|████      | 81/200 [00:55<01:06,  1.78it/s]

Processing ../../dataset/cangjie/192.cj


处理CFG文件:  41%|████      | 82/200 [00:55<00:58,  2.03it/s]

Processing ../../dataset/cangjie/193.cj


处理CFG文件:  42%|████▏     | 83/200 [00:55<00:48,  2.43it/s]

Processing ../../dataset/cangjie/194.cj


处理CFG文件:  42%|████▏     | 84/200 [00:56<00:59,  1.96it/s]

Processing ../../dataset/cangjie/195.cj


处理CFG文件:  42%|████▎     | 85/200 [00:56<00:51,  2.25it/s]

Processing ../../dataset/cangjie/196.cj


处理CFG文件:  43%|████▎     | 86/200 [00:57<01:07,  1.68it/s]

Processing ../../dataset/cangjie/197.cj


处理CFG文件:  44%|████▍     | 88/200 [00:58<00:47,  2.35it/s]

Processing ../../dataset/cangjie/198.cj
Processing ../../dataset/cangjie/199.cj


处理CFG文件:  98%|█████████▊| 197/200 [05:16<00:35, 11.82s/it]

### LLM生成的代码可能可以合并

In [1]:
import os
import json

def process_cfg(cfg):
    """
    Process a CFG that uses a *nested successors* structure.
    We will:
      1. Remove unreachable blocks (only keep blocks reachable from the root).
      2. Separate loop headers from loop bodies (if desired).
      3. Merge consecutive linear blocks that have only one successor and one predecessor.
      4. Recursively process functions/classes if they exist.
    """

    #=== 1. 过滤不可达节点: 我们假设 blocks[0] 是 CFG 的根节点 ===#
    def filter_connected_blocks(blocks):
        """
        Given a list of blocks (in nested form), return only those reachable
        from the 'root' block (which we assume is blocks[0]) by traversing
        nested successors.
        """

        visited_ids = set()
        # 为了方便在后面快速通过 id 找到对应的 block 对象，我们先做一个 {id: block} 的映射
        # 同时存储所有 block 的引用（因为是嵌套的，需要把内部 successors 里的 block 也加入到此映射）
        id_to_block = {}

        def collect_all_blocks(block_list):
            for b in block_list:
                id_to_block[b["id"]] = b
                if "successors" in b:
                    collect_all_blocks(b["successors"])

        collect_all_blocks(blocks)

        # 深度优先搜索，查找所有可达节点
        def dfs(block):
            if block["id"] in visited_ids:
                return
            visited_ids.add(block["id"])
            for succ_block in block.get("successors", []):
                dfs(succ_block)

        # 假定 blocks[0] 是 root
        if blocks:
            root_block = blocks[0]
            dfs(root_block)

        # 现在我们只保留被 visited_ids 覆盖到的节点，并且需要“剪枝”不在 visited_ids 中的后继
        def filter_nested(block_list):
            """在嵌套结构中移除不可达节点。"""
            filtered = []
            for b in block_list:
                if b["id"] in visited_ids:
                    # 递归处理 successors
                    new_successors = filter_nested(b.get("successors", []))
                    filtered.append({
                        "id": b["id"],
                        "label": b["label"],
                        "successors": new_successors
                    })
            return filtered

        return filter_nested(blocks)

    #=== 2. 判断循环头（示例仅以 "for" / "while" 关键字简单判断） ===#
    def is_loop_header(block):
        """
        A naive check: if the block's label starts with 'for' or 'while'
        (or contains those keywords in a relevant way), treat it as a loop header.
        """
        code_str = block["label"].strip()
        if code_str.startswith("for ") or code_str.startswith("while "):
            return True
        return False

    #=== 3. 合并逻辑（循环头和循环体暂时不做复杂拆分，仅演示思路） ===#
    # 在嵌套结构中，“连续的线性块”通常表现为一个 block 有且仅有 1 个 successor，且该 successor 只有该一个 predecessor。
    # 但是在嵌套结构里，我们无法简易地统计 predecessor 数量，需要自行设计。
    #
    # 示例逻辑：深度遍历 + 遇到 loop header 不合并；遇到多 successor 不合并；否则合并到下一个 block。
    def merge_blocks_in_place(block):
        """
        递归地合并一个 block 的线性后继。
        当遇到循环头或分支时，不再合并。
        """
        successors = block.get("successors", [])
        if not successors:
            # 无后继，直接返回
            return block

        # 如果存在多个 successor，说明是分支点，不合并任何后继
        if len(successors) > 1:
            # 递归处理每个 successor
            for i, succ in enumerate(successors):
                successors[i] = merge_blocks_in_place(succ)
            block["successors"] = successors
            return block

        # 如果只有 1 个 successor，则尝试合并
        single_succ = successors[0]
        if is_loop_header(block):
            # 如果当前 block 是 loop header，不向后合并，只是递归处理后继
            block["successors"][0] = merge_blocks_in_place(single_succ)
            return block
        if is_loop_header(single_succ):
            # 如果后继是 loop header，也不合并，只是递归处理后继
            block["successors"][0] = merge_blocks_in_place(single_succ)
            return block

        # 到这里，意味着我们可以把 single_succ 跟当前块合并
        block["label"] = block["label"] + "\n" + single_succ["label"]
        # 把 single_succ 的 successors 赋给当前块
        block["successors"] = single_succ.get("successors", [])

        # 递归处理“合并后”依然存在的后继（可能还是一个 list）
        if block["successors"]:
            new_succ_list = []
            for succ in block["successors"]:
                new_succ_list.append(merge_blocks_in_place(succ))
            block["successors"] = new_succ_list

        return block

    #=== 4. 针对最外层的 blocks 做处理 ===#
    #  4.1 过滤掉不可达节点
    if "blocks" in cfg:
        cfg["blocks"] = filter_connected_blocks(cfg["blocks"])

    #  4.2 合并块：因为是多 block，需要逐个处理，然后再把处理结果放回 cfg["blocks"] 
    #      同时，新的根块可能因为合并也会改变，所以我们需要重新搜集并替换
    if "blocks" in cfg and cfg["blocks"]:
        merged = []
        for b in cfg["blocks"]:
            merged_block = merge_blocks_in_place(b)
            merged.append(merged_block)
        cfg["blocks"] = merged

    #=== 5. 递归处理 functions 与 classes ===#
    if "functions" in cfg:
        for func in cfg["functions"]:
            process_cfg(func)

    if "classes" in cfg:
        for cls in cfg["classes"]:
            process_cfg(cls)

    return cfg

#=============================
# 下面是示例读取并处理文件的逻辑
#=============================
import os
import json

for file in os.listdir("llm_cfg_with_line_no"):
    path = os.path.join("llm_cfg_with_line_no", file)
    if not os.path.isfile(path):
        continue

    print("Processing", file)
    with open(path, "r", encoding="utf-8") as f:
        try:
            llm_cfg = json.load(f)
        except Exception as e:
            print(f"Error loading {file}: {str(e)}")
            continue

    try:
        process_cfg(llm_cfg)
    except Exception as e:
        print(f"Error processing {file}: {str(e)}")
        continue

    os.makedirs("merged_llm_cfg_with_line_no", exist_ok=True)
    output_path = os.path.join("merged_llm_cfg_with_line_no", file)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(llm_cfg, f, indent=2, ensure_ascii=False)


Processing 95.json
Processing 110.json
Processing 160.json
Error processing 160.json: 'id'
Processing 94.json
Processing 38.json
Processing 21.json
Processing 187.json
Processing 121.json
Processing 72.json
Processing 132.json
Processing 67.json
Processing 149.json
Processing 147.json
Processing 135.json
Processing 4.json
Processing 74.json
Processing 116.json
Processing 40.json
Processing 178.json
Processing 14.json
Processing 7.json
Processing 166.json
Processing 31.json
Processing 17.json
Processing 167.json
Processing 107.json
Processing 156.json
Processing 89.json
Processing 183.json
Processing 193.json
Processing 176.json
Processing 162.json
Processing 80.json
Processing 136.json
Processing 171.json
Processing 98.json
Processing 106.json
Processing 141.json
Processing 133.json
Processing 152.json
Processing 96.json
Processing 123.json
Processing 28.json
Processing 150.json
Processing 45.json
Processing 13.json
Processing 169.json
Processing 175.json
Processing 198.json
Processing