In [3]:
"""
Multi-threaded script for Python code analysis:

1) LLM-based AST generation (via CFG + partial block approach):
   - parse_cfg_structure() => get line ranges of classes/functions
   - build_ast_from_cfg() => recursively exclude child function/class lines from the parent block,
     parse only the remaining lines, then insert function/class placeholders back in the correct position.
   - Each node has global-level start_token/end_token, thanks to a single global tokenize_code_with_lines().

2) Tree-sitter-based static AST:
   - generate_tree_sitter_ast() => returns {type, label, children}.

3) Compare snippet-level labels (optional).

4) Save both ASTs as JSON.

5) Multi-file parallel processing with ThreadPoolExecutor.

See the "llm_build_ast" function's prompt – we keep it intact as requested.
"""

import os
import sys
import json
from typing import Any, Dict, List, Tuple
import re
import concurrent.futures
from multiprocessing import cpu_count

from tqdm import tqdm

###############################################################################
#                           LLM interface (stub)                              #
###############################################################################
"""
Replace 'get_llm_answers' with your actual LLM API call or function.
Here we just provide a stub or minimal placeholder.
"""
try:
    from llm import get_llm_answers
except ImportError:
    # The project-local ``llm`` module could not be imported: install an
    # offline stand-in with the same call signature so the rest of the
    # pipeline can still run end to end.
    def get_llm_answers(prompt, model_name="", require_json=False, temperature=0):
        """Stand-in LLM call: ignores every argument and returns the text of an empty JSON object."""
        return "{}"


###############################################################################
#                     Tree-sitter initialization (Python)                     #
##############################################################################


###############################################################################
#   4) 保持原样: llm_build_ast_from_tokens() (prompt 不变, 勿改)              #
###############################################################################
def llm_build_ast(code: str) -> Dict[str, Any]:
    """
    Ask the LLM to produce a JSON AST for a source snippet.

    The snippet is embedded verbatim in the prompt, the reply is parsed with
    ``json.loads`` and the parsed object is returned.

    Args:
        code: Source text to analyse.

    Returns:
        The parsed JSON object. When the LLM call raises, the reply is not
        valid JSON, or the reply is valid JSON but not an object, the
        placeholder ``{"type": "ErrorNode", "code": code, "children": []}``
        is returned instead, so callers always receive a dict.
    """

    # Node types the LLM is allowed to emit.
    allowed_types = [
        "aliased_import", "argument_list", "as_pattern", "as_pattern_target", "assert_statement",
        "assignment", "attribute", "augmented_assignment", "await", "binary_operator", "block",
        "boolean_operator", "break_statement", "call", "class_definition", "comment",
        "comparison_operator", "concatenated_string", "conditional_expression",
        "continue_statement", "decorated_definition", "decorator", "default_parameter",
        "delete_statement", "dictionary", "dictionary_comprehension", "dictionary_splat",
        "dictionary_splat_pattern", "dotted_name", "elif_clause", "ellipsis", "else_clause",
        "escape_interpolation", "escape_sequence", "except_clause", "expression_list",
        "expression_statement", "false", "finally_clause", "float", "for_in_clause", "for_statement",
        "format_specifier", "function_definition", "future_import_statement",
        "generator_expression", "generic_type", "global_statement", "identifier", "if_clause",
        "if_statement", "import_from_statement", "import_prefix", "import_statement", "integer",
        "interpolation", "keyword_argument", "keyword_separator", "lambda", "lambda_parameters",
        "line_continuation", "list", "list_comprehension", "list_splat", "list_splat_pattern",
        "module", "named_expression", "none", "nonlocal_statement", "not_operator", "pair",
        "parameters", "parenthesized_expression", "pass_statement", "pattern_list", "raise_statement",
        "relative_import", "return_statement", "set", "set_comprehension", "slice", "string",
        "string_content", "string_end", "string_start", "subscript", "true", "try_statement",
        "tuple", "tuple_pattern", "type", "type_parameter", "typed_default_parameter", "typed_parameter",
        "unary_operator", "union_type", "while_statement", "with_clause", "with_item", "with_statement",
        "yield"
    ]
    allowed_types_str = ", ".join(allowed_types)

    top_level_instruction = "Exactly one 'module' node can appear at the root. Use 'block' if nested.\n"

    # The prompt wording is deliberately left untouched.
    prompt = (
        "Below is the code snippet:\n"
        f"{code}\n\n"
        "Create a JSON-based AST with these fields:\n"
        f"- 'type': must be in {{{allowed_types_str}}}\n"
        "- 'code'\n"
        "- 'children' (array)\n\n"
        "Return valid JSON only.\n"
    )

    prompt += "\nAt the root, use 'module'. Do not nest multiple 'module'.\n" + top_level_instruction

    # Placeholder handed back on any failure; it records the offending snippet.
    error_node = {
        "type": "ErrorNode",
        "code": code,
        "children": []
    }

    try:
        llm_output = get_llm_answers(
            prompt,
            model_name="gpt-4o",
            require_json=True,
            temperature=0
        )
        ast_dict = json.loads(llm_output)
    except Exception as e:
        # Best-effort boundary: one failed snippet must not abort a batch run.
        print(f"[Error] llm_build_ast: {e}")
        return error_node

    # json.loads may legally yield a list/str/number; callers index the result
    # as a dict, so anything else is reported as a failure.
    if not isinstance(ast_dict, dict):
        print(f"[Error] llm_build_ast: expected a JSON object, got {type(ast_dict).__name__}")
        return error_node

    return ast_dict
    
def generate_llm_ast(code: str) -> Dict[str,Any]:
    """Return the LLM-generated AST for ``code`` by delegating to ``llm_build_ast``."""
    return llm_build_ast(code)


###############################################################################
#     8) 单文件处理: 生成LLM AST, Tree-sitter AST, 存JSON,可选对比             #
###############################################################################
def process_llm_ast(code: str, file_path: str) -> Dict[str,Any]:
    """
    Return the LLM AST for ``code``, using an on-disk JSON cache.

    The cache file lives in the ``llm_ast`` directory (created on demand,
    relative to the current working directory) and is named after the base
    name of ``file_path`` with ``.json`` appended. If that file already exists
    its contents are loaded and returned; otherwise the AST is generated with
    ``generate_llm_ast``, written to the cache file and returned.
    """
    cache_dir = "llm_ast"
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, os.path.basename(file_path) + ".json")

    # Cache miss: generate, persist, and hand back the fresh result.
    if not os.path.exists(cache_path):
        fresh_ast = generate_llm_ast(code)
        with open(cache_path, "w", encoding="utf-8") as out_file:
            json.dump(fresh_ast, out_file, indent=4, ensure_ascii=False)
        return fresh_ast

    # Cache hit: reuse whatever was stored by an earlier run.
    with open(cache_path, "r", encoding="utf-8") as in_file:
        return json.load(in_file)

def process_single_file(file_path: str):
    """
    Read one source file and obtain its LLM AST via ``process_llm_ast``.

    A file that cannot be read is reported on stdout and skipped. Nothing is
    returned in either case.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as src:
            source_text = src.read()
    except Exception as err:
        print(f"[Error reading {file_path}]: {err}")
    else:
        # Only reached when the read succeeded; failures here propagate.
        process_llm_ast(source_text, file_path)

###############################################################################
#                            9) main() 并行处理                               #
###############################################################################
def main():
    """
    Process up to 200 ``.py`` files from the source directory in a thread pool.

    Each file is handed to ``process_single_file``. A tqdm progress bar is
    advanced as tasks finish, and any exception raised inside a worker is
    printed together with the file it belongs to instead of being discarded.
    """
    source_dir = "../dataset/python"  # change to your actual source directory
    if not os.path.isdir(source_dir):
        print(f"[Error] Directory {source_dir} does not exist.")
        return

    # Collect the .py files (capped at the first 200 directory entries that match).
    files = [f for f in os.listdir(source_dir) if f.endswith(".py")][:200]
    print(f"Found {len(files)} Python files in {source_dir}.")

    with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # Remember which path each future belongs to so failures can be attributed.
        future_to_path = {}
        for fname in files:
            full_path = os.path.join(source_dir, fname)
            future_to_path[executor.submit(process_single_file, full_path)] = full_path

        with tqdm(total=len(files), desc="Processing files") as pbar:
            for future in concurrent.futures.as_completed(future_to_path):
                # Future.exception() returns the worker's exception (or None)
                # without re-raising it, so one bad file cannot abort the batch.
                exc = future.exception()
                if exc is not None:
                    print(f"[Error processing {future_to_path[future]}]: {exc}")
                pbar.update(1)

if __name__ == "__main__":
    main()

Found 200 Python files in ../dataset/python.


Processing files:  39%|███▉      | 78/200 [00:36<00:23,  5.12it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 754 column 27 (char 21951)


Processing files:  40%|███▉      | 79/200 [00:36<00:30,  3.97it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 737 column 1 (char 30758)


Processing files:  41%|████      | 82/200 [00:38<00:58,  2.01it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 670 column 38 (char 23347)


Processing files:  42%|████▎     | 85/200 [00:39<00:45,  2.50it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 612 column 27 (char 22599)


Processing files:  44%|████▎     | 87/200 [00:40<00:34,  3.30it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 404 column 23 (char 18805)


Processing files:  44%|████▍     | 88/200 [00:41<00:57,  1.95it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 671 column 39 (char 24094)
[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 608 column 16 (char 19523)


Processing files:  46%|████▋     | 93/200 [00:42<00:28,  3.71it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 748 column 39 (char 24322)


Processing files:  47%|████▋     | 94/200 [00:42<00:28,  3.67it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 741 column 22 (char 30081)


Processing files:  48%|████▊     | 96/200 [00:42<00:22,  4.62it/s]

[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 715 column 1 (char 28015)


Processing files:  50%|█████     | 101/200 [00:43<00:18,  5.48it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 611 column 35 (char 30174)
[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 668 column 1 (char 31231)
[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 838 column 24 (char 26406)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 439 column 23 (char 18629)


Processing files:  53%|█████▎    | 106/200 [00:44<00:11,  8.01it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 552 column 18 (char 19166)
[Error] llm_build_ast_from_tokens: Expecting value: line 464 column 18 (char 17739)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 189 column 35 (char 15434)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 653 column 27 (char 27173)


Processing files:  55%|█████▌    | 110/200 [00:44<00:10,  8.70it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 698 column 19 (char 19304)
[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 483 column 1 (char 23411)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 558 column 19 (char 22841)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 780 column 39 (char 29669)
[Error] llm_build_ast_from_tokens: Expecting value: line 724 column 70 (char 32044)
[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 697 column 22 (char 24466)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 771 column 23 (char 27452)


Processing files:  57%|█████▊    | 115/200 [00:45<00:07, 11.43it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 3 column 11 (char 32)


Processing files:  58%|█████▊    | 117/200 [00:45<00:10,  7.59it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 767 column 43 (char 25867)


Processing files:  60%|█████▉    | 119/200 [00:45<00:11,  7.02it/s]

[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 603 column 1 (char 27447)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 437 column 27 (char 18771)


Processing files:  60%|██████    | 121/200 [00:46<00:14,  5.30it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 734 column 18 (char 28646)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 645 column 45 (char 32639)


Processing files:  62%|██████▏   | 124/200 [00:47<00:12,  5.89it/s]

[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 774 column 44 (char 23555)
[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 604 column 1 (char 21276)


Processing files:  65%|██████▌   | 130/200 [00:47<00:08,  7.86it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 678 column 43 (char 24551)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 800 column 51 (char 26353)


Processing files:  66%|██████▋   | 133/200 [00:48<00:09,  6.73it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 769 column 23 (char 21704)
[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 519 column 1 (char 20776)


Processing files:  68%|██████▊   | 136/200 [00:48<00:08,  7.65it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 764 column 35 (char 23188)
[Error] llm_build_ast_from_tokens: Expecting value: line 633 column 28 (char 22431)


Processing files:  70%|███████   | 140/200 [00:49<00:10,  5.59it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 517 column 23 (char 22086)


Processing files:  72%|███████▏  | 144/200 [00:49<00:08,  6.47it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 689 column 52 (char 35654)


Processing files:  73%|███████▎  | 146/200 [00:50<00:09,  5.72it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 571 column 29 (char 34876)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 687 column 11 (char 25443)


Processing files:  74%|███████▍  | 148/200 [00:50<00:07,  6.99it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 717 column 30 (char 26683)


Processing files:  75%|███████▌  | 150/200 [00:51<00:08,  5.65it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 475 column 27 (char 18846)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 181 column 23 (char 17037)


Processing files:  76%|███████▋  | 153/200 [00:52<00:11,  4.20it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 874 column 39 (char 34735)
[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 424 column 18 (char 16567)


Processing files:  78%|███████▊  | 156/200 [00:53<00:13,  3.19it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 619 column 27 (char 20574)


Processing files:  79%|███████▉  | 158/200 [00:54<00:18,  2.28it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 550 column 23 (char 19455)


Processing files:  81%|████████  | 162/200 [01:00<00:42,  1.11s/it]

[Error] llm_build_ast_from_tokens: Expecting value: line 782 column 30 (char 25588)


Processing files:  82%|████████▏ | 163/200 [01:00<00:32,  1.14it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 683 column 30 (char 24758)


Processing files:  82%|████████▏ | 164/200 [01:01<00:36,  1.02s/it]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 614 column 1 (char 23733)


Processing files:  82%|████████▎ | 165/200 [01:02<00:35,  1.00s/it]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 698 column 27 (char 20550)


Processing files:  84%|████████▎ | 167/200 [01:03<00:19,  1.70it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 533 column 26 (char 18411)


Processing files:  84%|████████▍ | 168/200 [01:03<00:20,  1.56it/s]

[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 709 column 12 (char 22972)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 725 column 23 (char 23890)


Processing files:  86%|████████▌ | 171/200 [01:04<00:10,  2.83it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 655 column 31 (char 26727)


Processing files:  86%|████████▌ | 172/200 [01:04<00:10,  2.58it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 757 column 15 (char 27927)


Processing files:  86%|████████▋ | 173/200 [01:05<00:09,  2.81it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 767 column 22 (char 34551)


Processing files:  88%|████████▊ | 176/200 [01:06<00:07,  3.18it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 736 column 35 (char 29086)
[Error] llm_build_ast_from_tokens: Expecting value: line 800 column 38 (char 32946)


Processing files:  88%|████████▊ | 177/200 [01:06<00:07,  3.26it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 737 column 1 (char 37263)


Processing files:  90%|████████▉ | 179/200 [01:07<00:10,  2.06it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 830 column 15 (char 23984)


Processing files:  90%|█████████ | 181/200 [01:08<00:09,  2.06it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 826 column 1 (char 30599)


Processing files:  91%|█████████ | 182/200 [01:09<00:09,  1.98it/s]

[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 682 column 1 (char 25017)


Processing files:  92%|█████████▏| 183/200 [01:10<00:11,  1.54it/s]

[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 617 column 1 (char 19151)
[Error] llm_build_ast_from_tokens: Expecting property name enclosed in double quotes: line 753 column 22 (char 26087)
[Error] llm_build_ast_from_tokens: Expecting value: line 597 column 20 (char 22268)


Processing files:  93%|█████████▎| 186/200 [01:10<00:05,  2.67it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 693 column 35 (char 21861)


Processing files:  94%|█████████▍| 188/200 [01:12<00:06,  1.79it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 612 column 50 (char 27403)


Processing files:  95%|█████████▌| 190/200 [01:13<00:05,  1.77it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 611 column 20 (char 27146)


Processing files:  96%|█████████▌| 191/200 [01:14<00:04,  1.86it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 803 column 27 (char 32923)


Processing files:  96%|█████████▌| 192/200 [01:15<00:04,  1.65it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 179 column 19 (char 17366)


Processing files:  96%|█████████▋| 193/200 [01:18<00:09,  1.32s/it]

[Error] llm_build_ast_from_tokens: Expecting ',' delimiter: line 689 column 1 (char 34258)


Processing files:  97%|█████████▋| 194/200 [01:18<00:06,  1.08s/it]

[Error] llm_build_ast_from_tokens: Expecting value: line 685 column 1 (char 28781)


Processing files:  98%|█████████▊| 195/200 [01:19<00:05,  1.00s/it]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 783 column 39 (char 28010)


Processing files:  98%|█████████▊| 196/200 [01:20<00:03,  1.07it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 574 column 15 (char 20502)


Processing files:  99%|█████████▉| 198/200 [01:20<00:01,  1.89it/s]

[Error] llm_build_ast_from_tokens: Expecting value: line 836 column 58 (char 35360)
[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 701 column 35 (char 26087)


Processing files: 100%|█████████▉| 199/200 [01:26<00:01,  1.80s/it]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 631 column 15 (char 20621)


Processing files: 100%|██████████| 200/200 [01:29<00:00,  2.23it/s]

[Error] llm_build_ast_from_tokens: Unterminated string starting at: line 752 column 39 (char 29440)





In [12]:
import os
import json

llm_ast_dir = "llm_ast"
source_dir = "../dataset/python"

# Line-count buckets (inclusive bounds) with per-bucket tallies of
# ErrorNode results and of all files seen.
length_categories = {
    "short": {"range": (0, 50), "error_count": 0, "total_count": 0},
    "medium": {"range": (51, 200), "error_count": 0, "total_count": 0},
    "long": {"range": (201, 1001), "error_count": 0, "total_count": 0}
}

json_suffix = ".json"


def find_length_category(num_lines):
    """Return the name of the bucket whose inclusive range contains ``num_lines``, or ``None``."""
    for category, data in length_categories.items():
        min_len, max_len = data["range"]
        if min_len <= num_lines <= max_len:
            return category
    return None


# Files whose line count falls outside every bucket (e.g. more than 1001 lines).
uncategorized_count = 0

if not os.path.isdir(llm_ast_dir):
    print(f"AST directory not found: {llm_ast_dir}")
else:
    for file in os.listdir(llm_ast_dir):
        if not file.endswith(json_suffix):
            continue

        with open(os.path.join(llm_ast_dir, file), "r", encoding="utf-8") as f:
            llm_ast = json.load(f)

        # Strip only the trailing ".json"; str.replace would also remove any
        # earlier occurrence inside the file name.
        source_file = os.path.join(source_dir, file[:-len(json_suffix)])
        if not os.path.exists(source_file):
            print(f"Source file not found: {source_file}")
            continue

        # Count the lines of the source file without loading it whole.
        with open(source_file, "r", encoding="utf-8") as src_f:
            num_lines = sum(1 for _ in src_f)

        category = find_length_category(num_lines)
        if category is None:
            uncategorized_count += 1
            continue

        data = length_categories[category]
        data["total_count"] += 1
        # Stored ASTs may lack a "type" key or not be objects at all; only a
        # dict explicitly typed "ErrorNode" counts as an error.
        if isinstance(llm_ast, dict) and llm_ast.get("type") == "ErrorNode":
            data["error_count"] += 1

# Print the per-bucket summary.
for category, data in length_categories.items():
    print(f"{category.capitalize()}文件（{data['range'][0]}-{data['range'][1]}行）中的ErrorNode数量: {data['error_count']}/{data['total_count']}")

if uncategorized_count:
    print(f"Files outside all length ranges: {uncategorized_count}")

Short文件（0-50行）中的ErrorNode数量: 7/53
Medium文件（51-200行）中的ErrorNode数量: 39/109
Long文件（201-1001行）中的ErrorNode数量: 35/37


In [None]:
import os
import json
from typing import Any, Dict, List, Tuple
import re
import concurrent.futures
from multiprocessing import cpu_count
from tqdm import tqdm

###############################################################################
#                           LLM interface (stub)                              #
###############################################################################
try:
    from llm import get_llm_answers
except ImportError:
    # The project-local ``llm`` module could not be imported: install an
    # offline stand-in with the same call signature.
    def get_llm_answers(prompt, model_name="", require_json=False, temperature=0):
        """Stand-in LLM call: ignores every argument and returns the text of an empty JSON object."""
        return "{}"

###############################################################################
#                   1) 全局分词，含行号 -> global_tokens                       #
###############################################################################
def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """
    使用正则分词，直接在原始代码上通过 finditer() 获取匹配位置，
    并保留换行符，让 LLM 能识别多行结构。
    返回形式: [(start_offset, end_offset, token_text, line_number), ...]
    """
    token_pattern = (
        r'[A-Za-z_]\w*|'  # 标识符
        r'[0-9]+|'        # 数字
        r'"[^"]*"|'       # 双引号字符串
        r"'[^']*'|"       # 单引号字符串
        r'\\[ntr]|'       # 转义符 \n \t \r
        r'//.*|'          # 单行注释 (如 C++/Java/JS 风格)
        r'/\*.*?\*/|'     # 多行注释 (如 C 风格)
        r'\n|\r|\t|'      # 换行/回车/制表符
        r'\S'             # 其他符号(如 +, -, {, }, 以及任何其它非空白字符)
    )

    tokens_with_offset = []
    lines = code.splitlines(keepends=True)
    current_line = 1
    current_pos = 0
    
    for match in re.finditer(token_pattern, code, re.MULTILINE | re.DOTALL):
        tk = match.group(0)
        start_offset, end_offset = match.span()
        
        # 计算当前token所在行号
        while current_line <= len(lines) and current_pos + len(lines[current_line-1]) <= start_offset:
            current_pos += len(lines[current_line-1])
            current_line += 1
            
        tokens_with_offset.append((start_offset, end_offset, tk, current_line))

    return tokens_with_offset

###############################################################################
#   2) 直接调用 LLM 生成 AST (无需 CFG 结构)                                 #
###############################################################################
def llm_build_ast_from_tokens(tokens_with_offset: List[Tuple[int, int, str]], top_level=True) -> Dict[str, Any]:
    """
    Send an indexed token listing to the LLM and parse its JSON AST reply.

    Args:
        tokens_with_offset: Token tuples; only element ``[2]`` (the token
            text) is put into the prompt, numbered by list position.
        top_level: Selects which scoping sentence is appended to the prompt
            (root-level wording when true, nested-block wording otherwise).

    Returns:
        Whatever ``json.loads`` yields for the LLM reply. If the call or the
        parsing raises, the error is printed and an ``ErrorNode`` placeholder
        with ``start_token``/``end_token`` of -1 is returned.
    """
    token_listing = "\n".join(
        f"{idx}: {tok[2]}" for idx, tok in enumerate(tokens_with_offset)
    )

    # Node types the LLM is allowed to emit.
    allowed_types = [
        "aliased_import", "argument_list", "as_pattern", "as_pattern_target", "assert_statement",
        "assignment", "attribute", "augmented_assignment", "await", "binary_operator", "block",
        "boolean_operator", "break_statement", "call", "class_definition", "comment",
        "comparison_operator", "concatenated_string", "conditional_expression",
        "continue_statement", "decorated_definition", "decorator", "default_parameter",
        "delete_statement", "dictionary", "dictionary_comprehension", "dictionary_splat",
        "dictionary_splat_pattern", "dotted_name", "elif_clause", "ellipsis", "else_clause",
        "escape_interpolation", "escape_sequence", "except_clause", "expression_list",
        "expression_statement", "false", "finally_clause", "float", "for_in_clause", "for_statement",
        "format_specifier", "function_definition", "future_import_statement",
        "generator_expression", "generic_type", "global_statement", "identifier", "if_clause",
        "if_statement", "import_from_statement", "import_prefix", "import_statement", "integer",
        "interpolation", "keyword_argument", "keyword_separator", "lambda", "lambda_parameters",
        "line_continuation", "list", "list_comprehension", "list_splat", "list_splat_pattern",
        "module", "named_expression", "none", "nonlocal_statement", "not_operator", "pair",
        "parameters", "parenthesized_expression", "pass_statement", "pattern_list", "raise_statement",
        "relative_import", "return_statement", "set", "set_comprehension", "slice", "string",
        "string_content", "string_end", "string_start", "subscript", "true", "try_statement",
        "tuple", "tuple_pattern", "type", "type_parameter", "typed_default_parameter", "typed_parameter",
        "unary_operator", "union_type", "while_statement", "with_clause", "with_item", "with_statement",
        "yield"
    ]
    type_menu = ", ".join(allowed_types)

    module_rule = "Exactly one 'module' node can appear at the root. Use 'block' if nested.\n"
    if top_level:
        scope_rule = "\nAt the root, use 'module'. Do not nest multiple 'module'.\n"
    else:
        scope_rule = "\nInside blocks, do not produce 'module'. Use 'block' or suitable type.\n"

    # Assemble the prompt piece by piece; the wording itself is unchanged.
    prompt = "".join([
        "Below is a list of tokens (index -> token_string) for a code snippet:\n",
        f"{token_listing}\n\n",
        "Create a JSON-based AST with these fields:\n",
        f"- 'type': must be in {{{type_menu}}}\n",
        "- 'start_token', 'end_token'\n",
        "- 'children' (array)\n\n",
        "Leaf nodes => start_token == end_token.\n",
        "No overlapping sibling token ranges.\n",
        "Return valid JSON only.\n",
        scope_rule,
        module_rule,
    ])

    try:
        reply = get_llm_answers(
            prompt,
            model_name="gpt-4o",
            require_json=True,
            temperature=0
        )
        return json.loads(reply)
    except Exception as err:
        # Any failure (call or parse) degrades to a placeholder node.
        print(f"[Error] llm_build_ast_from_tokens: {err}")
        return {
            "type": "ErrorNode",
            "start_token": -1,
            "end_token": -1,
            "children": []
        }

def generate_llm_ast(code: str) -> Dict[str, Any]:
    """
    Build the AST for a whole file in a single LLM request.

    The code is tokenized with ``tokenize_code_with_lines``; the line-number
    element of every token is dropped before the tokens are passed to
    ``llm_build_ast_from_tokens`` as a top-level request.
    """
    llm_tokens = [token[:3] for token in tokenize_code_with_lines(code)]
    return llm_build_ast_from_tokens(llm_tokens, top_level=True)

###############################################################################
#     3) 单文件处理: 生成LLM AST, 存JSON                                       #
###############################################################################
def process_single_file(file_path: str):
    """
    Generate the LLM AST for one source file and save it as JSON.

    Steps: read the file, build the AST with ``generate_llm_ast``, then write
    it to ``llm_ast_whole/<basename>.json`` (directory created on demand).
    A file that cannot be read is reported on stdout and skipped.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as src:
            source_text = src.read()
    except Exception as err:
        print(f"[Error reading {file_path}]: {err}")
        return

    tree = generate_llm_ast(source_text)

    out_dir = "llm_ast_whole"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, os.path.basename(file_path) + ".json")
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(tree, out_file, indent=4, ensure_ascii=False)
    print(f"[LLM AST saved] => {out_path}")

###############################################################################
#                            4) main() 并行处理                               #
###############################################################################
def main():
    """
    Process up to 200 ``.py`` files from the source directory in a thread pool.

    Each file is handed to ``process_single_file``. A tqdm progress bar is
    advanced as tasks finish, and any exception raised inside a worker is
    printed together with the file it belongs to instead of being discarded.
    """
    source_dir = "../dataset/python"  # change to your actual source directory
    if not os.path.isdir(source_dir):
        print(f"[Error] Directory {source_dir} does not exist.")
        return

    # Collect the .py files (capped at the first 200 directory entries that match).
    files = [f for f in os.listdir(source_dir) if f.endswith(".py")][:200]
    print(f"Found {len(files)} Python files in {source_dir}.")

    with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # Remember which path each future belongs to so failures can be attributed.
        future_to_path = {}
        for fname in files:
            full_path = os.path.join(source_dir, fname)
            future_to_path[executor.submit(process_single_file, full_path)] = full_path

        with tqdm(total=len(files), desc="Processing files") as pbar:
            for future in concurrent.futures.as_completed(future_to_path):
                # Future.exception() returns the worker's exception (or None)
                # without re-raising it, so one bad file cannot abort the batch.
                exc = future.exception()
                if exc is not None:
                    print(f"[Error processing {future_to_path[future]}]: {exc}")
                pbar.update(1)

if __name__ == "__main__":
    main()

In [16]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import json
import re
from typing import Any, Dict, List, Tuple



###############################################################################
#                   1) 全局分词，含行号 -> global_tokens                       #
###############################################################################
def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """
    使用正则分词，直接在原始代码上通过 finditer() 获取匹配位置，
    并保留换行符，让 LLM 能识别多行结构。
    返回形式: [(start_offset, end_offset, token_text, line_number), ...]
    """
    token_pattern = (
        r'[A-Za-z_]\w*|'  # 标识符
        r'[0-9]+|'        # 数字
        r'"[^"]*"|'       # 双引号字符串
        r"'[^']*'|"       # 单引号字符串
        r'\\[ntr]|'       # 转义符 \n \t \r
        r'//.*|'          # 单行注释 (如 C++/Java/JS 风格)
        r'/\*.*?\*/|'     # 多行注释 (如 C 风格)
        r'\n|\r|\t|'      # 换行/回车/制表符
        r'\S'             # 其他符号(如 +, -, {, }, 以及任何其它非空白字符)
    )

    tokens_with_offset = []
    lines = code.splitlines(keepends=True)
    current_line = 1
    current_pos = 0
    
    for match in re.finditer(token_pattern, code, re.MULTILINE | re.DOTALL):
        tk = match.group(0)
        start_offset, end_offset = match.span()
        
        # 计算当前token所在行号
        while current_line <= len(lines) and current_pos + len(lines[current_line-1]) <= start_offset:
            current_pos += len(lines[current_line-1])
            current_line += 1
            
        tokens_with_offset.append((start_offset, end_offset, tk, current_line))

    return tokens_with_offset

###############################################################################
# 1) 根据全局 tokens 填充 label
###############################################################################
def fill_ast_labels(ast_node: dict, code: str, global_tokens: List[Tuple[int,int,str,int]]) -> None:
    """
    Attach the covered source text to every node of an AST, in place.

    A node's ``start_token`` / ``end_token`` are treated as indices into
    ``global_tokens``. The start offset of the first token and the end offset
    of the last token delimit the slice of ``code`` stored in
    ``ast_node["label"]``. Missing, non-integer, reversed or out-of-range
    indices give an empty label instead of raising.

    Args:
        ast_node: Node to annotate; anything that is not a dict is ignored.
        code: Full source text the tokens were produced from.
        global_tokens: ``(start_offset, end_offset, text, line)`` tuples.
    """
    # Ignore anything that is not a node dict.
    if not isinstance(ast_node, dict):
        return

    st = ast_node.get("start_token", -1)
    et = ast_node.get("end_token", -1)

    snippet = ""
    # LLM-produced JSON may carry null or string indices; comparing those with
    # ints would raise TypeError, so require real integers first.
    if (
        isinstance(st, int)
        and isinstance(et, int)
        and 0 <= st <= et < len(global_tokens)
    ):
        start_offset = global_tokens[st][0]
        end_offset = global_tokens[et][1]
        if 0 <= start_offset < end_offset <= len(code):
            snippet = code[start_offset:end_offset]

    ast_node["label"] = snippet

    # "children" may be absent or null for leaf nodes.
    children = ast_node.get("children") or []
    if not isinstance(children, list):
        print(f"Warning: Invalid children container {type(children)} in {ast_node.get('type')}")
        return

    for child in children:
        # Recurse only into dict children; report anything else.
        if isinstance(child, dict):
            fill_ast_labels(child, code, global_tokens)
        else:
            print(f"Warning: Invalid child node type {type(child)} in {ast_node.get('type')}")


def safe_flatten_function_placeholders(node: dict) -> dict:
    """
    Return a rebuilt copy of ``node`` with placeholder wrappers collapsed.

    Every dict node is rebuilt as a new dict (non-"children" keys first, then a
    fresh "children" list), so the result does not share node dicts with the
    input. Non-dict children are dropped with a warning; a non-dict ``node``
    yields an empty dict.

    A "function_placeholder" / "class_placeholder" node whose only child is a
    "module" node that itself has exactly one child is replaced by that
    grandchild. Before being returned, the grandchild's name / line / token /
    label fields are overwritten with the placeholder's values where the
    placeholder has them.
    """
    if not isinstance(node, dict):
        return {}

    node_type = node.get("type", "")

    # Rebuild the subtree first; only dict children survive.
    rebuilt_children = []
    for child in node.get("children", []):
        if not isinstance(child, dict):
            print(f"Warning: Skipping invalid child type {type(ch)} in {node_type}" if False else
                  f"Warning: Skipping invalid child type {type(child)} in {node_type}")
            continue
        rebuilt_children.append(safe_flatten_function_placeholders(child))

    # Shallow copy of all other fields, with "children" appended last.
    rebuilt = {}
    for key, val in node.items():
        if key != "children":
            rebuilt[key] = val
    rebuilt["children"] = rebuilt_children

    # Guard clauses: anything that is not the exact
    # placeholder -> module -> single node shape is returned as rebuilt.
    if node_type not in ("function_placeholder", "class_placeholder"):
        return rebuilt
    if len(rebuilt_children) != 1:
        return rebuilt
    wrapper = rebuilt_children[0]
    if wrapper.get("type") != "module":
        return rebuilt
    inner = wrapper.get("children", [])
    if len(inner) != 1:
        return rebuilt

    # Promote the real definition and let the placeholder's fields win.
    promoted = inner[0]
    for field in ("name", "start_line", "end_line", "start_token", "end_token", "label"):
        if field in rebuilt:
            promoted[field] = rebuilt[field]
    return promoted


###############################################################################
# 3) 单文件处理 => 根据同名py文件+json => 生成 global_tokens => 填label => 扁平化
###############################################################################
def process_ast_json(
    input_json_path: str,
    output_json_path: str,
    py_source_dir: str
):
    """
    Fill labels into one AST JSON file and write the flattened result.

    Expected naming:
      input_json_path = "llm_ast/chunk_block/1.py.json"
      -> matching source file = "<py_source_dir>/1.py"

    The JSON root is expected to look like
    {
      "type": "module",
      "start_token": 0,
      "end_token": 307,
      ...
      "children": [...]
    }
    Only "type", "start_token"/"end_token" and "children" are used here.

    Steps:
      1) Locate the same-named .py file and read its source.
      2) tokenize_code_with_lines(code) => global_tokens
      3) fill_ast_labels(ast_root, code, global_tokens)
      4) safe_flatten_function_placeholders(ast_root)
      5) json.dump() the result to output_json_path.

    Args:
        input_json_path: Path of the AST JSON to read.
        output_json_path: Path of the JSON file to write. Its parent directory
            is created when the path has one.
        py_source_dir: Directory that holds the original .py sources.

    Returns:
        None. If the source file is missing, a warning is printed and labels
        end up empty. If the JSON cannot be read or parsed, an error is printed
        and nothing is written.
    """
    base = os.path.basename(input_json_path)  # e.g. "1.py.json"
    # "<something>.py.json" -> "<something>.py". Any other naming scheme is
    # passed through unchanged and must be adapted by the caller.
    if base.endswith(".py.json"):
        py_file_name = base[:-len(".json")]
    else:
        py_file_name = base

    py_full_path = os.path.join(py_source_dir, py_file_name)

    if not os.path.isfile(py_full_path):
        print(f"[Warning] No corresponding .py found for {input_json_path}, skip label fill.")
        code = ""
        global_tokens = []
    else:
        # Read the source and tokenize it once for the whole file.
        with open(py_full_path, "r", encoding="utf-8") as fpy:
            code = fpy.read()
        global_tokens = tokenize_code_with_lines(code)

    # Load the AST. I/O errors, malformed JSON / bad encoding (ValueError) and
    # overly deep nesting (RecursionError) skip this file instead of aborting
    # a whole batch run.
    try:
        with open(input_json_path, "r", encoding="utf-8") as fin:
            ast_data = json.load(fin)
    except (OSError, ValueError, RecursionError) as e:
        print(f"[Error reading {input_json_path}]: {e}")
        return

    fill_ast_labels(ast_data, code, global_tokens)
    flattened_ast = safe_flatten_function_placeholders(ast_data)

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one to create.
    out_dir = os.path.dirname(output_json_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_json_path, "w", encoding="utf-8") as fout:
        json.dump(flattened_ast, fout, indent=4, ensure_ascii=False)

    print(f"[Processed] => {output_json_path}")


###############################################################################
# 4) main: 遍历 input_json_dir => 对应 .py => output
###############################################################################
def main():
    """
    Post-process every ``*.json`` AST found in the input directory.

    Each file is handed to process_ast_json() together with an output path of
    the same file name inside the output directory. The run is aborted when
    the input directory is missing; a missing source directory only produces a
    warning.
    """
    py_source_dir = "../dataset/python"              # directory with the matching .py files
    output_json_dir = "llm_ast_whole_processed"   # where processed JSON is written
    input_json_dir = "llm_ast_whole"              # directory with the AST JSON files

    if not os.path.isdir(input_json_dir):
        print(f"[Error] input dir {input_json_dir} not found.")
        return

    if not os.path.isdir(py_source_dir):
        print(f"[Warning] python source dir {py_source_dir} not found. Label fill will be empty.")

    os.makedirs(output_json_dir, exist_ok=True)

    # Lazily filter the directory listing down to JSON files.
    json_names = (entry for entry in os.listdir(input_json_dir) if entry.endswith(".json"))
    for fname in json_names:
        process_ast_json(
            os.path.join(input_json_dir, fname),
            os.path.join(output_json_dir, fname),
            py_source_dir,
        )

if __name__ == "__main__":
    main()


[Processed] => llm_ast_whole_processed/48.py.json
[Processed] => llm_ast_whole_processed/174.py.json
[Processed] => llm_ast_whole_processed/195.py.json
[Processed] => llm_ast_whole_processed/84.py.json
[Processed] => llm_ast_whole_processed/131.py.json
[Processed] => llm_ast_whole_processed/45.py.json
[Processed] => llm_ast_whole_processed/165.py.json
[Processed] => llm_ast_whole_processed/137.py.json
[Processed] => llm_ast_whole_processed/178.py.json
[Processed] => llm_ast_whole_processed/189.py.json
[Processed] => llm_ast_whole_processed/1.py.json
[Processed] => llm_ast_whole_processed/99.py.json
[Processed] => llm_ast_whole_processed/21.py.json
[Processed] => llm_ast_whole_processed/41.py.json
[Processed] => llm_ast_whole_processed/22.py.json
[Processed] => llm_ast_whole_processed/40.py.json
[Processed] => llm_ast_whole_processed/11.py.json
[Processed] => llm_ast_whole_processed/114.py.json
[Processed] => llm_ast_whole_processed/208.py.json
[Processed] => llm_ast_whole_processed/9.p

In [17]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import glob
import json

###############################################################################
# 覆盖率比较逻辑 (与你已有的保持一致)
###############################################################################
def collect_node_ranges(ast_node: dict, only_leaves: bool=False) -> set:
    """
    Gather the (start_token, end_token) pairs found in an AST subtree.

    A node contributes its pair when both values are >= 0 (missing values
    default to -1). With ``only_leaves=True`` only nodes without children
    contribute. Children are always visited.
    """
    first = ast_node.get("start_token", -1)
    last = ast_node.get("end_token", -1)
    kids = ast_node.get("children", [])

    ranges = set()
    eligible = (not only_leaves) or len(kids) == 0
    if eligible and first >= 0 and last >= 0:
        ranges.add((first, last))

    for kid in kids:
        ranges |= collect_node_ranges(kid, only_leaves)
    return ranges

def compare_ast_just_tokenRanges(ast_static: dict, ast_llm: dict, only_leaves: bool=False):
    """
    Compare two ASTs purely by their (start_token, end_token) pairs.

    Parent/child relations are ignored: a node counts as matched when the same
    token range occurs in both trees. ``only_leaves=True`` restricts the
    comparison to leaf nodes.

    Returns a dict with the number of distinct ranges on each side, the number
    of shared ranges, and the shared fraction relative to each side (0.0 when
    that side has no ranges).
    """
    static_ranges = collect_node_ranges(ast_static, only_leaves)
    llm_ranges = collect_node_ranges(ast_llm, only_leaves)

    n_static = len(static_ranges)
    n_llm = len(llm_ranges)
    n_shared = len(static_ranges & llm_ranges)

    return {
        "total_static": n_static,
        "total_llm": n_llm,
        "matched": n_shared,
        "coverage_static": n_shared / n_static if n_static else 0.0,
        "coverage_llm": n_shared / n_llm if n_llm else 0.0,
    }

def collect_node_ancestors(ast_node: dict, ancestors: list, only_leaves: bool=False) -> dict:
    """
    Map each node's (start_token, end_token) to the ranges of its ancestors.

    Returns ``{(start, end): {"ancestors": set_of_ranges, "is_leaf": bool}}``.
    Only nodes whose two token values are >= 0 get an entry, and only such
    nodes are added to the ancestor chain passed to their children. With
    ``only_leaves=True`` only childless nodes get an entry. When the same range
    occurs more than once, the entry visited last replaces the earlier one.
    The ``ancestors`` list passed in is not modified.
    """
    first = ast_node.get("start_token", -1)
    last = ast_node.get("end_token", -1)
    kids = ast_node.get("children", [])
    leaf = len(kids) == 0
    has_range = first >= 0 and last >= 0

    found = {}
    if has_range and (leaf or not only_leaves):
        found[(first, last)] = {"ancestors": set(ancestors), "is_leaf": leaf}

    # Build a fresh chain for the children so siblings do not see each other.
    chain = [*ancestors, (first, last)] if has_range else [*ancestors]

    for kid in kids:
        found.update(collect_node_ancestors(kid, chain, only_leaves))

    return found

def build_static_parent_map(ast_node: dict, parent: dict, store: dict, only_leaves: bool=False):
    """
    Record, for each node range, the token range of its direct parent.

    Fills ``store`` in place with ``(start, end) -> (parent_start, parent_end)``
    for every node whose two token values are >= 0 (leaf nodes only when
    ``only_leaves=True``). The parent range is (-1, -1) when ``parent`` is
    falsy, and individual missing parent values also default to -1.
    Returns None.
    """
    first = ast_node.get("start_token",-1)
    last = ast_node.get("end_token",-1)
    kids = ast_node.get("children",[])
    leaf = len(kids)==0

    parent_range = (
        (parent.get("start_token",-1), parent.get("end_token",-1))
        if parent
        else (-1, -1)
    )

    recordable = (not only_leaves) or leaf
    if recordable and first>=0 and last>=0:
        store[(first,last)] = parent_range

    # The current node becomes the parent for the next level.
    for kid in kids:
        build_static_parent_map(kid, ast_node, store, only_leaves)

def compare_ast_flexible_parent(ast_static: dict, ast_llm: dict, only_leaves: bool=False):
    """
    Compare two ASTs by token range plus a loosely matched parent.

    A static node is matched when the LLM tree has a node with the same
    (start_token, end_token) and either
      * the static node's parent range is (-1, -1), or
      * some ancestor of the LLM node has start and end tokens that each lie
        within 3 tokens of the static parent's start and end.

    Returns a dict with the number of ranges on each side, the matched count
    and the matched fraction relative to each side (0 when that side is empty).
    """
    static_parents = {}
    build_static_parent_map(ast_static, None, static_parents, only_leaves)
    llm_info = collect_node_ancestors(ast_llm, [], only_leaves)

    n_static = len(static_parents)
    n_llm = len(llm_info)

    hits = 0
    for node_range, (pst, pet) in static_parents.items():
        entry = llm_info.get(node_range)
        if entry is None:
            continue
        if (pst, pet) == (-1, -1):
            hits += 1
            continue
        # Tolerate small shifts of the parent range (e.g. token offsets
        # introduced by comments); one close-enough ancestor is sufficient.
        if any(
            abs(anc_st - pst) <= 3 and abs(anc_et - pet) <= 3
            for anc_st, anc_et in entry["ancestors"]
        ):
            hits += 1

    return {
        "total_static": n_static,
        "total_llm": n_llm,
        "matched": hits,
        "coverage_static": (hits / n_static) if n_static>0 else 0,
        "coverage_llm": (hits / n_llm) if n_llm>0 else 0,
    }


###############################################################################
# 2) main: 行数分类 + 覆盖率统计
###############################################################################
def _pick_category(line_count, categories):
    """Return the name of the first (name, low, high) bucket containing line_count, else None."""
    for name, low, high in categories:
        if low <= line_count <= high:
            return name
    return None


def _percent(matched, total):
    """Return matched/total as a percentage, or 0 when total is not positive."""
    return (matched / total) * 100 if total > 0 else 0


def main():
    """
    Print AST coverage statistics grouped by source-file line count.

    For every ``*.py`` file in the source directory that has both an LLM AST
    and a static AST JSON (named ``<file>.py.json``), four comparisons are run:
    token-range only and flexible-parent, each over all nodes and over leaves
    only. Totals are summed per line-count category and printed as
    percentages. Files that cannot be read or parsed are reported and skipped;
    files without both AST JSONs are skipped silently.
    """
    source_dir = "../dataset/python"              # Python source files
    llm_json_dir = "./llm_ast_whole_processed"    # LLM-generated ASTs
    static_json_dir = "../dataset/python_ast"     # static ASTs

    if not os.path.isdir(source_dir):
        print(f"[Error] Source dir not found: {source_dir}")
        return

    # Line-count buckets: (name, inclusive low, inclusive high).
    categories = [
        ("0-50", 0, 50),
        ("51-200", 51, 200),
        ("201-9999999", 201, 9999999),
    ]

    # One row per comparison mode:
    # (stats key, report label, use flexible-parent comparison, leaves only).
    # The comparison function is resolved per file from the flag below.
    metrics = [
        ("np_all", "NoParent-All", False, False),
        ("np_leaf", "NoParent-Leaf", False, True),
        ("fp_all", "FlexParent-All", True, False),
        ("fp_leaf", "FlexParent-Leaf", True, True),
    ]

    # Per category: number of compared files plus summed totals per metric.
    stats_map = {
        cat_name: {
            "file_count": 0,
            "totals": {
                key: {"static": 0, "llm": 0, "matched": 0}
                for key, _label, _flexible, _leaves in metrics
            },
        }
        for cat_name, _low, _high in categories
    }

    for fname in os.listdir(source_dir):
        if not fname.endswith(".py"):
            continue
        py_path = os.path.join(source_dir, fname)

        # Count lines without keeping the whole file in memory.
        try:
            with open(py_path, "r", encoding="utf-8") as f:
                line_count = sum(1 for _ in f)
        except (OSError, UnicodeDecodeError) as e:
            print(f"[Error reading {py_path}]: {e}")
            continue

        cat_name = _pick_category(line_count, categories)
        if cat_name is None:
            continue  # outside every bucket

        # "1.py" => "1.py.json" in both AST directories.
        json_name = fname + ".json"
        llm_file = os.path.join(llm_json_dir, json_name)
        static_file = os.path.join(static_json_dir, json_name)
        if not (os.path.exists(llm_file) and os.path.exists(static_file)):
            continue

        # ValueError covers malformed JSON and undecodable bytes;
        # RecursionError covers extremely deep nesting.
        try:
            with open(llm_file, "r", encoding="utf-8") as f1:
                ast_llm = json.load(f1)
            with open(static_file, "r", encoding="utf-8") as f2:
                ast_static = json.load(f2)
        except (OSError, ValueError, RecursionError) as e:
            print(f"[Error reading AST for {fname}]: {e}")
            continue

        # Run all four comparisons first, then update the accumulators.
        results = {}
        for key, _label, flexible, leaves in metrics:
            compare = compare_ast_flexible_parent if flexible else compare_ast_just_tokenRanges
            results[key] = compare(ast_static, ast_llm, only_leaves=leaves)

        sdat = stats_map[cat_name]
        sdat["file_count"] += 1
        for key, res in results.items():
            totals = sdat["totals"][key]
            totals["static"] += res["total_static"]
            totals["llm"] += res["total_llm"]
            totals["matched"] += res["matched"]

    # Report per category, in bucket order.
    print("\n=== 按行数分类的覆盖率统计 ===")
    for cat_name, low, high in categories:
        sdat = stats_map[cat_name]
        fcount = sdat["file_count"]
        if fcount == 0:
            print(f"\nCategory: {cat_name} ({low}~{high}) => 无文件。")
            continue

        print(f"\nCategory: {cat_name} ({low}~{high} 行), file_count={fcount}")

        for key, label, _flexible, _leaves in metrics:
            totals = sdat["totals"][key]
            n_static = totals["static"]
            n_llm = totals["llm"]
            n_matched = totals["matched"]
            print(
                "[%s] static=%d, llm=%d, matched=%d => Cov(st)=%.2f%%, Cov(llm)=%.2f%%"
                % (
                    label,
                    n_static,
                    n_llm,
                    n_matched,
                    _percent(n_matched, n_static),
                    _percent(n_matched, n_llm),
                )
            )


if __name__=="__main__":
    main()



=== 按行数分类的覆盖率统计 ===

Category: 0-50 (0~50 行), file_count=53
[NoParent-All] static=5769, llm=3946, matched=2808 => Cov(st)=48.67%, Cov(llm)=71.16%
[NoParent-Leaf] static=3077, llm=2141, matched=1492 => Cov(st)=48.49%, Cov(llm)=69.69%
[FlexParent-All] static=5769, llm=3946, matched=2357 => Cov(st)=40.86%, Cov(llm)=59.73%
[FlexParent-Leaf] static=3077, llm=2141, matched=1344 => Cov(st)=43.68%, Cov(llm)=62.77%

Category: 51-200 (51~200 行), file_count=109
[NoParent-All] static=39860, llm=9368, matched=5545 => Cov(st)=13.91%, Cov(llm)=59.19%
[NoParent-Leaf] static=21757, llm=5185, matched=2338 => Cov(st)=10.75%, Cov(llm)=45.09%
[FlexParent-All] static=39860, llm=9368, matched=4003 => Cov(st)=10.04%, Cov(llm)=42.73%
[FlexParent-Leaf] static=21757, llm=5185, matched=1904 => Cov(st)=8.75%, Cov(llm)=36.72%

Category: 201-9999999 (201~9999999 行), file_count=38
[NoParent-All] static=50143, llm=3542, matched=1991 => Cov(st)=3.97%, Cov(llm)=56.21%
[NoParent-Leaf] static=27422, llm=2300, matched=620