In [2]:
"""
Multi-threaded script for Python code analysis:

1) LLM-based AST generation (via CFG + partial block approach):
   - parse_cfg_structure() => get line ranges of classes/functions
   - build_ast_from_cfg() => recursively exclude child function/class lines from the parent block,
     parse only the remaining lines, then insert function/class placeholders back in the correct position.
   - Each node has global-level start_token/end_token, thanks to a single global tokenize_code_with_lines().

2) Tree-sitter-based static AST:
   - generate_tree_sitter_ast() => returns {type, label, children}.

3) Compare snippet-level labels (optional).

4) Save both ASTs as JSON.

5) Multi-file parallel processing with ThreadPoolExecutor.

See the "llm_build_ast_from_tokens" function's prompt – we keep it intact as requested.
"""

import os
import sys
import json
from typing import Any, Dict, List, Tuple
import re
import concurrent.futures
from multiprocessing import cpu_count

from tqdm import tqdm

###############################################################################
#                           LLM interface (stub)                              #
###############################################################################
"""
Replace 'get_llm_answers' with your actual LLM API call or function.
Here we just provide a stub or minimal placeholder.
"""
# Prefer the project's real LLM client; when llm.py is not importable, fall
# back to a stub so the rest of this module can still be imported and run.
try:
    from llm import get_llm_answers
except ImportError:
    # If no llm.py found, define a placeholder
    def get_llm_answers(prompt, model_name="", require_json=False, temperature=0):
        """Placeholder LLM call: ignores every argument and returns the string "{}"."""
        # Return a minimal JSON for demonstration
        # In reality, you'd implement the actual LLM call (OpenAI, local model, etc.)
        return "{}"


###############################################################################
#                     Tree-sitter initialization (Python)                     #
###############################################################################
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# NOTE(review): the original comment assumed tree_sitter_python.language()
# returns a .so / .dylib path; whatever it returns is passed straight to
# Language() here - confirm against the installed tree-sitter version.
# `parser` is a module-level object that PyTreeSitterStaticHandler stores on
# each instance it creates.
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)

###############################################################################
#                   1) 全局分词，含行号 -> global_tokens                       #
###############################################################################
def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """
    Split `code` into regex tokens and tag each one with its line number.

    Matching is done with finditer() directly on the original text, so the
    offsets index into `code`. Newline, carriage-return and tab characters are
    emitted as tokens of their own; other whitespace is skipped.

    Returns a list of (start_offset, end_offset, token_text, line_number)
    tuples, where line_number is 1-based and refers to the line on which the
    token starts.

    NOTE(review): the pattern is compiled with re.DOTALL, so the `//.*`
    alternative runs to the end of the input. Any `//` in the code (for
    example Python floor division) therefore swallows the rest of the text as
    a single token. Left unchanged here because fixing it shifts token
    indices, which every consumer of those indices must agree on.
    """
    alternatives = [
        r'[A-Za-z_]\w*',  # identifiers
        r'[0-9]+',        # digit runs
        r'"[^"]*"',       # double-quoted strings
        r"'[^']*'",       # single-quoted strings
        r'\\[ntr]',       # the escape sequences \n \t \r written literally
        r'//.*',          # single-line comments (C++/Java/JS style)
        r'/\*.*?\*/',     # multi-line comments (C style)
        r'\n|\r|\t',      # newline / carriage return / tab
        r'\S',            # any other single non-whitespace character
    ]
    token_regex = re.compile("|".join(alternatives), re.MULTILINE | re.DOTALL)

    # Cumulative end offset of every line (line terminators included).
    line_end_offsets = []
    running_total = 0
    for raw_line in code.splitlines(keepends=True):
        running_total += len(raw_line)
        line_end_offsets.append(running_total)
    line_total = len(line_end_offsets)

    tagged_tokens = []
    line_idx = 0  # zero-based index of the line holding the current token
    for found in token_regex.finditer(code):
        begin, end = found.span()
        # Tokens arrive in increasing offset order, so the line cursor only
        # ever moves forward.
        while line_idx < line_total and line_end_offsets[line_idx] <= begin:
            line_idx += 1
        tagged_tokens.append((begin, end, found.group(0), line_idx + 1))

    return tagged_tokens

###############################################################################
#                      2) 获取 CFG (class/function) 行范围                    #
###############################################################################
def get_structure_prompt(code_text: str) -> str:
    """
    Build the LLM prompt that asks for the line ranges of classes/functions.

    The code is split into lines, each line is paired with its 1-based line
    number, and that list is embedded as JSON below a description of the
    expected "CFG" JSON structure.
    """
    source_lines = code_text.splitlines()
    total_lines = len(source_lines)

    numbered_lines = []
    for line_no, text in enumerate(source_lines, start=1):
        numbered_lines.append({"line": line_no, "code": text})
    numbered_json = json.dumps(numbered_lines, indent=2)

    return f"""
You are given Python code lines (with line numbers). Identify all top-level and nested classes/functions,
and return a JSON with this structure:

{{
  "name": "example_script",
  "type": "CFG",
  "start_line": 1,
  "end_line": {total_lines},
  "functions": [
    {{
      "name": "function_name",
      "type": "function",
      "start_line": 10,
      "end_line": 20,
      "functions": [],
      "classes": []
    }}
  ],
  "classes": [
    {{
      "name": "class_name",
      "type": "class",
      "start_line": 30,
      "end_line": 40,
      "functions": [...],
      "classes": [...]
    }}
  ]
}}

Do not omit or rename fields.
Here is the code lines:
{numbered_json}

Return valid JSON only.
"""

def parse_cfg_structure(code_text: str) -> Dict[str,Any]:
    """
    Ask the LLM for the CFG JSON (class/function line ranges) of `code_text`.

    Returns a dict that always carries "type", "name", "start_line",
    "end_line", "functions" and "classes" at the root:
      - if the LLM call or the JSON decoding fails, or the decoded value is
        not a JSON object, a fallback covering the whole file with no
        functions/classes is returned;
      - if the object is missing root fields (for example the stub's "{}"),
        they are filled from that same fallback, so the caller does not
        silently default to analysing line 1 only;
      - a root "functions"/"classes" value that is not a list is replaced
        by an empty list.
    """
    # Count lines the same way get_structure_prompt() numbers them.
    line_count = max(len(code_text.splitlines()), 1)
    fallback = {
        "type": "CFG",
        "name": "fallback",
        "start_line": 1,
        "end_line": line_count,
        "functions": [],
        "classes": []
    }

    prompt = get_structure_prompt(code_text)
    try:
        raw = get_llm_answers(prompt, model_name="gpt-4o", require_json=True, temperature=0)
        cfg = json.loads(raw)
    except Exception as e:
        # Best-effort boundary: any LLM/transport/decoding failure degrades
        # to the whole-file fallback instead of aborting the file.
        print(f"[Error] parse_cfg_structure: {e}")
        return fallback

    if not isinstance(cfg, dict):
        print(f"[Error] parse_cfg_structure: expected a JSON object, got {type(cfg).__name__}")
        return fallback

    for key, default in fallback.items():
        cfg.setdefault(key, default)
    for key in ("functions", "classes"):
        if not isinstance(cfg[key], list):
            cfg[key] = []
    return cfg

###############################################################################
#        3) 在 block 内排除子函数/类行, 用 LLM 局部解析 => remap 索引          #
###############################################################################
def filter_tokens_for_block(
    global_tokens: List[Tuple[int,int,str,int]],
    block_start_line: int,
    block_end_line: int,
    excluded_line_ranges: List[Tuple[int,int]]
) -> Tuple[List[Tuple[int,int,str,int]], List[int]]:
    """
    Select the tokens that belong to a block but not to its excluded ranges.

    A token is kept when its line number lies in
    [block_start_line..block_end_line] (inclusive) and in none of the
    inclusive (start, end) pairs of `excluded_line_ranges`.

    Returns (local_tokens, mapping) where mapping[i] is the index in
    `global_tokens` of local_tokens[i].
    """
    local_tokens = []
    local_to_global = []

    for position, (begin, end, text, line_no) in enumerate(global_tokens):
        inside_block = block_start_line <= line_no <= block_end_line
        if not inside_block:
            continue
        if any(low <= line_no <= high for (low, high) in excluded_line_ranges):
            continue
        local_tokens.append((begin, end, text, line_no))
        local_to_global.append(position)

    return local_tokens, local_to_global

def remap_ast_local_to_global(ast_node: Dict[str,Any], mapping: List[int]) -> None:
    """
    Recursively rewrite local token indices into global ones, in place.

    For every node, "start_token" and "end_token" are replaced by
    mapping[local_idx]. The AST comes from LLM output, so indices are not
    trusted: anything that is not an int inside [0, len(mapping)) - missing,
    null, a string, a float, a bool, out of range - becomes -1 instead of
    raising.

    Nodes that are not dicts are ignored, and a "children" value that is not
    a list is left untouched and not descended into.
    """
    if not isinstance(ast_node, dict):
        return

    for key in ("start_token", "end_token"):
        local_idx = ast_node.get(key, -1)
        # bool is a subclass of int; a JSON true/false is not a token index.
        usable = isinstance(local_idx, int) and not isinstance(local_idx, bool)
        if usable and 0 <= local_idx < len(mapping):
            ast_node[key] = mapping[local_idx]
        else:
            ast_node[key] = -1

    children = ast_node.get("children")
    if isinstance(children, list):
        for child in children:
            remap_ast_local_to_global(child, mapping)


###############################################################################
#   4) 保持原样: llm_build_ast_from_tokens() (prompt 不变, 勿改)              #
###############################################################################
def llm_build_ast_from_tokens(tokens_with_offset: List[Tuple[int, int, str]], top_level=True) -> Dict[str, Any]:
    """
    Given a token list, call the LLM and return the JSON AST it produces.

    Args:
        tokens_with_offset: token tuples; only element [2] (the token text)
            is read here. Each token is shown to the LLM as
            "<list index>: <text>", so the indices in the returned AST are
            positions in this list.
        top_level: whether this is the outermost block. Only the outermost
            block is told to use 'module' at the root; nested blocks are told
            to use 'block' or another suitable type.

    Returns:
        Whatever json.loads() yields for the LLM output (not validated here).
        If the LLM call or the JSON decoding raises, the error is printed and
        a node of type "ErrorNode" with start_token/end_token of -1 and no
        children is returned instead.
    """
    indexed_tokens = [(i, t[2]) for i, t in enumerate(tokens_with_offset)]
    token_info = "\n".join(f"{i}: {text}" for (i, text) in indexed_tokens)

    # Build the prompt
    allowed_types = [
        "aliased_import", "argument_list", "as_pattern", "as_pattern_target", "assert_statement",
        "assignment", "attribute", "augmented_assignment", "await", "binary_operator", "block",
        "boolean_operator", "break_statement", "call", "class_definition", "comment",
        "comparison_operator", "concatenated_string", "conditional_expression",
        "continue_statement", "decorated_definition", "decorator", "default_parameter",
        "delete_statement", "dictionary", "dictionary_comprehension", "dictionary_splat",
        "dictionary_splat_pattern", "dotted_name", "elif_clause", "ellipsis", "else_clause",
        "escape_interpolation", "escape_sequence", "except_clause", "expression_list",
        "expression_statement", "false", "finally_clause", "float", "for_in_clause", "for_statement",
        "format_specifier", "function_definition", "future_import_statement",
        "generator_expression", "generic_type", "global_statement", "identifier", "if_clause",
        "if_statement", "import_from_statement", "import_prefix", "import_statement", "integer",
        "interpolation", "keyword_argument", "keyword_separator", "lambda", "lambda_parameters",
        "line_continuation", "list", "list_comprehension", "list_splat", "list_splat_pattern",
        "module", "named_expression", "none", "nonlocal_statement", "not_operator", "pair",
        "parameters", "parenthesized_expression", "pass_statement", "pattern_list", "raise_statement",
        "relative_import", "return_statement", "set", "set_comprehension", "slice", "string",
        "string_content", "string_end", "string_start", "subscript", "true", "try_statement",
        "tuple", "tuple_pattern", "type", "type_parameter", "typed_default_parameter", "typed_parameter",
        "unary_operator", "union_type", "while_statement", "with_clause", "with_item", "with_statement",
        "yield"
    ]
    allowed_types_str = ", ".join(allowed_types)

    # NOTE(review): this sentence is appended in BOTH branches below, so the
    # nested-block prompt also says "Exactly one 'module' node can appear at
    # the root". The prompt is deliberately left untouched (see the section
    # header above this function).
    top_level_instruction = "Exactly one 'module' node can appear at the root. Use 'block' if nested.\n"

    prompt = (
        "Below is a list of tokens (index -> token_string) for a code snippet:\n"
        f"{token_info}\n\n"
        "Create a JSON-based AST with these fields:\n"
        f"- 'type': must be in {{{allowed_types_str}}}\n"
        "- 'start_token', 'end_token'\n"
        "- 'children' (array)\n\n"
        "Leaf nodes => start_token == end_token.\n"
        "No overlapping sibling token ranges.\n"
        "Return valid JSON only.\n"
    )
    if top_level:
        prompt += "\nAt the root, use 'module'. Do not nest multiple 'module'.\n" + top_level_instruction
    else:
        prompt += "\nInside blocks, do not produce 'module'. Use 'block' or suitable type.\n" + top_level_instruction

    try:
        llm_output = get_llm_answers(
            prompt,
            model_name="gpt-4o",
            require_json=True,
            temperature=0
        )
        ast_dict = json.loads(llm_output)
        return ast_dict

    except Exception as e:
        # Any failure (LLM call or JSON decoding) degrades to an error node
        # so the caller can keep building the surrounding tree.
        print(f"[Error] llm_build_ast_from_tokens: {e}")
        return {
            "type": "ErrorNode",
            "start_token": -1,
            "end_token": -1,
            "children": []
        }

def llm_parse_block_ast(local_tokens: List[Tuple[int,int,str,int]], top_level=True) -> Dict[str,Any]:
    """
    Parse a block's local tokens with llm_build_ast_from_tokens.

    The line number is dropped from every token before the call; the indices
    in the resulting AST are positions in `local_tokens` (0..N-1).
    """
    without_line_numbers = []
    for begin, end, text, _line_no in local_tokens:
        without_line_numbers.append((begin, end, text))
    return llm_build_ast_from_tokens(without_line_numbers, top_level=top_level)


###############################################################################
#    5) 递归构建AST: 处理本块普通语句 + 子函数/类 => 插占位 => merge & sort      #
###############################################################################
def find_first_token_index(global_tokens: List[Tuple[int,int,str,int]], line_start: int) -> int:
    """
    Return the index of the first global token whose line number is
    >= line_start, or -1 when there is no such token.
    """
    candidates = (
        position
        for position, (_begin, _end, _text, line_no) in enumerate(global_tokens)
        if line_no >= line_start
    )
    return next(candidates, -1)

def find_last_token_index(global_tokens: List[Tuple[int,int,str,int]], line_end: int) -> int:
    """
    Return the index of the last token in the leading run of tokens whose
    line number is <= line_end.

    Scanning stops at the first token beyond line_end; -1 is returned when
    the very first token is already beyond it or the list is empty.
    """
    leading_count = 0
    for (_begin, _end, _text, line_no) in global_tokens:
        if line_no > line_end:
            break
        leading_count += 1
    return leading_count - 1

def build_ast_from_cfg(
    cfg_node: Dict[str,Any],
    global_tokens: List[Tuple[int,int,str,int]],
    top_level=True
) -> Dict[str,Any]:
    """
    Recursively turn a CFG node into an AST whose token indices are global.

    Steps:
      1) take the tokens in [start_line..end_line], leaving out the lines of
         the node's direct functions/classes
      2) let the LLM parse what remains into a local AST
      3) remap that AST's indices from local to global
      4) build every child function/class recursively and wrap each result
         in a placeholder node (functions first, then classes)
      5) merge placeholders with the block's own children, ordered by
         start_token

    The root node's type is forced to "module" when top_level is true.
    """
    first_line = cfg_node.get("start_line", 1)
    last_line = cfg_node.get("end_line", 1)

    # (cfg key, placeholder type), in the order children are processed.
    child_kinds = (
        ("functions", "function_placeholder"),
        ("classes", "class_placeholder"),
    )

    # Line ranges owned by nested definitions; parsed in their own calls.
    nested_ranges = [
        (child["start_line"], child["end_line"])
        for cfg_key, _placeholder_type in child_kinds
        for child in cfg_node.get(cfg_key, [])
    ]

    own_tokens, local_to_global = filter_tokens_for_block(
        global_tokens, first_line, last_line, nested_ranges
    )
    block_ast = llm_parse_block_ast(own_tokens, top_level=top_level)
    remap_ast_local_to_global(block_ast, local_to_global)
    block_ast.setdefault("children", [])

    placeholder_nodes = []
    for cfg_key, placeholder_type in child_kinds:
        for child_cfg in cfg_node.get(cfg_key, []):
            child_ast = build_ast_from_cfg(child_cfg, global_tokens, top_level=False)
            placeholder_nodes.append({
                "type": placeholder_type,
                "name": child_cfg["name"],
                "start_line": child_cfg["start_line"],
                "end_line": child_cfg["end_line"],
                "start_token": find_first_token_index(global_tokens, child_cfg["start_line"]),
                "end_token": find_last_token_index(global_tokens, child_cfg["end_line"]),
                "children": [child_ast]
            })

    # Stable sort: nodes with equal start_token keep their merge order.
    block_ast["children"] = sorted(
        block_ast["children"] + placeholder_nodes,
        key=lambda child: child.get("start_token", -1),
    )

    if top_level:
        block_ast["type"] = "module"

    return block_ast

def generate_llm_ast_via_cfg(code: str) -> Dict[str,Any]:
    """
    Public entry point for the LLM-based AST.

    Tokenizes the whole file once, asks the LLM for the class/function line
    structure, then builds the AST from that structure so that every node
    carries global token indices.
    """
    tokens = tokenize_code_with_lines(code)
    structure = parse_cfg_structure(code)
    return build_ast_from_cfg(structure, tokens, top_level=True)

###############################################################################
#               6) Tree-sitter 静态AST (简化: 不含 token idx)                #
###############################################################################
class PyTreeSitterStaticHandler:
    """Converts tree-sitter parse trees into plain {type, label, children} dicts."""

    def __init__(self):
        # Reuse the module-level parser.
        self.parser = parser

    def generate_static_ast(self, code: str) -> Dict[str, Any]:
        """Parse `code` (encoded as UTF-8) and convert the tree's root node."""
        source_bytes = code.encode("utf-8")
        return self.ts_node_to_dict(self.parser.parse(source_bytes).root_node)

    def ts_node_to_dict(self, node) -> Dict[str, Any]:
        """
        Convert `node` and its named descendants into nested dicts.

        Returns None for a node whose is_named is false; such nodes are
        dropped from their parent's "children". "label" is the node's text
        decoded as UTF-8 (empty string when the node has no text).
        """
        if not node.is_named:
            return None

        converted = (
            self.ts_node_to_dict(node.child(idx))
            for idx in range(node.child_count)
        )
        return {
            "type": node.type,
            "label": (node.text or b"").decode("utf-8"),
            "children": [entry for entry in converted if entry],
        }

def generate_tree_sitter_ast(code: str) -> Dict[str, Any]:
    """Build the tree-sitter AST dict for `code` with a fresh PyTreeSitterStaticHandler."""
    return PyTreeSitterStaticHandler().generate_static_ast(code)

###############################################################################
#                    7) 可选对比：compare_ast_nodes()                         #
###############################################################################
def compare_ast_nodes(node1: Dict[str, Any], node2: Dict[str, Any], path: str = ""):
    """
    Print the differences between two AST dicts (example comparison).

    Compares the stripped "label" of both nodes and the number of children,
    then recurses pairwise into the children both nodes have in common.
    Nothing is returned; an empty or missing node on either side ends the
    comparison for that subtree.
    """
    if not (node1 and node2):
        return

    left_label = (node1.get("label", "") or "").strip()
    right_label = (node2.get("label", "") or "").strip()
    if left_label != right_label:
        print(f"[Diff] label mismatch at {path}")
        print(f"  1: {repr(left_label)}")
        print(f"  2: {repr(right_label)}")

    left_children = node1.get("children", [])
    right_children = node2.get("children", [])
    left_count, right_count = len(left_children), len(right_children)
    if left_count != right_count:
        print(f"[Diff] child count mismatch at {path}: {left_count} vs {right_count}")

    # zip() stops at the shorter list, i.e. at min(left_count, right_count).
    for position, (left, right) in enumerate(zip(left_children, right_children)):
        compare_ast_nodes(left, right, path + f".children[{position}]")

###############################################################################
#     8) 单文件处理: 生成LLM AST, Tree-sitter AST, 存JSON,可选对比             #
###############################################################################
def process_llm_ast(code: str, file_path: str) -> Dict[str,Any]:
    """
    Return the LLM AST for `code`, using a JSON file as an on-disk cache.

    The cache lives at llm_ast/chunk_block/<basename of file_path>.json
    (relative to the current working directory). If that file exists and
    holds valid JSON, its content is returned without calling the LLM.

    A cache file that cannot be decoded (for example one left half-written
    by an interrupted run) is reported and regenerated instead of raising on
    every later run. New results are written to a temporary file and then
    moved into place with os.replace(), so the cache path never holds a
    partially written document.

    NOTE(review): whatever generate_llm_ast_via_cfg() returns is cached,
    including trees built from failed LLM calls - confirm whether those
    should be persisted.
    """
    llm_dir = "llm_ast/chunk_block"
    os.makedirs(llm_dir, exist_ok=True)
    llm_path = os.path.join(llm_dir, os.path.basename(file_path) + ".json")

    if os.path.exists(llm_path):
        try:
            with open(llm_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"[Warning] corrupt LLM AST cache {llm_path}: {e}; regenerating")

    llm_ast = generate_llm_ast_via_cfg(code)

    tmp_path = llm_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as fout:
        json.dump(llm_ast, fout, indent=4, ensure_ascii=False)
    os.replace(tmp_path, llm_path)
    return llm_ast

def process_static_ast(code: str, file_path: str) -> Dict[str,Any]:
    """
    Build the tree-sitter AST for `code`, save it as JSON and return it.

    The file is written to static_ast/<basename of file_path>.json (relative
    to the current working directory) and its path is printed.
    """
    out_dir = "static_ast"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, os.path.basename(file_path) + ".json")

    static_tree = generate_tree_sitter_ast(code)
    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(static_tree, handle, indent=4, ensure_ascii=False)

    print(f"[TS AST] => {out_path}")
    return static_tree

def process_single_file(file_path: str):
    """
    Process one source file: read it as UTF-8 and generate its LLM AST.

    A read failure is printed and the file is skipped. The tree-sitter AST
    and the comparison step are optional and currently not run.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as source:
            source_text = source.read()
    except Exception as read_error:
        print(f"[Error reading {file_path}]: {read_error}")
        return None

    process_llm_ast(source_text, file_path)
    # Optional follow-ups (disabled):
    #   ts_ast = process_static_ast(source_text, file_path)
    #   compare_ast_nodes(llm_ast, ts_ast)

###############################################################################
#                            9) main() 并行处理                               #
###############################################################################
def main():
    """
    Process up to 200 .py files from the source directory in parallel.

    Each file is handed to process_single_file() on a thread pool with
    cpu_count() workers while a tqdm bar tracks completion. An exception
    raised inside a worker is printed together with the file name instead of
    being silently dropped with its future, and the progress bar is closed
    even if the loop is interrupted.
    """
    source_dir = "../../dataset/python"  # change to your actual source directory
    if not os.path.isdir(source_dir):
        print(f"[Error] Directory {source_dir} does not exist.")
        return

    # Collect the .py files (first 200 in os.listdir() order)
    files = [f for f in os.listdir(source_dir) if f.endswith(".py")][:200]
    print(f"Found {len(files)} Python files in {source_dir}.")

    with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count()) as executor, \
            tqdm(total=len(files), desc="Processing files") as pbar:
        future_to_name = {
            executor.submit(process_single_file, os.path.join(source_dir, fname)): fname
            for fname in files
        }
        for future in concurrent.futures.as_completed(future_to_name):
            pbar.update(1)
            error = future.exception()
            if error is not None:
                print(f"[Error processing {future_to_name[future]}]: {error}")

if __name__ == "__main__":
    main()

Found 200 Python files in ../../dataset/python.


Processing files:   4%|▍         | 8/200 [00:00<00:02, 64.04it/s]

[LLM AST cached] => llm_ast/chunk_block/195.py.json
[LLM AST cached] => llm_ast/chunk_block/180.py.json
[LLM AST cached] => llm_ast/chunk_block/208.py.json
[LLM AST cached] => llm_ast/chunk_block/11.py.json
[LLM AST cached] => llm_ast/chunk_block/202.py.json
[LLM AST cached] => llm_ast/chunk_block/13.py.json
[LLM AST cached] => llm_ast/chunk_block/98.py.json
[LLM AST cached] => llm_ast/chunk_block/14.py.json
[LLM AST cached] => llm_ast/chunk_block/71.py.json


Processing files:   8%|▊         | 17/200 [00:00<00:02, 75.35it/s]

[LLM AST cached] => llm_ast/chunk_block/59.py.json
[LLM AST cached] => llm_ast/chunk_block/176.py.json
[LLM AST cached] => llm_ast/chunk_block/9.py.json
[LLM AST cached] => llm_ast/chunk_block/62.py.json
[LLM AST cached] => llm_ast/chunk_block/6.py.json
[LLM AST cached] => llm_ast/chunk_block/167.py.json
[LLM AST cached] => llm_ast/chunk_block/107.py.json
[LLM AST cached] => llm_ast/chunk_block/139.py.json


Processing files:  22%|██▏       | 43/200 [00:00<00:01, 152.68it/s]

[LLM AST cached] => llm_ast/chunk_block/54.py.json
[LLM AST cached] => llm_ast/chunk_block/184.py.json
[LLM AST cached] => llm_ast/chunk_block/129.py.json
[LLM AST cached] => llm_ast/chunk_block/174.py.json
[LLM AST cached] => llm_ast/chunk_block/201.py.json
[LLM AST cached] => llm_ast/chunk_block/138.py.json
[LLM AST cached] => llm_ast/chunk_block/160.py.json
[LLM AST cached] => llm_ast/chunk_block/55.py.json
[LLM AST cached] => llm_ast/chunk_block/159.py.json
[LLM AST cached] => llm_ast/chunk_block/148.py.json
[LLM AST cached] => llm_ast/chunk_block/163.py.json
[LLM AST cached] => llm_ast/chunk_block/75.py.json
[LLM AST cached] => llm_ast/chunk_block/100.py.json
[LLM AST cached] => llm_ast/chunk_block/29.py.json
[LLM AST cached] => llm_ast/chunk_block/45.py.json
[LLM AST cached] => llm_ast/chunk_block/25.py.json
[LLM AST cached] => llm_ast/chunk_block/116.py.json
[LLM AST cached] => llm_ast/chunk_block/99.py.json
[LLM AST cached] => llm_ast/chunk_block/89.py.json
[LLM AST cached] => 

Processing files:  28%|██▊       | 57/200 [00:00<00:00, 152.68it/s]

[LLM AST cached] => llm_ast/chunk_block/102.py.json
[LLM AST cached] => llm_ast/chunk_block/82.py.json
[LLM AST cached] => llm_ast/chunk_block/183.py.json
[LLM AST cached] => llm_ast/chunk_block/93.py.json
[LLM AST cached] => llm_ast/chunk_block/121.py.json
[LLM AST cached] => llm_ast/chunk_block/112.py.json
[LLM AST cached] => llm_ast/chunk_block/210.py.json
[LLM AST cached] => llm_ast/chunk_block/110.py.json
[LLM AST cached] => llm_ast/chunk_block/91.py.json
[LLM AST cached] => llm_ast/chunk_block/70.py.json
[LLM AST cached] => llm_ast/chunk_block/74.py.json
[LLM AST cached] => llm_ast/chunk_block/34.py.json
[LLM AST cached] => llm_ast/chunk_block/125.py.json
[LLM AST cached] => llm_ast/chunk_block/72.py.json


Processing files:  34%|███▍      | 69/200 [00:00<00:00, 152.68it/s]

[LLM AST cached] => llm_ast/chunk_block/103.py.json
[LLM AST cached] => llm_ast/chunk_block/44.py.json
[LLM AST cached] => llm_ast/chunk_block/84.py.json
[LLM AST cached] => llm_ast/chunk_block/58.py.json
[LLM AST cached] => llm_ast/chunk_block/37.py.json
[LLM AST cached] => llm_ast/chunk_block/171.py.json
[LLM AST cached] => llm_ast/chunk_block/83.py.json
[LLM AST cached] => llm_ast/chunk_block/136.py.json
[LLM AST cached] => llm_ast/chunk_block/196.py.json
[LLM AST cached] => llm_ast/chunk_block/30.py.json
[LLM AST cached] => llm_ast/chunk_block/78.py.json
[LLM AST cached] => llm_ast/chunk_block/113.py.json
[LLM AST cached] => llm_ast/chunk_block/172.py.json
[LLM AST cached] => llm_ast/chunk_block/40.py.json
[LLM AST cached] => llm_ast/chunk_block/164.py.json
[LLM AST cached] => llm_ast/chunk_block/47.py.json
[LLM AST cached] => llm_ast/chunk_block/43.py.json
[LLM AST cached] => llm_ast/chunk_block/158.py.json
[LLM AST cached] => llm_ast/chunk_block/126.py.json
[LLM AST cached] => ll

Processing files:  44%|████▍     | 88/200 [00:00<00:00, 152.68it/s]

[LLM AST cached] => llm_ast/chunk_block/194.py.json
[LLM AST cached] => llm_ast/chunk_block/140.py.json
[LLM AST cached] => llm_ast/chunk_block/18.py.json
[LLM AST cached] => llm_ast/chunk_block/119.py.json
[LLM AST cached] => llm_ast/chunk_block/156.py.json
[LLM AST cached] => llm_ast/chunk_block/33.py.json
[LLM AST cached] => llm_ast/chunk_block/50.py.json
[LLM AST cached] => llm_ast/chunk_block/36.py.json
[LLM AST cached] => llm_ast/chunk_block/41.py.json
[LLM AST cached] => llm_ast/chunk_block/94.py.json
[LLM AST cached] => llm_ast/chunk_block/68.py.json
[LLM AST cached] => llm_ast/chunk_block/87.py.json
[LLM AST cached] => llm_ast/chunk_block/133.py.json
[LLM AST cached] => llm_ast/chunk_block/128.py.json
[LLM AST cached] => llm_ast/chunk_block/109.py.json
[LLM AST cached] => llm_ast/chunk_block/142.py.json
[LLM AST cached] => llm_ast/chunk_block/189.py.json


Processing files:  64%|██████▍   | 128/200 [00:00<00:01, 61.78it/s]

[LLM AST cached] => llm_ast/chunk_block/96.py.json
[LLM AST cached] => llm_ast/chunk_block/170.py.json
[LLM AST cached] => llm_ast/chunk_block/203.py.json
[LLM AST cached] => llm_ast/chunk_block/145.py.json
[LLM AST cached] => llm_ast/chunk_block/79.py.json
[LLM AST cached] => llm_ast/chunk_block/42.py.json
[LLM AST cached] => llm_ast/chunk_block/181.py.json
[LLM AST cached] => llm_ast/chunk_block/209.py.json
[LLM AST cached] => llm_ast/chunk_block/26.py.json
[LLM AST cached] => llm_ast/chunk_block/115.py.json
[LLM AST cached] => llm_ast/chunk_block/165.py.json
[LLM AST cached] => llm_ast/chunk_block/199.py.json
[LLM AST cached] => llm_ast/chunk_block/51.py.json
[LLM AST cached] => llm_ast/chunk_block/81.py.json
[LLM AST cached] => llm_ast/chunk_block/67.py.json
[LLM AST cached] => llm_ast/chunk_block/2.py.json
[LLM AST cached] => llm_ast/chunk_block/1.py.json
[LLM AST cached] => llm_ast/chunk_block/63.py.json
[LLM AST cached] => llm_ast/chunk_block/193.py.json
[LLM AST cached] => llm_

Processing files:  77%|███████▋  | 154/200 [00:00<00:00, 71.81it/s]

[LLM AST cached] => llm_ast/chunk_block/69.py.json
[LLM AST cached] => llm_ast/chunk_block/152.py.json
[LLM AST cached] => llm_ast/chunk_block/92.py.json
[LLM AST cached] => llm_ast/chunk_block/66.py.json
[LLM AST cached] => llm_ast/chunk_block/149.py.json
[LLM AST cached] => llm_ast/chunk_block/64.py.json
[LLM AST cached] => llm_ast/chunk_block/188.py.json
[LLM AST cached] => llm_ast/chunk_block/175.py.json
[LLM AST cached] => llm_ast/chunk_block/190.py.json
[LLM AST cached] => llm_ast/chunk_block/80.py.json
[LLM AST cached] => llm_ast/chunk_block/15.py.json
[LLM AST cached] => llm_ast/chunk_block/20.py.json
[LLM AST cached] => llm_ast/chunk_block/101.py.json
[LLM AST cached] => llm_ast/chunk_block/182.py.json
[LLM AST cached] => llm_ast/chunk_block/56.py.json


Processing files:  88%|████████▊ | 175/200 [00:01<00:00, 44.75it/s]

[LLM AST cached] => llm_ast/chunk_block/5.py.json
[LLM AST cached] => llm_ast/chunk_block/106.py.json
[LLM AST cached] => llm_ast/chunk_block/192.py.json
[LLM AST cached] => llm_ast/chunk_block/150.py.json
[LLM AST cached] => llm_ast/chunk_block/88.py.json
[LLM AST cached] => llm_ast/chunk_block/153.py.json
[LLM AST cached] => llm_ast/chunk_block/178.py.json
[LLM AST cached] => llm_ast/chunk_block/197.py.json
[LLM AST cached] => llm_ast/chunk_block/31.py.json
[LLM AST cached] => llm_ast/chunk_block/16.py.json
[LLM AST cached] => llm_ast/chunk_block/205.py.json
[LLM AST cached] => llm_ast/chunk_block/207.py.json
[LLM AST cached] => llm_ast/chunk_block/127.py.json
[LLM AST cached] => llm_ast/chunk_block/95.py.json
[LLM AST cached] => llm_ast/chunk_block/49.py.json
[LLM AST cached] => llm_ast/chunk_block/17.py.json
[LLM AST cached] => llm_ast/chunk_block/161.py.json
[LLM AST cached] => llm_ast/chunk_block/65.py.json
[LLM AST cached] => llm_ast/chunk_block/157.py.json
[LLM AST cached] => l

Processing files:  92%|█████████▎| 185/200 [00:01<00:00, 44.75it/s]

[LLM AST cached] => llm_ast/chunk_block/130.py.json
[LLM AST cached] => llm_ast/chunk_block/191.py.json
[LLM AST cached] => llm_ast/chunk_block/135.py.json
[LLM AST cached] => llm_ast/chunk_block/137.py.json
[LLM AST cached] => llm_ast/chunk_block/162.py.json
[LLM AST cached] => llm_ast/chunk_block/124.py.json
[LLM AST cached] => llm_ast/chunk_block/179.py.json
[LLM AST cached] => llm_ast/chunk_block/173.py.json
[LLM AST cached] => llm_ast/chunk_block/61.py.json


Processing files:  93%|█████████▎| 186/200 [05:19<02:04,  8.92s/it]

[LLM AST] => llm_ast/chunk_block/60.py.json
[Error] llm_build_ast_from_tokens: <!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>xiaoai.plus | 524: A timeout occurred</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block 

## 后处理

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import json
import re
from typing import Any, Dict, List, Tuple



###############################################################################
#                   1) 全局分词，含行号 -> global_tokens                       #
###############################################################################
def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """
    Split `code` into regex tokens and tag each one with its line number.

    Matching is done with finditer() directly on the original text, so the
    offsets index into `code`. Newline, carriage-return and tab characters are
    emitted as tokens of their own; other whitespace is skipped.

    Returns a list of (start_offset, end_offset, token_text, line_number)
    tuples, where line_number is 1-based and refers to the line on which the
    token starts.

    NOTE(review): the pattern is compiled with re.DOTALL, so the `//.*`
    alternative runs to the end of the input. Any `//` in the code (for
    example Python floor division) therefore swallows the rest of the text as
    a single token. Left unchanged here because fixing it shifts token
    indices, which every consumer of those indices must agree on.
    """
    alternatives = [
        r'[A-Za-z_]\w*',  # identifiers
        r'[0-9]+',        # digit runs
        r'"[^"]*"',       # double-quoted strings
        r"'[^']*'",       # single-quoted strings
        r'\\[ntr]',       # the escape sequences \n \t \r written literally
        r'//.*',          # single-line comments (C++/Java/JS style)
        r'/\*.*?\*/',     # multi-line comments (C style)
        r'\n|\r|\t',      # newline / carriage return / tab
        r'\S',            # any other single non-whitespace character
    ]
    token_regex = re.compile("|".join(alternatives), re.MULTILINE | re.DOTALL)

    # Cumulative end offset of every line (line terminators included).
    line_end_offsets = []
    running_total = 0
    for raw_line in code.splitlines(keepends=True):
        running_total += len(raw_line)
        line_end_offsets.append(running_total)
    line_total = len(line_end_offsets)

    tagged_tokens = []
    line_idx = 0  # zero-based index of the line holding the current token
    for found in token_regex.finditer(code):
        begin, end = found.span()
        # Tokens arrive in increasing offset order, so the line cursor only
        # ever moves forward.
        while line_idx < line_total and line_end_offsets[line_idx] <= begin:
            line_idx += 1
        tagged_tokens.append((begin, end, found.group(0), line_idx + 1))

    return tagged_tokens

###############################################################################
# 1) 根据全局 tokens 填充 label
###############################################################################
def fill_ast_labels(ast_node: dict, code: str, global_tokens: List[Tuple[int,int,str,int]]) -> None:
    """
    Recursively attach the covered source text to every AST node, in place.

    A node's ``start_token`` / ``end_token`` are treated as inclusive indices
    into ``global_tokens``. The character offsets of those two tokens select
    a slice of ``code`` which is stored in ``ast_node["label"]``.

    The label is the empty string when the indices are missing, are not
    integers (e.g. JSON ``null``), are out of range, or map to an empty or
    out-of-bounds character span.

    Args:
        ast_node: Node dict to annotate; mutated in place, as are its
            descendants under ``"children"``.
        code: Source text that ``global_tokens`` was produced from.
        global_tokens: ``(start_offset, end_offset, text, line)`` tuples.
    """
    st = ast_node.get("start_token", -1)
    et = ast_node.get("end_token", -1)

    snippet = ""
    # Non-int indices cannot be compared or used as list indices, so they are
    # treated the same way as out-of-range ones instead of raising.
    if (
        isinstance(st, int)
        and isinstance(et, int)
        and 0 <= st <= et < len(global_tokens)
    ):
        start_offset = global_tokens[st][0]
        end_offset = global_tokens[et][1]
        if 0 <= start_offset < end_offset <= len(code):
            snippet = code[start_offset:end_offset]

    ast_node["label"] = snippet

    # "children" may be absent or null; entries that are not dicts (e.g. null)
    # carry no token range and are skipped.
    for child in ast_node.get("children") or []:
        if isinstance(child, dict):
            fill_ast_labels(child, code, global_tokens)


###############################################################################
# 2) Flatten: collapse function_placeholder -> module -> (single child)
###############################################################################
def safe_flatten_function_placeholders(node: dict) -> dict:
    """
    Return a flattened copy of ``node`` with placeholder wrappers collapsed.

    Every node is rebuilt as a new dict (all keys except ``"children"`` are
    copied, then a freshly built ``"children"`` list is attached), so the
    input tree is not modified.

    A node whose type is ``function_placeholder`` or ``class_placeholder`` is
    replaced by its grandchild when its only child is a ``module`` node that
    itself has exactly one child. In that case the placeholder's ``name``,
    ``start_line``, ``end_line``, ``start_token``, ``end_token`` and ``label``
    (whichever are present) overwrite the same fields on the grandchild.

    Args:
        node: AST node dict. A falsy value (``None`` or ``{}``) yields ``{}``.
            A missing or null ``"children"`` entry is treated as no children.

    Returns:
        The rebuilt node, or the promoted grandchild for a collapsible
        placeholder.
    """
    if not node:
        return {}

    # Recurse first so that nested placeholders are collapsed bottom-up.
    # ``or []`` keeps a JSON null "children" from breaking the iteration.
    flattened_children = [
        safe_flatten_function_placeholders(ch)
        for ch in (node.get("children") or [])
    ]

    # Copy every field except "children", then attach the rebuilt list.
    new_node = {key: val for key, val in node.items() if key != "children"}
    new_node["children"] = flattened_children

    if node.get("type", "") not in ("function_placeholder", "class_placeholder"):
        return new_node

    # Only the exact shape placeholder -> module -> single child collapses.
    if len(flattened_children) != 1 or flattened_children[0].get("type") != "module":
        return new_node
    mod_kids = flattened_children[0].get("children", [])
    if len(mod_kids) != 1:
        return new_node

    real_node = mod_kids[0]
    # Carry the placeholder's identifying fields over to the promoted node.
    for field in ("name", "start_line", "end_line", "start_token", "end_token", "label"):
        if field in new_node:
            real_node[field] = new_node[field]
    return real_node


###############################################################################
# 3) Per-file processing: matching .py + JSON => global_tokens => fill labels => flatten
###############################################################################
def process_ast_json(
    input_json_path: str,
    output_json_path: str,
    py_source_dir: str
):
    """
    Post-process one AST JSON file: fill labels, flatten placeholders, save.

    The matching Python source is located by file name:
      input_json_path = "llm_ast/chunk_block/1.py.json"
      -> source file  = "<py_source_dir>/1.py"
    A file name that does not end in ".py.json" is looked up in
    ``py_source_dir`` unchanged; adapt this if your naming scheme differs.

    Expected JSON shape (only "type", "start_token"/"end_token" and
    "children" are interpreted; other keys are copied through):
    {
      "type": "module",
      "start_token": 0,
      "end_token": 307,
      ...
      "children": [...]
    }

    Steps:
      1) read the matching .py file
      2) tokenize_code_with_lines(code) => global_tokens
      3) fill_ast_labels(ast_root, code, global_tokens)
      4) safe_flatten_function_placeholders(ast_root)
      5) json.dump() the result to ``output_json_path``

    Args:
        input_json_path: Path of the AST JSON file to read.
        output_json_path: Path to write the processed JSON to. Its parent
            directory is created when needed.
        py_source_dir: Directory holding the original .py sources.

    Returns:
        None. If the source file is missing, a warning is printed and every
        label ends up empty. If the JSON cannot be read, an error is printed
        and nothing is written.
    """
    base = os.path.basename(input_json_path)  # e.g. "1.py.json"
    if base.endswith(".py.json"):
        py_file_name = base[:-len(".json")]  # "1.py.json" -> "1.py"
    else:
        # Fallback: use the file name as is.
        py_file_name = base

    py_full_path = os.path.join(py_source_dir, py_file_name)

    if not os.path.isfile(py_full_path):
        print(f"[Warning] No corresponding .py found for {input_json_path}, skip label fill.")
        code = ""
        global_tokens = []
    else:
        # Read the source and tokenize it once for the whole file.
        with open(py_full_path, "r", encoding="utf-8") as fpy:
            code = fpy.read()
        global_tokens = tokenize_code_with_lines(code)

    # Load the AST; an unreadable file is reported and skipped rather than
    # raised, so the caller can carry on.
    try:
        with open(input_json_path, "r", encoding="utf-8") as fin:
            ast_data = json.load(fin)
    except Exception as e:
        print(f"[Error reading {input_json_path}]: {e}")
        return

    # Fill labels in place, then build the flattened copy.
    fill_ast_labels(ast_data, code, global_tokens)
    flattened_ast = safe_flatten_function_placeholders(ast_data)

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(output_json_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_json_path, "w", encoding="utf-8") as fout:
        json.dump(flattened_ast, fout, indent=4, ensure_ascii=False)

    print(f"[Processed] => {output_json_path}")


###############################################################################
# 4) main: walk input_json_dir => matching .py => output
###############################################################################
def main():
    """
    Post-process every ``*.json`` file found directly in the input directory.

    Each file is handed to ``process_ast_json`` together with an output path
    of the same name inside the output directory. Nothing is done when the
    input directory does not exist; a missing source directory only produces
    a warning.
    """
    input_json_dir = "llm_ast/chunk_block"              # directory with the AST JSON files
    output_json_dir = "llm_ast/chunk_block_processed"   # where processed files are written
    py_source_dir = "../../dataset/python"              # directory with the matching .py files

    if not os.path.isdir(input_json_dir):
        print(f"[Error] input dir {input_json_dir} not found.")
        return
    if not os.path.isdir(py_source_dir):
        print(f"[Warning] python source dir {py_source_dir} not found. Label fill will be empty.")

    os.makedirs(output_json_dir, exist_ok=True)

    # Keep only the JSON entries; anything else in the directory is ignored.
    json_names = (entry for entry in os.listdir(input_json_dir) if entry.endswith(".json"))
    for entry in json_names:
        process_ast_json(
            os.path.join(input_json_dir, entry),
            os.path.join(output_json_dir, entry),
            py_source_dir,
        )

if __name__ == "__main__":
    main()


[Processed] => llm_ast/chunk_block_processed/48.py.json
[Processed] => llm_ast/chunk_block_processed/174.py.json
[Processed] => llm_ast/chunk_block_processed/195.py.json
[Processed] => llm_ast/chunk_block_processed/84.py.json
[Processed] => llm_ast/chunk_block_processed/131.py.json
[Processed] => llm_ast/chunk_block_processed/45.py.json
[Processed] => llm_ast/chunk_block_processed/165.py.json
[Processed] => llm_ast/chunk_block_processed/137.py.json
[Processed] => llm_ast/chunk_block_processed/178.py.json
[Processed] => llm_ast/chunk_block_processed/189.py.json
[Processed] => llm_ast/chunk_block_processed/1.py.json
[Processed] => llm_ast/chunk_block_processed/99.py.json
[Processed] => llm_ast/chunk_block_processed/41.py.json
[Processed] => llm_ast/chunk_block_processed/40.py.json
[Processed] => llm_ast/chunk_block_processed/11.py.json
[Processed] => llm_ast/chunk_block_processed/114.py.json
[Processed] => llm_ast/chunk_block_processed/208.py.json
[Processed] => llm_ast/chunk_block_proce