In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import glob
import json

###############################################################################
# 覆盖率比较逻辑 (与你已有的保持一致)
###############################################################################
def collect_node_ranges(ast_node: dict, only_leaves: bool=False) -> set:
    """Gather the distinct (start_token, end_token) pairs found in an AST.

    Args:
        ast_node: Root node, a dict that may carry "start_token",
            "end_token" and "children" keys.
        only_leaves: When True, only nodes without children contribute.

    Returns:
        A set of (start_token, end_token) tuples. Nodes whose start or end
        token is missing or negative are not recorded, but their children
        are still visited.
    """
    ranges = set()
    pending = [ast_node]

    # Explicit stack instead of recursion; visiting order is irrelevant
    # because the result is a set.
    while pending:
        node = pending.pop()
        first = node.get("start_token", -1)
        last = node.get("end_token", -1)
        kids = node.get("children", [])
        is_leaf = len(kids) == 0

        excluded = only_leaves and not is_leaf
        if not excluded and first >= 0 and last >= 0:
            ranges.add((first, last))

        pending.extend(kids)

    return ranges

def compare_ast_just_tokenRanges(ast_static: dict, ast_llm: dict, only_leaves: bool=False):
    """Compare two ASTs purely by their (start_token, end_token) ranges.

    Parent relationships are ignored: a range counts as matched as soon as
    it appears in both trees. Either every node or only the leaves
    (only_leaves=True) can be taken into account.

    Returns:
        A dict with the number of distinct ranges in each tree
        ("total_static", "total_llm"), the number of shared ranges
        ("matched"), and the matched fraction relative to each tree
        ("coverage_static", "coverage_llm"; 0.0 when that tree has no
        ranges).
    """
    static_ranges = collect_node_ranges(ast_static, only_leaves)
    llm_ranges = collect_node_ranges(ast_llm, only_leaves)

    n_static = len(static_ranges)
    n_llm = len(llm_ranges)
    n_common = len(static_ranges & llm_ranges)

    return {
        "total_static": n_static,
        "total_llm": n_llm,
        "matched": n_common,
        "coverage_static": n_common / n_static if n_static else 0.0,
        "coverage_llm": n_common / n_llm if n_llm else 0.0,
    }

def collect_node_ancestors(ast_node: dict, ancestors: list, only_leaves: bool=False) -> dict:
    """Map each node's token range to the ranges of its ancestors.

    Args:
        ast_node: Root of the (sub)tree to walk.
        ancestors: Token ranges already above ``ast_node``; it is read but
            never modified.
        only_leaves: When True, only childless nodes get an entry
            (inner nodes still contribute to the ancestor chain).

    Returns:
        A dict keyed by (start_token, end_token). Each value holds
        "ancestors" (a set of ranges of the enclosing nodes that have a
        valid range) and "is_leaf". If several nodes share a range, the one
        visited last in pre-order is the one kept.
    """
    found = {}

    def visit(node, lineage):
        first = node.get("start_token", -1)
        last = node.get("end_token", -1)
        kids = node.get("children", [])
        leaf = len(kids) == 0
        has_range = first >= 0 and last >= 0

        if has_range and (leaf or not only_leaves):
            found[(first, last)] = {
                "ancestors": set(lineage),
                "is_leaf": leaf,
            }

        # Nodes without a valid range are transparent: they add nothing
        # to the chain handed down to their children.
        if has_range:
            child_lineage = lineage + ((first, last),)
        else:
            child_lineage = lineage

        for kid in kids:
            visit(kid, child_lineage)

    visit(ast_node, tuple(ancestors))
    return found

def build_static_parent_map(ast_node: dict, parent: dict, store: dict, only_leaves: bool=False):
    """Fill ``store`` with node-range -> direct-parent-range entries.

    Args:
        ast_node: Root of the (sub)tree to walk.
        parent: The node directly above ``ast_node``, or None (any falsy
            value) when there is none.
        store: Dict updated in place; keys are (start_token, end_token) of
            a node, values are the (start_token, end_token) of its parent,
            with -1 standing in for a missing parent or missing token.
        only_leaves: When True, only childless nodes are recorded.

    Nodes are processed in pre-order, so when several nodes share a range
    the entry of the last one visited is the one left in ``store``.
    """
    pending = [(ast_node, parent)]

    while pending:
        node, above = pending.pop()
        first = node.get("start_token", -1)
        last = node.get("end_token", -1)
        kids = node.get("children", [])
        leaf = len(kids) == 0

        if above:
            parent_range = (above.get("start_token", -1), above.get("end_token", -1))
        else:
            parent_range = (-1, -1)

        skipped = only_leaves and not leaf
        if not skipped and first >= 0 and last >= 0:
            store[(first, last)] = parent_range

        # Push children reversed so they are popped left-to-right, which
        # keeps the pre-order overwrite behaviour for duplicate ranges.
        pending.extend(reversed([(kid, node) for kid in kids]))

def compare_ast_flexible_parent(ast_static: dict, ast_llm: dict, only_leaves: bool=False):
    """Compare two ASTs by token range with a relaxed parent check.

    A static node is counted as matched when the LLM tree contains a node
    with the same (start_token, end_token) and either the static node's
    recorded parent range is (-1, -1), or that parent range appears
    anywhere among the LLM node's ancestors (not necessarily as the direct
    parent).

    Returns:
        A dict with "total_static", "total_llm", "matched",
        "coverage_static" and "coverage_llm"; each coverage value is
        matched divided by the respective total, or 0 when that total is
        zero.
    """
    static_parents = {}
    build_static_parent_map(ast_static, None, static_parents, only_leaves)
    llm_nodes = collect_node_ancestors(ast_llm, [], only_leaves)

    no_parent = (-1, -1)

    def is_match(node_range, parent_range):
        entry = llm_nodes.get(node_range)
        if entry is None:
            return False
        return parent_range == no_parent or parent_range in entry["ancestors"]

    n_matched = sum(
        1
        for node_range, parent_range in static_parents.items()
        if is_match(node_range, parent_range)
    )
    n_static = len(static_parents)
    n_llm = len(llm_nodes)

    return {
        "total_static": n_static,
        "total_llm": n_llm,
        "matched": n_matched,
        "coverage_static": n_matched / n_static if n_static > 0 else 0,
        "coverage_llm": n_matched / n_llm if n_llm > 0 else 0,
    }


###############################################################################
# 2) main: 行数分类 + 覆盖率统计
###############################################################################
def main():
    """Report AST coverage statistics grouped by source-file line count.

    For every ``.ts`` file in the source directory, the file is assigned to
    a line-count bucket, its LLM-produced AST (``<file>.ts.json``) and its
    static AST (``<file>.json``) are loaded, and four comparisons are run:
    token ranges only / flexible parent, each over all nodes and over
    leaves only. Node totals and match counts are summed per bucket and the
    resulting coverage percentages are printed.

    Files that cannot be read, fall outside every bucket, or lack either
    AST file are skipped. Returns None; all results go to stdout.
    """
    source_ext = ".ts"
    # TypeScript source directory.
    source_dir = "../../dataset/ts"
    # Directory holding the LLM-produced ASTs.
    llm_json_dir = "./llm_ast/chunk_block_processed"
    # Directory holding the static (reference) ASTs.
    static_json_dir = "../../dataset/ts_ast"

    if not os.path.isdir(source_dir):
        print(f"[Error] Source dir not found: {source_dir}")
        return

    # Line-count buckets as (name, inclusive low, inclusive high).
    # The first bucket containing a file's line count wins.
    categories = [
        # ("all", 0, 9999999),
        ("0-100", 0, 100),
        ("101-200", 101, 200),
        ("201-300", 201, 300),
        ("301-9999999", 301, 9999999),
    ]

    # One row per reported statistic:
    # (stats key prefix, printed label, comparison function, only_leaves).
    metrics = [
        ("np_all", "NoParent-All", compare_ast_just_tokenRanges, False),
        ("np_leaf", "NoParent-Leaf", compare_ast_just_tokenRanges, True),
        ("fp_all", "FlexParent-All", compare_ast_flexible_parent, False),
        ("fp_leaf", "FlexParent-Leaf", compare_ast_flexible_parent, True),
    ]

    # Per bucket: a file counter plus static/llm/match sums for each metric.
    stats_map = {}
    for cat_name, _low, _high in categories:
        bucket = {"file_count": 0}
        for prefix, _label, _compare, _leaves in metrics:
            bucket[f"{prefix}_static"] = 0
            bucket[f"{prefix}_llm"] = 0
            bucket[f"{prefix}_match"] = 0
        stats_map[cat_name] = bucket

    for fname in os.listdir(source_dir):
        if not fname.endswith(source_ext):
            continue
        src_path = os.path.join(source_dir, fname)

        # Count the lines of the source file.
        try:
            with open(src_path, "r", encoding="utf-8") as f:
                line_count = sum(1 for _ in f)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole run.
            print(f"[Error reading {src_path}]: {e}")
            continue

        # Pick the bucket for this line count.
        cat_name = None
        for cname, low, high in categories:
            if low <= line_count <= high:
                cat_name = cname
                break
        if cat_name is None:
            continue  # not inside any bucket

        # LLM AST: "<name>.ts" -> "<name>.ts.json".
        # Static AST: "<name>.ts" -> "<name>.json".
        # Only the trailing extension of the file name is stripped; a
        # str.replace over the joined path would also remove ".ts"
        # occurring in directory names or in the middle of the file name.
        llm_file = os.path.join(llm_json_dir, fname + ".json")
        stem = fname[:-len(source_ext)]
        static_file = os.path.join(static_json_dir, stem + ".json")

        if not os.path.exists(llm_file) or not os.path.exists(static_file):
            # Missing AST on either side: silently skip this file.
            continue

        # Load both ASTs.
        try:
            with open(llm_file, "r", encoding="utf-8") as f1:
                ast_llm = json.load(f1)
            with open(static_file, "r", encoding="utf-8") as f2:
                ast_static = json.load(f2)
        except Exception as e:
            # Best-effort as above: report and move on to the next file.
            print(f"[Error reading AST for {fname}]: {e}")
            continue

        # Run all comparisons first, then fold them into the bucket.
        results = [
            (prefix, compare(ast_static, ast_llm, only_leaves=leaves))
            for prefix, _label, compare, leaves in metrics
        ]

        bucket = stats_map[cat_name]
        bucket["file_count"] += 1
        for prefix, res in results:
            bucket[f"{prefix}_static"] += res["total_static"]
            bucket[f"{prefix}_llm"] += res["total_llm"]
            bucket[f"{prefix}_match"] += res["matched"]

    # Print the per-bucket report.
    print("\n=== 按行数分类的覆盖率统计 ===")
    for cat_name, low, high in categories:
        bucket = stats_map[cat_name]
        fcount = bucket["file_count"]
        if fcount == 0:
            print(f"\nCategory: {cat_name} ({low}~{high}) => 无文件。")
            continue

        print(f"\nCategory: {cat_name} ({low}~{high} 行), file_count={fcount}")

        for prefix, label, _compare, _leaves in metrics:
            n_static = bucket[f"{prefix}_static"]
            n_llm = bucket[f"{prefix}_llm"]
            n_match = bucket[f"{prefix}_match"]
            # Coverage is computed from the summed counts, so it is
            # weighted by node count rather than averaged per file.
            cov_s = (n_match / n_static) * 100 if n_static > 0 else 0
            cov_l = (n_match / n_llm) * 100 if n_llm > 0 else 0
            print(
                "[%s] static=%d, llm=%d, matched=%d => Cov(st)=%.2f%%, Cov(llm)=%.2f%%"
                % (label, n_static, n_llm, n_match, cov_s, cov_l)
            )


# Run the analysis only when this file is executed as a script, not when it is imported.
if __name__=="__main__":
    main()



=== 按行数分类的覆盖率统计 ===

Category: 0-100 (0~100 行), file_count=96
[NoParent-All] static=13162, llm=14107, matched=10250 => Cov(st)=77.88%, Cov(llm)=72.66%
[NoParent-Leaf] static=7376, llm=8609, matched=6135 => Cov(st)=83.18%, Cov(llm)=71.26%
[FlexParent-All] static=13162, llm=14107, matched=7606 => Cov(st)=57.79%, Cov(llm)=53.92%
[FlexParent-Leaf] static=7376, llm=8609, matched=4424 => Cov(st)=59.98%, Cov(llm)=51.39%

Category: 101-200 (101~200 行), file_count=24
[NoParent-All] static=13642, llm=12611, matched=9220 => Cov(st)=67.59%, Cov(llm)=73.11%
[NoParent-Leaf] static=7755, llm=7674, matched=5439 => Cov(st)=70.14%, Cov(llm)=70.88%
[FlexParent-All] static=13642, llm=12611, matched=7092 => Cov(st)=51.99%, Cov(llm)=56.24%
[FlexParent-Leaf] static=7755, llm=7674, matched=4057 => Cov(st)=52.31%, Cov(llm)=52.87%

Category: 201-300 (201~300 行), file_count=1
[NoParent-All] static=1689, llm=1234, matched=918 => Cov(st)=54.35%, Cov(llm)=74.39%
[NoParent-Leaf] static=895, llm=716, matched=523 => 