In [5]:
import os
import json

def is_leaf(node: dict) -> bool:
    """判断一个节点是否是叶子节点：children为空 或 不存在."""
    children = node.get("children", [])
    return len(children) == 0

def compare_ast_simplified(node1: dict, node2: dict) -> float:
    """
    对比两棵AST的相似度(范围 0~1)，策略如下：
      1) 若两者均为叶子，则比较 label;
      2) 若两者均为内部节点，则比较 type 与对应子节点;
      3) 若一者是叶子，另一者是内部节点，视为不相似 (0分)。
    """

    if not node1 or not node2:
        # 只要有一个是空，就视为 0 相似
        return 0.0

    c1 = node1.get("children", [])
    c2 = node2.get("children", [])
    leaf1 = is_leaf(node1)
    leaf2 = is_leaf(node2)

    # 1) 均为叶子 => 比 label
    if leaf1 and leaf2:
        label1 = (node1.get("label", "") or "").strip()
        label2 = (node2.get("label", "") or "").strip()
        # 这里示例：label 完全相同 => 1.0 分，否则 0.0 分
        # 也可做模糊对比，如大小写忽略等
        return 1.0 if label1 == label2 and label1 != "" else 0.0

    # 2) 若均为内部节点 => 比 type + children
    elif (not leaf1) and (not leaf2):
        type1 = node1.get("type", "")
        type2 = node2.get("type", "")

        # 类型相同 => 高分; 不同 => 低分
        type_score = 1.0 if type1 == type2 and type1 != "" else 0.0

        # 对应位置的子节点两两对比，并做平均
        len1, len2 = len(c1), len(c2)
        min_len = min(len1, len2)

        if min_len == 0:
            # 都是内部节点，但一方无子 => 仅 type_score
            return type_score * 0.5

        children_score_sum = 0.0
        for i in range(min_len):
            children_score_sum += compare_ast_simplified(c1[i], c2[i])

        # 如果一棵树有更多子节点，则可以视为额外惩罚
        # 这里示例： 多出的每个子节点 => -0.2 分(累积)
        leftover = abs(len1 - len2)
        leftover_penalty = leftover * 0.2

        # 计算平均分
        avg_children_score = children_score_sum / min_len

        # 结合 type_score + children_score
        # 示例：type 占 40%，子节点占 60%
        total = 0.4 * type_score + 0.6 * avg_children_score

        # 施加惩罚
        total -= leftover_penalty
        if total < 0:
            total = 0.0
        if total > 1:
            total = 1.0

        return total
    else:
        # 一个是叶子，另一个是内部节点 => 结构不匹配
        return 0.0


def main():
    

    llm_ast_dir = "./llm_ast/gpt-4o"
    static_ast_dir = "./static_ast"
    # 遍历 llm_ast_dir 中所有文件，找到对应 static_ast_dir 下的同名文件
    llm_files = os.listdir(llm_ast_dir)
    for filename in llm_files:
        llm_ast_path = os.path.join(llm_ast_dir, filename)
        static_ast_path = os.path.join(static_ast_dir, filename)

        if not os.path.exists(static_ast_path):
            print(f"[Warning] No corresponding static AST for {filename} in {static_ast_dir}")
            continue

        # 读取 AST
        with open(llm_ast_path, "r", encoding="utf-8") as f1:
            llm_ast = json.load(f1)
        with open(static_ast_path, "r", encoding="utf-8") as f2:
            static_ast = json.load(f2)

        # 对比
        score = compare_ast_simplified(llm_ast, static_ast)
        print(f"Comparing {filename} => similarity = {score:.2f}")

if __name__ == "__main__":
    main()


Comparing 174.py.json => similarity = 0.00
Comparing 195.py.json => similarity = 0.00
Comparing 11.py.json => similarity = 0.00
Comparing 208.py.json => similarity = 0.30
Comparing 9.py.json => similarity = 0.47
Comparing 13.py.json => similarity = 0.00
Comparing 12.py.json => similarity = 0.00
Comparing 54.py.json => similarity = 0.00
Comparing 60.py.json => similarity = 0.00
Comparing 184.py.json => similarity = 0.00
Comparing 120.py.json => similarity = 0.00
Comparing 139.py.json => similarity = 0.00
Comparing 6.py.json => similarity = 0.00
Comparing 167.py.json => similarity = 0.54
Comparing 107.py.json => similarity = 0.00
Comparing 201.py.json => similarity = 0.00
Comparing 202.py.json => similarity = 0.75
Comparing 138.py.json => similarity = 0.00
Comparing 59.py.json => similarity = 0.00
Comparing 163.py.json => similarity = 0.00
Comparing 129.py.json => similarity = 0.00
Comparing 148.py.json => similarity = 0.00
Comparing 98.py.json => similarity = 0.00
Comparing 180.py.json 