In [10]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
用法示例：
  python visualize_ast.py ast.json ast.pdf

该脚本会读取指定的 JSON AST（包含字段：type, label, children 等），
并使用 Graphviz 生成可视化的 PDF 文件。
"""

import os
import sys
import json
import uuid
import warnings
from graphviz import Digraph

def sanitize_label(text: str) -> str:
    """
    对可能导致 Graphviz 解析异常的字符做转义或替换。
    可根据需要自行添加/修改规则。
    """
    # 1) 先把反斜杠本身转义成 \\，以免它干扰其他转义
    text = text.replace('\\', '\\\\')
    # 2) 把双引号 " 转义成 \"
    text = text.replace('"', '\\"')
    # 3) 移除或替换三引号（如果确实有的话）
    text = text.replace("'''", "'")
    text = text.replace('"""', '"')
    # 4) 把换行符替换为 \\n 或者直接空格
    text = text.replace('\n', '\\n')
    # 5) 如果担心 '3d' 之类会出问题，可以进一步处理
    #    但要看实际情况，这里仅作为示例注释：
    # import re
    # text = re.sub(r'(\d+)d', r'\1 d', text)

    return text

def traverse_ast(dot: Digraph, node: dict, parent_id: str = None):
    """
    DFS遍历AST，用Graphviz构建有向图。
    
    :param dot:       Graphviz Digraph 对象
    :param node:      当前AST节点(dict: {type, label, children})
    :param parent_id: 父节点的ID(若无则为 None)
    """
    if not node:
        return

    try:
        # 生成唯一ID，以保证不同节点不会重复
        current_id = str(uuid.uuid4())

        # 提取 type / label，清理后再拼接
        node_type_raw = node.get("type", "")
        node_label_raw = node.get("label", "")

        # 对 type / label 分别做转义
        node_type_clean = sanitize_label(node_type_raw)
        node_label_clean = sanitize_label(node_label_raw)

        # 若 label 过长可做截断，这里只是演示
        max_len = 50
        if len(node_label_clean) > max_len:
            node_label_clean = node_label_clean[:max_len - 3] + "..."

        # 拼接为最终标签：type + 换行 + label
        label_text = f"{node_type_clean}\\n{node_label_clean}"

        # 创建节点
        dot.node(current_id, label_text, shape='box')

        # 连接父子
        if parent_id is not None:
            dot.edge(parent_id, current_id)

        # 递归处理子节点
        for child in node.get("children", []):
            traverse_ast(dot, child, current_id)
            
    except Exception as e:
        print(f"[Error] Failed to traverse node: {e}")
        return

def visualize_ast(input_json: str, output_pdf: str):
    """
    读取 JSON 文件并渲染为 PDF
    """
    # 读取 JSON AST
    if '4.py.json' in input_json:
        print(1)
    if os.path.exists(output_pdf + ".pdf"):
        return
    if not os.path.exists(input_json):
        print(f"[Error] 文件不存在: {input_json}")
        return
    try:
        with open(input_json, "r", encoding="utf-8") as f:
            ast_data = json.load(f)
    except Exception as e:
        print(f"[Error] 读取文件失败: {input_json}, 原因: {e}")
        sys.exit(1)

    try:
        # 初始化 Graphviz Digraph
        dot = Digraph(comment='AST', format='pdf')
        
        # 设置图形属性以控制布局
        dot.attr(rankdir='TB')      # 从上到下布局
        dot.attr(ranksep='0.6')     # 减小层级间距
        dot.attr(nodesep='0.4')     # 减小同层节点间距
        dot.attr(size='8,11.7')     # 设置A4纸大致尺寸
        # dot.attr(ratio='compress') # 若想尝试自动压缩可取消注释
        dot.attr(dpi='150')         # DPI越低，整体生成图可能越小

        # DFS 构建图
        traverse_ast(dot, ast_data, parent_id=None)

        # 渲染输出 PDF
        # 这里指定 cleanup=True，会把中间生成的 .gv 等文件清理掉，只保留最终 PDF
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            out_path = dot.render(output_pdf, view=False, cleanup=True)
        print(f"[OK] PDF 输出成功 => {out_path}")
    except Exception as e:
        print(f"[Error] 生成可视化失败: {e}")
        sys.exit(1)

def main():
    """
    这是一个示例的批处理逻辑：
    1. 从 llm_ast_dir 中读取每个 .json 文件，输出到 visualize_llm_dir。
    2. 从 static_ast_dir 中读取对应的 .json 文件，输出到 visualize_static_dir。
    """
    llm_ast_dir = "./llm_ast/cfg_block"
    static_ast_dir = "../../dataset/python_ast"

    visualize_static_dir = "./visualize/static"
    visualize_llm_dir = "./visualize/llm"
    os.makedirs(visualize_static_dir, exist_ok=True)
    os.makedirs(visualize_llm_dir, exist_ok=True)

    for i in range(200):
        file = f"{i}.py.json"
        if not file.endswith('.json'):
            continue
        
        # 1) 处理 LLM AST
        json_file = os.path.join(llm_ast_dir, file)
        output_pdf = os.path.join(visualize_llm_dir, file.replace('.json', ''))
        visualize_ast(json_file, output_pdf)

        # 2) 处理静态 AST
        json_file = os.path.join(static_ast_dir, file)
        output_pdf = os.path.join(visualize_static_dir, file.replace('.json', ''))
        visualize_ast(json_file, output_pdf)

if __name__ == "__main__":
    # 如果你是在 IPython / Jupyter 下执行脚本，有时会出现:
    # "UserWarning: To exit: use 'exit', 'quit', or Ctrl-D."
    # 可以通过下面的方式忽略此类警告：
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=r".*To exit: use 'exit', 'quit', or Ctrl-D.*")
        main()

    # 主流程完成后强制退出，避免在 IPython 环境下出现提示
    sys.exit(0)


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
[Error] 文件不存在: ./llm_ast/cfg_block/97.py.json
[Error] 文件不存在: ../../dataset/python_ast/97.py.json
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
import os
import json

def is_leaf(node: dict) -> bool:
    """判断一个节点是否是叶子节点：children为空 或 不存在."""
    children = node.get("children", [])
    return len(children) == 0

def compare_ast_simplified(node1: dict, node2: dict) -> float:
    """
    对比两棵AST的相似度(范围 0~1)，策略如下：
      1) 若两者均为叶子，则比较 label;
      2) 若两者均为内部节点，则比较 type 与对应子节点;
      3) 若一者是叶子，另一者是内部节点，视为不相似 (0分)。
    """

    if not node1 or not node2:
        # 只要有一个是空，就视为 0 相似
        return 0.0

    c1 = node1.get("children", [])
    c2 = node2.get("children", [])
    leaf1 = is_leaf(node1)
    leaf2 = is_leaf(node2)

    # 1) 均为叶子 => 比 label
    if leaf1 and leaf2:
        label1 = (node1.get("label", "") or "").strip()
        label2 = (node2.get("label", "") or "").strip()
        # 这里示例：label 完全相同 => 1.0 分，否则 0.0 分
        # 也可做模糊对比，如大小写忽略等
        return 1.0 if label1 == label2 and label1 != "" else 0.0

    # 2) 若均为内部节点 => 比 type + children
    elif (not leaf1) and (not leaf2):
        type1 = node1.get("type", "")
        type2 = node2.get("type", "")

        # 类型相同 => 高分; 不同 => 低分
        type_score = 1.0 if type1 == type2 and type1 != "" else 0.0

        # 对应位置的子节点两两对比，并做平均
        len1, len2 = len(c1), len(c2)
        min_len = min(len1, len2)

        if min_len == 0:
            # 都是内部节点，但一方无子 => 仅 type_score
            return type_score * 0.5

        children_score_sum = 0.0
        for i in range(min_len):
            children_score_sum += compare_ast_simplified(c1[i], c2[i])

        # 如果一棵树有更多子节点，则可以视为额外惩罚
        # 这里示例： 多出的每个子节点 => -0.2 分(累积)
        leftover = abs(len1 - len2)
        leftover_penalty = leftover * 0.2

        # 计算平均分
        avg_children_score = children_score_sum / min_len

        # 结合 type_score + children_score
        # 示例：type 占 40%，子节点占 60%
        total = 0.4 * type_score + 0.6 * avg_children_score

        # 施加惩罚
        total -= leftover_penalty
        if total < 0:
            total = 0.0
        if total > 1:
            total = 1.0

        return total
    else:
        # 一个是叶子，另一个是内部节点 => 结构不匹配
        return 0.0


def main():
    

    llm_ast_dir = "./llm_ast/gpt-4o"
    static_ast_dir = "./static_ast"
    # 遍历 llm_ast_dir 中所有文件，找到对应 static_ast_dir 下的同名文件
    llm_files = os.listdir(llm_ast_dir)
    for filename in llm_files:
        llm_ast_path = os.path.join(llm_ast_dir, filename)
        static_ast_path = os.path.join(static_ast_dir, filename)

        if not os.path.exists(static_ast_path):
            print(f"[Warning] No corresponding static AST for {filename} in {static_ast_dir}")
            continue

        # 读取 AST
        with open(llm_ast_path, "r", encoding="utf-8") as f1:
            llm_ast = json.load(f1)
        with open(static_ast_path, "r", encoding="utf-8") as f2:
            static_ast = json.load(f2)

        # 对比
        score = compare_ast_simplified(llm_ast, static_ast)
        print(f"Comparing {filename} => similarity = {score:.2f}")

if __name__ == "__main__":
    main()


Comparing 174.py.json => similarity = 0.00
Comparing 195.py.json => similarity = 0.00
Comparing 11.py.json => similarity = 0.00
Comparing 208.py.json => similarity = 0.30
Comparing 9.py.json => similarity = 0.47
Comparing 13.py.json => similarity = 0.00
Comparing 12.py.json => similarity = 0.00
Comparing 54.py.json => similarity = 0.00
Comparing 60.py.json => similarity = 0.00
Comparing 184.py.json => similarity = 0.00
Comparing 120.py.json => similarity = 0.00
Comparing 139.py.json => similarity = 0.00
Comparing 6.py.json => similarity = 0.00
Comparing 167.py.json => similarity = 0.54
Comparing 107.py.json => similarity = 0.00
Comparing 201.py.json => similarity = 0.00
Comparing 202.py.json => similarity = 0.75
Comparing 138.py.json => similarity = 0.00
Comparing 59.py.json => similarity = 0.00
Comparing 163.py.json => similarity = 0.00
Comparing 129.py.json => similarity = 0.00
Comparing 148.py.json => similarity = 0.00
Comparing 98.py.json => similarity = 0.00
Comparing 180.py.json 