In [37]:
import json
import re
from typing import Dict, List, Optional, Union
from dataclasses import dataclass
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np


@dataclass
class CFGSimilarityResult:
    """存储CFG比较结果的数据类"""
    filename: str
    edge_coverage: float
    content_similarity: float
    structure_similarity: float
    matched_edges: int
    gt_edges: int
    llm_edges: int
    nested_results: Optional[Dict[str, 'CFGSimilarityResult']] = None


class CFGComparator:
    def __init__(self):
        """初始化CFG比较器"""
        pass

    @staticmethod
    def count_edges(cfg: Dict) -> int:
        """
        递归计算CFG中的边数量，包括所有嵌套结构的边
        """
        def count_edges_recursive(blocks):
            edge_count = 0
            for block in blocks:
                successors = block.get("successors", [])
                edge_count += len(successors)  # 当前 block 的边数量
                # 递归计算 successors 的边
                for successor in successors:
                    if isinstance(successor, dict):  # successor 也是一个带有 blocks 的嵌套结构
                        edge_count += count_edges_recursive(successor.get("blocks", []))
            return edge_count

        # 统计顶层 blocks 的边
        edge_count = count_edges_recursive(cfg.get("blocks", []))

        # 递归统计嵌套函数和类的边
        for func in cfg.get("functions", []):
            edge_count += CFGComparator.count_edges(func)
        for cls in cfg.get("classes", []):
            edge_count += CFGComparator.count_edges(cls)

        return edge_count

    def structure_similarity(self, llm_cfg: Dict, static_cfg: Dict) -> float:
        """
        计算两个CFG的结构相似度（基于边的数量）。
        """
        llm_edges = self.count_edges(llm_cfg)
        static_edges = self.count_edges(static_cfg)

        # 如果两者都没有边，则结构上可以视为“极度简化”——按需设为 1.0
        if llm_edges == 0 and static_edges == 0:
            return 1.0
        if llm_edges == 0 or static_edges == 0:
            return 0.0

        # 较简单的衡量：用最小/最大，值越接近1越表示边数接近
        return min(llm_edges, static_edges) / max(llm_edges, static_edges)

    def content_similarity(self, llm_cfg: Dict, static_cfg: Dict) -> float:
        """
        计算两个CFG的内容相似度（基于 blocks 的内容）。
        如果 label 是列表，则将其合并为一个字符串。
        并在向量化前检查文本是否为空。
        """
        llm_blocks = llm_cfg.get("blocks", [])
        static_blocks = static_cfg.get("blocks", [])

        # 如果两边都没有 blocks，视为内容相同
        if not llm_blocks and not static_blocks:
            return 1.0
        # 如果一边没 blocks 则视为没有可比内容
        if not llm_blocks or not static_blocks:
            return 0.0

        def unify_label(label_value: Union[str, List[str], None]) -> str:
            """将 label 统一成单一字符串。"""
            if isinstance(label_value, list):
                return "\n".join(str(item) for item in label_value)
            elif isinstance(label_value, str):
                return label_value
            else:
                return ""

        # 将 blocks 的 label 统一转换为字符串后再拼成大文本
        llm_code = "\n".join(unify_label(block.get("label", "")) for block in llm_blocks)
        static_code = "\n".join(unify_label(block.get("label", "")) for block in static_blocks)

        # 进一步清洗空白字符
        llm_code = llm_code.strip()
        static_code = static_code.strip()

        # 如果两个文本都空，视为相似度 1.0
        if not llm_code and not static_code:
            return 1.0
        # 如果只有一方空，则相似度 0.0
        if not llm_code or not static_code:
            return 0.0

        # ======== 以下为正常文本相似度计算 ========
        def custom_tokenizer(text):
            return re.findall(r'\b\w+\b', text)

        def custom_preprocessor(text):
            # 简单删除 # 开头的注释，可按需扩展
            text = re.sub(r'#.*', '', text)
            return text

        vectorizer = CountVectorizer(
            tokenizer=custom_tokenizer,
            preprocessor=custom_preprocessor,
            stop_words='english',
            ngram_range=(1, 2)
        )

        try:
            vectors = vectorizer.fit_transform([llm_code, static_code])
            similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
        except ValueError:
            # 捕获 "empty vocabulary" 异常，如果文本过于短或只含停用词，fallback 为 0.0
            similarity = 0.0

        return similarity

    def compare_cfgs(self, llm_cfg: Dict, static_cfg: Dict, name: str) -> CFGSimilarityResult:
        """递归比较两个CFG并返回相似度结果"""
        structure_sim = self.structure_similarity(llm_cfg, static_cfg)
        content_sim = self.content_similarity(llm_cfg, static_cfg)

        gt_edges = self.count_edges(static_cfg)
        llm_edges = self.count_edges(llm_cfg)

        matched_edges = int(structure_sim * min(gt_edges, llm_edges))
        edge_coverage = matched_edges / gt_edges if gt_edges > 0 else 0

        nested_results = {}

        # === 递归比较 methods (functions) ===
        llm_functions = {f["name"]: f for f in llm_cfg.get("functions", [])}
        static_functions = {f["name"]: f for f in static_cfg.get("functions", [])}
        common_functions = set(llm_functions.keys()) & set(static_functions.keys())

        for func_name in common_functions:
            nested_results[f"function_{func_name}"] = self.compare_cfgs(
                llm_functions[func_name],
                static_functions[func_name],
                func_name
            )

        # === 递归比较 classes ===
        llm_classes = {c["name"]: c for c in llm_cfg.get("classes", [])}
        static_classes = {c["name"]: c for c in static_cfg.get("classes", [])}
        common_classes = set(llm_classes.keys()) & set(static_classes.keys())

        for class_name in common_classes:
            nested_results[f"class_{class_name}"] = self.compare_cfgs(
                llm_classes[class_name],
                static_classes[class_name],
                class_name
            )

        return CFGSimilarityResult(
            filename=name,
            edge_coverage=edge_coverage,
            content_similarity=content_sim,
            structure_similarity=structure_sim,
            matched_edges=matched_edges,
            gt_edges=gt_edges,
            llm_edges=llm_edges,
            nested_results=nested_results if nested_results else None
        )


class CFGEvaluator:
    def __init__(self, llm_cfg_dir: str, static_cfg_dir: str, result_file: str):
        self.llm_cfg_dir = Path(llm_cfg_dir)
        self.static_cfg_dir = Path(static_cfg_dir)
        self.result_file = Path(result_file)
        self.comparator = CFGComparator()
        self.results: List[CFGSimilarityResult] = []

    def process_file(self, llm_cfg_path: Path) -> Optional[CFGSimilarityResult]:
        static_cfg_path = self.static_cfg_dir / llm_cfg_path.name
        if not static_cfg_path.exists():
            return None

        with open(llm_cfg_path, "r", encoding="utf-8") as f:
            llm_cfg = json.load(f)
        with open(static_cfg_path, "r", encoding="utf-8") as f:
            static_cfg = json.load(f)

        result = self.comparator.compare_cfgs(llm_cfg, static_cfg, llm_cfg_path.name)
        self.results.append(result)
        self.save_results()
        return result

    def save_results(self):
        with open(self.result_file, "w", encoding="utf-8") as f:
            json.dump(
                [self._result_to_dict(r) for r in self.results],
                f,
                indent=2,
                ensure_ascii=False
            )

    def evaluate_all(self) -> List[CFGSimilarityResult]:
        llm_cfg_paths = list(self.llm_cfg_dir.glob("*.json"))
        for llm_cfg_path in tqdm(llm_cfg_paths, desc="处理CFG文件"):
            self.process_file(llm_cfg_path)
        return self.results

    @staticmethod
    def _result_to_dict(result: CFGSimilarityResult) -> Dict:
        return {
            "filename": result.filename,
            "edge_coverage": result.edge_coverage,
            "content_similarity": result.content_similarity,
            "structure_similarity": result.structure_similarity,
            "matched_edges": result.matched_edges,
            "gt_edges": result.gt_edges,
            "llm_edges": result.llm_edges,
            "nested_results": (
                {
                    k: CFGEvaluator._result_to_dict(v)
                    for k, v in result.nested_results.items()
                }
                if result.nested_results
                else None
            )
        }


def calculate_aggregate_metrics(results: List[CFGSimilarityResult]) -> Dict[str, float]:
    """对比较结果做全局统计。"""
    if not results:
        return {
            "total_cfgs_compared": 0,
            "average_edge_coverage": 0.0,
            "average_content_similarity": 0.0,
            "average_structure_similarity": 0.0,
            "total_gt_edges": 0,
            "total_llm_edges": 0,
            "total_matched_edges": 0
        }

    metrics = {
        "total_cfgs_compared": len(results),
        "average_edge_coverage": float(np.mean([r.edge_coverage for r in results])),
        "average_content_similarity": float(np.mean([r.content_similarity for r in results])),
        "average_structure_similarity": float(np.mean([r.structure_similarity for r in results])),
        "total_gt_edges": sum(r.gt_edges for r in results),
        "total_llm_edges": sum(r.llm_edges for r in results),
        "total_matched_edges": sum(r.matched_edges for r in results)
    }
    return metrics


def main():
    evaluator = CFGEvaluator(
        llm_cfg_dir="merged_llm_cfg_with_line_no",
        static_cfg_dir="../../dataset/python_cfg",
        result_file="evaluation_results.json"
    )

    results = evaluator.evaluate_all()
    metrics = calculate_aggregate_metrics(results)

    print("\nAutomatic Evaluation Summary:")
    print(f"Total CFGs compared: {metrics['total_cfgs_compared']}")
    print(f"Average Edge Coverage: {metrics['average_edge_coverage']:.2f}")
    print(f"Average Content Similarity: {metrics['average_content_similarity']:.2f}")
    print(f"Average Structure Similarity: {metrics['average_structure_similarity']:.2f}")

    with open("evaluation_metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()


处理CFG文件: 100%|██████████| 190/190 [00:33<00:00,  5.62it/s]


Automatic Evaluation Summary:
Total CFGs compared: 190
Average Edge Coverage: 0.54
Average Content Similarity: 0.76
Average Structure Similarity: 0.71





In [33]:
import os
import json
from typing import Dict, List, Set, Tuple

def extract_line_based_edges(cfg_json: Dict) -> Set[Tuple[int, int]]:
    """
    从已经带 start_line/end_line 信息的 CFG JSON 中，递归提取所有边的 (source_end_line, target_start_line)。
    可以处理顶层 blocks，也会递归处理 functions、classes 中的 blocks。

    参数：
        cfg_json: 一个 CFG 的字典结构，示例:
                  {
                    "name": ...,
                    "type": "CFG",
                    "start_line": 1,
                    "end_line": 34,
                    "blocks": [...],
                    "functions": [...],
                    "classes": [...]
                  }

    返回：
        edges: 一个 set，元素为 (end_line, start_line) 元组。
               如果某 block 或其 successor 没有行号，则跳过。
    """

    edges: Set[Tuple[int, int]] = set()

    def traverse_block(block: Dict):
        """
        递归遍历单个 block 及其 successors，收集边 (source_end_line, target_start_line)。
        """
        source_end_line = block.get("end_line", None)
        successors = block.get("successors", [])

        for succ in successors:
            target_start_line = succ.get("start_line", None)
            if source_end_line is not None and target_start_line is not None:
                edges.add((source_end_line, target_start_line))
            # 递归处理后继节点本身，可能也有嵌套 successors
            traverse_block(succ)

    def traverse_cfg(cfg: Dict):
        """
        1. 遍历当前 CFG 级别的 blocks
        2. 递归处理 functions、classes
        """
        # 先处理本级别的 blocks
        for blk in cfg.get("blocks", []):
            traverse_block(blk)

        # 递归处理所有子函数
        for func_cfg in cfg.get("functions", []):
            edges_in_func = extract_line_based_edges(func_cfg)
            edges.update(edges_in_func)

        # 递归处理所有子类
        for class_cfg in cfg.get("classes", []):
            edges_in_class = extract_line_based_edges(class_cfg)
            edges.update(edges_in_class)

    traverse_cfg(cfg_json)
    return edges


def compare_cfgs_by_line_edges(llm_cfg: Dict, static_cfg: Dict) -> Dict[str, float]:
    """
    仅依赖 (source_end_line, target_start_line) 来比较两份 CFG 的边。

    返回：
    {
      "matched_edges": int,        # 匹配成功的边数
      "total_static_edges": int,   # 静态 CFG 的边数
      "total_llm_edges": int,      # LLM 生成的边数
      "edge_coverage": float       # matched_edges / total_static_edges, 若 static_edges=0 则=0
    }
    """
    llm_edges = extract_line_based_edges(llm_cfg)
    static_edges = extract_line_based_edges(static_cfg)

    matched = llm_edges.intersection(static_edges)
    coverage = 0.0
    if len(static_edges) > 0:
        coverage = len(matched) / len(static_edges)

    return {
        "matched_edges": len(matched),
        "total_static_edges": len(static_edges),
        "total_llm_edges": len(llm_edges),
        "edge_coverage": coverage
    }


def compare_all_cfgs(llm_cfg_dir: str, static_cfg_dir: str, output_file: str = "evaluation_results.json"):
    """
    批量对比两个目录下的 CFG JSON 文件（文件名相同）。只比较 (end_line, start_line)。
    并把结果保存到指定的 JSON 文件里。
    """

    # 收集所有 .json 文件（假设两边目录里文件名一一对应）
    llm_files = [f for f in os.listdir(llm_cfg_dir) if f.endswith(".json")]

    results = []
    for fname in llm_files:
        llm_path = os.path.join(llm_cfg_dir, fname)
        static_path = os.path.join(static_cfg_dir, fname)

        if not os.path.exists(static_path):
            print(f"警告：{static_path} 不存在，跳过 {fname}。")
            continue

        # 加载 CFG
        with open(llm_path, "r", encoding="utf-8") as f1:
            llm_cfg = json.load(f1)
        with open(static_path, "r", encoding="utf-8") as f2:
            static_cfg = json.load(f2)

        # 调用比较函数
        compare_result = compare_cfgs_by_line_edges(llm_cfg, static_cfg)
        compare_result["filename"] = fname  # 附加文件名信息
        results.append(compare_result)

    # 把结果写入 JSON
    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(results, out, indent=2, ensure_ascii=False)

    return results


if __name__ == "__main__":
    # 使用示例
    llm_cfg_dir = "llm_cfg_with_line_no"  # LLM 生成的 CFG JSON 路径
    static_cfg_dir = "../../dataset/python_cfg"  # 静态分析生成的 CFG JSON 路径
    output_file = "evaluation_results.json"

    # 批量对比
    all_results = compare_all_cfgs(llm_cfg_dir, static_cfg_dir, output_file=output_file)
    print(all_results)

    # 可计算一个总体统计，比如平均覆盖率
    if all_results:
        avg_coverage = sum(r["edge_coverage"] for r in all_results) / len(all_results)
        print(f"处理了 {len(all_results)} 个文件，平均覆盖率：{avg_coverage:.2f}")
    else:
        print("没有对比到任何文件。")


[{'matched_edges': 4, 'total_static_edges': 12, 'total_llm_edges': 12, 'edge_coverage': 0.3333333333333333, 'filename': '95.json'}, {'matched_edges': 0, 'total_static_edges': 0, 'total_llm_edges': 5, 'edge_coverage': 0.0, 'filename': '110.json'}, {'matched_edges': 0, 'total_static_edges': 0, 'total_llm_edges': 5, 'edge_coverage': 0.0, 'filename': '160.json'}, {'matched_edges': 3, 'total_static_edges': 9, 'total_llm_edges': 10, 'edge_coverage': 0.3333333333333333, 'filename': '94.json'}, {'matched_edges': 30, 'total_static_edges': 121, 'total_llm_edges': 121, 'edge_coverage': 0.24793388429752067, 'filename': '38.json'}, {'matched_edges': 20, 'total_static_edges': 82, 'total_llm_edges': 48, 'edge_coverage': 0.24390243902439024, 'filename': '21.json'}, {'matched_edges': 0, 'total_static_edges': 1, 'total_llm_edges': 4, 'edge_coverage': 0.0, 'filename': '187.json'}, {'matched_edges': 0, 'total_static_edges': 2, 'total_llm_edges': 4, 'edge_coverage': 0.0, 'filename': '121.json'}, {'matched_