In [5]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from tqdm import tqdm
# 假设 get_llm_answers 从您的 llm 模块导入
from llm import get_llm_answers  

from typing import Set, Tuple

def convert_cfg_json_to_edges(cfg_json: dict) -> list[str]:
    """Flatten a nested CFG JSON tree into textual edge descriptions.

    Edges are derived from block labels only; node IDs are not used. Every
    block with a non-empty label yields one edge per entry of its
    ``successors`` list that also has a non-empty label. The tree is walked
    through the ``blocks``, ``functions`` and ``classes`` lists of entities
    and through the ``blocks`` and ``successors`` lists of blocks.

    The walk tolerates malformed trees: a container that is missing, ``null``
    or not a list is treated as empty, entries that are not dicts are
    skipped, and a label that is not a string counts as empty.

    Args:
        cfg_json: Root entity of the CFG tree.

    Returns:
        Strings of the form ``"Edge N: [Source] <label> => [Target] <label>"``
        numbered in discovery order starting at 0.
    """
    edges: list[str] = []

    def children(node: dict, key: str) -> list[dict]:
        """Return the dict entries stored under *key*, or [] if unusable."""
        value = node.get(key)
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    def get_block_label(block: dict) -> str:
        """Return the block label on one line with collapsed whitespace."""
        label = block.get('label')
        if not isinstance(label, str):
            return ''
        # Real newlines become the two-character sequence "\n" before
        # whitespace runs are collapsed to single spaces.
        return ' '.join(label.replace('\n', '\\n').strip().split())

    def process_entity(entity: dict) -> None:
        """Walk an entity (root/function/class) and its nested entities."""
        for block in children(entity, 'blocks'):
            process_block(block)

        for key in ('functions', 'classes'):
            for sub_entity in children(entity, key):
                process_entity(sub_entity)

    def process_block(block: dict) -> None:
        """Emit the edges leaving *block*, then descend into its children."""
        source_label = get_block_label(block)
        if not source_label:
            # An unlabeled block is skipped together with its subtree.
            return

        for succ in children(block, 'successors'):
            target_label = get_block_label(succ)
            if target_label:
                # len(edges) is the running edge index.
                edges.append(
                    f"Edge {len(edges)}: [Source] {source_label} => "
                    f"[Target] {target_label}"
                )

        # Nested blocks (e.g. inside if/else) and successors may carry
        # further outgoing edges of their own.
        for key in ('blocks', 'successors'):
            for sub_block in children(block, key):
                process_block(sub_block)

    process_entity(cfg_json)
    return edges

import re

def convert_cfg_dot_to_edges(cfg_dot: str) -> list[str]:
    """Turn DOT text with ``NodeN`` identifiers into edge descriptions.

    Node labels are collected from ``NodeN [label="..."]`` declarations;
    each ``NodeA -> NodeB`` occurrence then becomes one string of the form
    ``"Edge i: [Source] <label A> => [Target] <label B>"``. Nodes without a
    collected label are rendered with a placeholder.

    Args:
        cfg_dot: DOT source text.

    Returns:
        One description per edge, in order of appearance.
    """
    label_by_node = {}
    for declaration in re.finditer(
        r'Node(\d+)\s*\[label="((?:[^"\\]|\\.)*)"\]', cfg_dot, re.DOTALL
    ):
        # Trim each physical line of the label, drop blank ones, and glue
        # the rest together with a literal backslash-n.
        kept_lines = []
        for raw_line in declaration.group(2).split('\n'):
            trimmed = raw_line.strip()
            if trimmed:
                kept_lines.append(trimmed)
        label_by_node[int(declaration.group(1))] = "\\n".join(kept_lines)

    descriptions = []
    links = re.finditer(r'Node(\d+)\s*->\s*Node(\d+)', cfg_dot)
    for position, link in enumerate(links):
        origin = label_by_node.get(int(link.group(1)), "[Unknown Source]")
        destination = label_by_node.get(int(link.group(2)), "[Unknown Target]")
        descriptions.append(
            f"Edge {position}: [Source] {origin} => [Target] {destination}"
        )

    return descriptions

def get_prompt(code: str, llm_cfg: dict, static_cfg: str) -> str:
    """Build the prompt asking an LLM to compare two control flow graphs.

    Both graphs are rendered as JSON lists of edge strings before being
    embedded: ``static_cfg`` goes through ``convert_cfg_dot_to_edges`` and
    ``llm_cfg`` through ``convert_cfg_json_to_edges``. The format example in
    the prompt uses the same ``=>`` arrow those converters emit.

    Args:
        code: Source code under analysis, embedded verbatim.
        llm_cfg: CFG JSON tree produced by the LLM.
        static_cfg: DOT text of the static-analysis CFG (ground truth).

    Returns:
        The prompt with leading and trailing whitespace stripped.
    """
    static_edges = json.dumps(convert_cfg_dot_to_edges(static_cfg), indent=2)
    llm_edges = json.dumps(convert_cfg_json_to_edges(llm_cfg), indent=2)

    prompt = f"""
Role: Control Flow Graph Validation Specialist
Objective: Accurately compare CFG structures between static analysis (ground truth) and LLM generation

Ground Truth (static_cfg) and LLM Output (llm_cfg) follow:
[
    "Edge 0: [Source] node_A => [Target] node_B",
    "Edge 1: [Source] node_C => [Target] node_D",
    ...
]

### Comparison Criteria
1. Structure Matching:
   Match edges when:
   - Similar control flow patterns (sequential/conditional/loop)
   - Roughly equivalent position in code structure
   - Similar branching behavior
   - Note: Due to anonymous functions in static analysis, exact content matching is not required
   - Consider edges matched if they serve similar logical purposes

2. Mismatch Conditions:
   - Completely different control flow patterns
   - Edges that cannot be reasonably mapped to each other
   - Major structural differences that affect program logic

### Analysis Task
1. For static_cfg:
   - Count total edges as baseline
   - Identify main control flow patterns
   - Note that anonymous functions may create extra edges

2. For llm_cfg:
   - Count total generated edges
   - Use flexible matching to identify structurally similar edges
   - Focus on logical flow rather than exact matches

3. Output (JSON):
{{
  "edge_analysis": {{
    "static_total": "Number of edges from static analysis",
    "llm_total": "Number of edges generated by LLM",
    "matched_edges": {{
      "exact_matches": "Number of exactly matched edges (type + position)", 
      "partial_matches": "Number of type-matched edges with different positions"
    }},
    "accuracy_metrics": {{
      "precision": "exact_matches / llm_total",
      "recall": "exact_matches / static_total", 
      "f1_score": "2*(precision*recall)/(precision+recall)"
    }}
  }},
  "structure_validation": {{
    "missing_blocks": ["Unmatched static block IDs"],
    "extra_blocks": ["Extra LLM block IDs"]
  }}
}}

### Input Data
TypeScript Code:
{code}

Static Analysis CFG (Ground Truth):
{static_edges}

LLM Generated CFG:
{llm_edges}

Output JSON analysis ONLY.
"""
    return prompt.strip()

from multiprocessing import cpu_count
from llm import get_llm_answers
import json
import os

# Directory that receives one JSON verdict file per processed sample.
results_dir = "results"
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(results_dir, exist_ok=True)

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def process_file(i):
    """Ask the LLM to compare both CFGs of sample *i* and store the verdict.

    The sample is skipped when its result file already exists or when any of
    its three input files is missing. Failures while loading inputs, building
    the prompt, querying the model or parsing its answer are printed with the
    sample index and leave no result file, so the sample is retried on the
    next run.

    Args:
        i: Sample index used to derive all input and output paths.

    Returns:
        None. The verdict is written to ``{results_dir}/{i}.json``.
    """
    code_file = f"../../dataset/ts/{i}.ts"
    llm_cfg_file = f"merged_llm_cfg/{i}.json"
    static_cfg_file = f"../../dataset/ts_cfg/{i}.ts.dot"

    result_file = f"{results_dir}/{i}.json"

    if os.path.exists(result_file):
        return

    inputs = (code_file, llm_cfg_file, static_cfg_file)
    if not all(os.path.exists(path) for path in inputs):
        return

    # Worker boundary: an exception escaping here would be re-raised by
    # executor.map() in the caller and abort the remaining samples, so one
    # unreadable or malformed input must only skip this sample.
    try:
        with open(code_file, encoding="utf-8") as f:
            code = f.read()
        with open(llm_cfg_file, encoding="utf-8") as f:
            llm_cfg = json.load(f)
        with open(static_cfg_file, encoding="utf-8") as f:
            static_cfg = f.read()
        prompt = get_prompt(code, llm_cfg, static_cfg)
    except Exception as e:
        print(f"[{i}] failed to build prompt: {e}")
        return

    print(prompt)
    try:
        res = json.loads(get_llm_answers(prompt, model_name="gpt-4o", require_json=True))
    except Exception as e:
        # Include the sample index so the failure can be traced to a file.
        print(f"[{i}] {e}")
        return

    with open(result_file, "w", encoding="utf-8") as f:
        json.dump(res, f, indent=2)

# Run process_file for sample indices 0..199 on a thread pool sized to the
# CPU count. To debug a single sample, call process_file(0) directly instead.
files = list(range(200))
with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    outcomes = executor.map(process_file, files)
    # Draining the iterator advances the progress bar and surfaces any
    # exception raised inside a worker.
    for _ in tqdm(outcomes, total=len(files), desc="处理文件"):
        pass


Role: Control Flow Graph Validation Specialist
Objective: Accurately compare CFG structures between static analysis (ground truth) and LLM generation

Ground Truth (static_cfg) and LLM Output (llm_cfg) follow:
[
    "Edge 0: [Source] node_A -> [Target] node_B",
    "Edge 1: [Source] node_C -> [Target] node_D",
    ...
]

### Comparison Criteria
1. Structure Matching:
   Match edges when:
   - Similar control flow patterns (sequential/conditional/loop)
   - Roughly equivalent position in code structure
   - Similar branching behavior
   - Note: Due to anonymous functions in static analysis, exact content matching is not required
   - Consider edges matched if they serve similar logical purposes

2. Mismatch Conditions:
   - Completely different control flow patterns
   - Edges that cannot be reasonably mapped to each other
   - Major structural differences that affect program logic

### Analysis Task
1. For static_cfg:
   - Count total edges as baseline
   - Identify main control flow p

处理文件:   0%|          | 0/200 [00:00<?, ?it/s]

Role: Control Flow Graph Validation Specialist
Objective: Accurately compare CFG structures between static analysis (ground truth) and LLM generation

Ground Truth (static_cfg) and LLM Output (llm_cfg) follow:
[
    "Edge 0: [Source] node_A -> [Target] node_B",
    "Edge 1: [Source] node_C -> [Target] node_D",
    ...
]

### Comparison Criteria
1. Structure Matching:
   Match edges when:
   - Similar control flow patterns (sequential/conditional/loop)
   - Roughly equivalent position in code structure
   - Similar branching behavior
   - Note: Due to anonymous functions in static analysis, exact content matching is not required
   - Consider edges matched if they serve similar logical purposes

2. Mismatch Conditions:
   - Completely different control flow patterns
   - Edges that cannot be reasonably mapped to each other
   - Major structural differences that affect program logic

### Analysis Task
1. For static_cfg:
   - Count total edges as baseline
   - Identify main control flow p

处理文件:   6%|▋         | 13/200 [00:30<07:16,  2.33s/it]

请求失败,已重试3次: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 157154 tokens. Please reduce the length of the messages. (request id: 20250207231927415097734pkJdsqHV) (request id: 20250207231923278334681rDkWufK)", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


处理文件: 100%|██████████| 200/200 [00:46<00:00,  4.33it/s]

Unterminated string starting at: line 57 column 7 (char 12632)





In [6]:
import os
import json
import numpy as np

def calculate_global_metrics(results_dir: str) -> dict:
    """Aggregate the per-file verdicts in *results_dir* into global metrics.

    Every ``*.json`` file is expected to hold an ``edge_analysis`` section
    (``static_total``, ``llm_total``, ``matched_edges.exact_matches``,
    ``matched_edges.partial_matches``) and a ``structure_validation`` section
    (``missing_blocks``, ``extra_blocks``). A file is counted all-or-nothing:
    if it cannot be read, lacks a key or carries a non-numeric count, it is
    recorded as an error and contributes nothing to the totals.

    Precision, recall and F1 are computed from exact plus partial matches and
    are 0 when their denominator is 0.

    Args:
        results_dir: Directory containing the per-file JSON verdicts.

    Returns:
        A dict with ``file_count``, ``edge_metrics``, ``structure_metrics``
        and ``data_quality`` entries.
    """
    total_files = 0
    gt_edges = 0
    llm_edges = 0
    exact_matches = 0
    partial_matches = 0
    # NOTE(review): block identifiers are pooled across all files, so the same
    # identifier reported for two files is counted once - confirm intended.
    missing_blocks: set[str] = set()
    extra_blocks: set[str] = set()
    file_errors: list[str] = []

    for filename in os.listdir(results_dir):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(results_dir, filename)
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)

            # Extract and validate everything before touching the totals, so
            # a file that fails midway cannot leave partial contributions.
            edge_analysis = data["edge_analysis"]
            matched = edge_analysis["matched_edges"]
            counts = (
                edge_analysis["static_total"],
                edge_analysis["llm_total"],
                matched["exact_matches"],
                matched["partial_matches"],
            )
            for value in counts:
                if not isinstance(value, (int, float)):
                    raise TypeError(f"non-numeric edge count: {value!r}")

            structure = data["structure_validation"]
            file_missing = {str(block) for block in structure["missing_blocks"]}
            file_extra = {str(block) for block in structure["extra_blocks"]}
        except Exception as e:
            file_errors.append(f"{filename}: {str(e)}")
            continue

        # The file is fully valid: commit its contribution.
        gt_edges += counts[0]
        llm_edges += counts[1]
        exact_matches += counts[2]
        partial_matches += counts[3]
        missing_blocks |= file_missing
        extra_blocks |= file_extra
        total_files += 1

    # Derived metrics; both exact and partial matches count as matched.
    total_matched = exact_matches + partial_matches

    precision = total_matched / llm_edges if llm_edges > 0 else 0
    recall = total_matched / gt_edges if gt_edges > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "file_count": total_files,
        "edge_metrics": {
            "static_total": gt_edges,
            "llm_total": llm_edges,
            "exact_matches": exact_matches,
            "partial_matches": partial_matches,
            "total_matched": total_matched,
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1_score": round(f1, 4)
        },
        "structure_metrics": {
            "missing_blocks_count": len(missing_blocks),
            "extra_blocks_count": len(extra_blocks),
            "missing_block_samples": list(missing_blocks)[:5],  # at most 5 samples
            "extra_block_samples": list(extra_blocks)[:5]
        },
        "data_quality": {
            "error_files": len(file_errors),
            "error_samples": file_errors[:3]  # at most 3 sample errors
        }
    }

def print_report(metrics: dict) -> None:
    """Print the aggregated CFG evaluation metrics as a text report.

    Match counts are shown with their share of the static-analysis edge
    total; when that total is 0 the share is reported as 0 instead of
    raising ``ZeroDivisionError``.

    Args:
        metrics: Dict shaped like the return value of
            ``calculate_global_metrics`` (``file_count``, ``edge_metrics``,
            ``structure_metrics``, ``data_quality``).
    """
    em = metrics['edge_metrics']
    sm = metrics['structure_metrics']
    dq = metrics['data_quality']
    static_total = em['static_total']

    def share(count) -> float:
        """Return count / static_total, or 0.0 when there are no edges."""
        return count / static_total if static_total else 0.0

    print("CFG 评估报告")
    print("=" * 50)
    print(f"分析文件总数: {metrics['file_count']}")

    print("\n[边匹配分析]")
    print(f"静态分析总边数: {em['static_total']}")
    print(f"LLM生成总边数: {em['llm_total']}")
    print(f"精确匹配边数: {em['exact_matches']} ({share(em['exact_matches']):.2%})")
    print(f"部分匹配边数: {em['partial_matches']} ({share(em['partial_matches']):.2%})")
    print(f"总匹配边数: {em['total_matched']} ({share(em['total_matched']):.2%})")
    print(f"精确率 (Precision): {em['precision']:.4f}")
    print(f"召回率 (Recall): {em['recall']:.4f}")
    print(f"F1 值: {em['f1_score']:.4f}")

    print("\n[结构验证]")
    print(f"缺失块总数: {sm['missing_blocks_count']} (示例: {', '.join(sm['missing_block_samples'])})")
    print(f"多余块总数: {sm['extra_blocks_count']} (示例: {', '.join(sm['extra_block_samples'])})")

    # The data-quality section appears only when some file failed to load.
    if dq['error_files'] > 0:
        print("\n[数据质量问题]")
        print(f"错误文件数: {dq['error_files']}")
        print("示例错误:")
        for err in dq['error_samples']:
            print(f"  - {err}")

    print("=" * 50)

# 使用示例
# Example usage: aggregate every verdict stored under "results" and print
# the formatted report.
results_dir = "results"
metrics = calculate_global_metrics(results_dir)
print_report(metrics)

CFG 评估报告
分析文件总数: 193

[边匹配分析]
静态分析总边数: 2351
LLM生成总边数: 4316
精确匹配边数: 612 (26.03%)
部分匹配边数: 720 (30.63%)
总匹配边数: 1332 (56.66%)
精确率 (Precision): 0.3086
召回率 (Recall): 0.5666
F1 值: 0.3996

[结构验证]
缺失块总数: 466 (示例: id:163, id:14, id:76, id:105, 22)
多余块总数: 948 (示例: // Utilities\nimport { clamp } from '@/util/helpers'\n// Types\nimport type { RGB, XYZ } from '../colorUtils', Edge 68: [Source] isTeamsUpdateProps({\n teamName: 'teamName', => [Target] extra: 'extra',\n data: {, Edge 0 (llm), Edge 1: [Source] function example(a) { => [Target] else {\nconsole.log(1);, Edge 212)
