In [1]:
llm_cg_dir = "llm_cg"
static_cg_dir = "../../dataset/ts_cg"
source_dir = "../../dataset/ts"

import os
import json

datas = []

for i in range(200):
    llm_cg_path = os.path.join(llm_cg_dir, f"{i}.json")
    static_cg_path = os.path.join(static_cg_dir, f"{i}.ts.json")
    source_path = os.path.join(source_dir, f"{i}.ts")

    if not os.path.exists(llm_cg_path) or not os.path.exists(static_cg_path) or not os.path.exists(source_path):
        continue

    with open(source_path, "r", encoding="utf-8") as f:
        source_code = f.read()

    with open(llm_cg_path, "r", encoding="utf-8") as f:
        llm_cg = json.load(f)  
    with open(static_cg_path, "r", encoding="utf-8") as f:
        static_cg = json.load(f)

    data = {    
        "source_code": source_code,
        "llm_cg": llm_cg,
        "static_cg": static_cg
    }
    datas.append(data)

with open("cg_task.jsonl", "w", encoding="utf-8") as f:
    for data in datas:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")



In [3]:
from llm import get_llm_answers
import json
import concurrent.futures
import os

def process_line(line):
    data = json.loads(line)
    source_code = data["source_code"]
    llm_cg = data["llm_cg"]
    static_cg = data["static_cg"]

    prompt = f""" I will give you a typescript code and its call graph generated by llm and static analysis.
You can treat the static call graph as the ground truth.
Please compare the two call graphs and evaluate the llm call graph.
TypeScript code:
{source_code}
LLM call graph:
{llm_cg}
Static call graph:
{static_cg} 

Sum of LLM call graph: {sum(len(v) for v in llm_cg.values())}
Sum of Static call graph: {sum(len(v) for v in static_cg.values())}

However, not all the calls in the static call graph need to be considered.
We just need to consider the calls that **defined** in the source code. The calls such as **imported** are not considered.

Attention:
The format of the call graph is a dictionary, where the key is the function name and the value is a list of function calls.

Don't just compare the function name, maybe the function name is different but the function is the same.

Your output should be a json with the following format:
Don't just compare the function name, maybe the function name is different but the function is the same.

Your output should be a json with the following format:
{{
    "sum_call_from_static": number,
    "correct_call_from_llm": number,
    "missing_call_from_llm": number,
    "extra_call_from_llm": number
}}
"""
    return json.loads(get_llm_answers(prompt, model_name="deepseek-chat", require_json=True))

jsonl_file = "cg_task.jsonl"
results = []

# 创建输出目录
os.makedirs("results_local", exist_ok=True)

with open(jsonl_file, "r", encoding="utf-8") as f:
    lines = f.readlines()
    # 解析每一行获取文件名
    files = [f"ts_{i}.ts" for i in range(200)]
    
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_line = {executor.submit(process_line, line): i for i, line in enumerate(lines)}
    
    for future in concurrent.futures.as_completed(future_to_line):
        line_index = future_to_line[future]
        try:
            result = future.result()
            # 添加文件名到结果中
            result["file_name"] = files[line_index]
            results.append(result)
            # 保存单个结果
            with open(f"results_local/result_{line_index}.json", "w") as f:
                json.dump(result, f, indent=4)
        except Exception as e:
            print(f'处理第 {line_index} 行时发生错误: {str(e)}')

# 保存所有结果
with open("results_local/all_results.json", "w") as f:
    json.dump(results, f, indent=4)


In [2]:
import json
results = json.load(open("results_local/all_results.json"))

# 计算总体指标
total_correct = sum(r["correct_call_from_llm"] for r in results)
total_static = sum(r["sum_call_from_static"] for r in results)
total_llm = sum(r["correct_call_from_llm"] + r["extra_call_from_llm"] for r in results)

precision = round(total_correct / total_llm, 4) if total_llm > 0 else 0
recall = round(total_correct / total_static, 4) if total_static > 0 else 0
f1_score = round(2 * (precision * recall) / (precision + recall), 4) if precision + recall > 0 else 0

print("\n总体指标:")
print(f"Total files: {len(results)}")
print(f"Total calls from static: {total_static}")
print(f"Total calls from llm: {total_llm}")
print(f"Total calls from llm matched: {total_correct}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


总体指标:
Total files: 128
Total calls from static: 307
Total calls from llm: 772
Total calls from llm matched: 227
Precision: 0.294
Recall: 0.7394
F1 Score: 0.4207
