# In [1]:
from concurrent.futures import as_completed
from multiprocessing import cpu_count
import os
import json

from llm import get_llm_answers

def extract_call_graph(code: str, model_name="gpt-4o") -> dict:
    """
    Ask an LLM to analyse ``code`` and return the call graph it reports.

    The prompt tells the model to treat ``code`` as TypeScript and to answer
    with a JSON object shaped like:
      {
        "<className>.<methodName>(parameters)": [ "callee1", "callee2", ... ],
        ...
      }
    or an empty object when the code contains no calls.

    Args:
        code: Source text embedded verbatim at the end of the prompt.
        model_name: Model identifier forwarded to ``get_llm_answers``.

    Returns:
        The value obtained by parsing the model's reply with ``json.loads``.
        NOTE(review): nothing here checks that the parsed value is actually
        a dict, or that it follows the requested key/value format.
    """
    # Doubled braces below are literal braces in the f-string; only {code}
    # is substituted.
    prompt = f"""You are a TypeScript call graph generator. Analyze the following TypeScript code and output the call graph in the JSON format below. The keys are in the format:
"<className>.<methodName>(parameters)"
The values are arrays of called methods in the format:
"<className>.%AMX$<methodName>(parameters)"
where `%AMX$` refers to anonymous class IDs. Your job is to infer the call graph, even if the classes or methods are anonymous. Ensure accuracy and completeness in the output.

Don't output the system call, such as 'hilog', 'logger', 'console', only output the function call or class method call or class instance method call defined in the code.
If there is no call graph, output an empty dictionary.
**Example Output:**
{{
  "EntryAbility.onWindowStageCreate(window.WindowStage)": [
    "EntryAbility.%AM0$onWindowStageCreate(unknown, unknown)"
  ]
}}

**Input TypeScript Code:**
{code}
"""
    # temperature=0 and require_json=True are passed straight through to the
    # project's LLM helper; presumably they request deterministic, JSON-only
    # output -- confirm against get_llm_answers.
    llm_reply = get_llm_answers(
        prompt,
        model_name=model_name,
        temperature=0,
        require_json=True,
    )

    return json.loads(llm_reply)

def process_file(py_path: str, out_path: str):
    """
    Read one source file, extract its call graph, and save it as JSON.

    Args:
        py_path: Path of the source file to analyse, read as UTF-8. Despite
            the parameter name, the script's entry point feeds ``.ts`` files
            through this function.
        out_path: Destination path for the JSON result. It is written as
            UTF-8 with 4-space indentation and non-ASCII characters kept
            unescaped.
    """
    with open(py_path, "r", encoding="utf-8") as source_file:
        source_text = source_file.read()

    graph = extract_call_graph(source_text)

    with open(out_path, "w", encoding="utf-8") as target_file:
        serialized = json.dumps(graph, indent=4, ensure_ascii=False)
        target_file.write(serialized)

def batch_process_folder(folder_path: str, output_dir: str, num_files: int = 200):
    """
    Build call-graph JSON files for the numbered ``.ts`` files in a folder.

    For every ``i`` in ``range(num_files)`` the file ``<folder_path>/<i>.ts``
    is passed to ``process_file`` and the result is written to
    ``<output_dir>/<i>.json``. An index is skipped when its input file does
    not exist or its output file already exists, so a rerun only processes
    the files that are still missing.

    Args:
        folder_path: Directory containing the numbered ``.ts`` inputs.
        output_dir: Directory for the ``.json`` outputs; created if absent.
        num_files: How many consecutive indices, starting at 0, to look for.
            Defaults to 200, the value that used to be hard-coded.

    Raises:
        Any exception raised while processing a file is re-raised here when
        that task's result is collected.
    """
    os.makedirs(output_dir, exist_ok=True)
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    def process_single_file(fname):
        """Process ``<fname>.ts`` unless it is missing or already done."""
        full_path = os.path.join(folder_path, f"{fname}.ts")
        if not os.path.exists(full_path):
            return
        out_path = os.path.join(output_dir, f"{fname}.json")
        if os.path.exists(out_path):
            return
        process_file(full_path, out_path)

    files = [str(i) for i in range(num_files)]

    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        futures = [executor.submit(process_single_file, f) for f in files]
        # as_completed yields tasks as they finish, so the progress bar moves
        # in completion order rather than submission order.
        for future in tqdm(as_completed(futures), total=len(futures)):
            # result() re-raises any exception from the worker so failures
            # are not silently dropped.
            future.result()

if __name__ == "__main__":
    # Script entry point: analyse the numbered .ts files under
    # ../../dataset/ts and write one call-graph JSON per file into llm_cg.
    # Both paths are relative to the current working directory.
    batch_process_folder("../../dataset/ts", "llm_cg")

# Output: 100%|██████████| 200/200 [00:24<00:00,  8.04it/s]
