# In [2]:
from multiprocessing import cpu_count
import os
import json

from llm import get_llm_answers

def extract_call_graph(code: str, model_name="gpt-4o") -> dict:
    """
    Ask an LLM to analyse ``code`` and return the parsed JSON reply.

    The prompt requests a call-graph object shaped like:
      {
        "functionName": [ "callee1", "callee2", ... ],
        "ClassName.methodName": [...],
        ...
      }
    i.e. each definition name mapped to the names it calls.

    Args:
        code: Source text that is embedded verbatim in the prompt.
        model_name: Model identifier forwarded to ``get_llm_answers``.

    Returns:
        Whatever ``json.loads`` produces from the LLM reply (expected to be
        a dict in the shape above; the shape is not validated here).
    """
    analysis_prompt = f"""You are a precise code analyzer.

Given the user-supplied source code, produce ONLY a JSON object describing the call graph of functions, classes, or methods **defined within the same file**. Do NOT include any calls to imported modules, external libraries, or built-in functions.

Follow these rules carefully:

1) **main node**:
   - Create a special key "main" for the top-level executable code in this file:
     a) Any statements not inside a function or class definition belong to "main", as do statements in an `if __name__ == "__main__":` block.
     b) Do NOT include import statements or function/class definitions in "main".
     c) "main" strictly represents the “entry point” code, not the entire file contents.

2) **Keys (function, class, method definitions, including nested)**:
   - For each definition in this file, create a key reflecting its full dotted path. For example:
     - Top-level function: `"func_name"`.
     - Top-level class: `"ClassName"`.
     - Method in a class: `"ClassName.method_name"`.
     - Nested class: `"OuterClass.InnerClass"` (and methods as `"OuterClass.InnerClass.method"`).
     - Nested function: `"outer_function.inner_function"`.
   - Do not skip nested definitions. Each definition in this file must have a dotted key.

3) **Values (calls within each function/method/‘main’ scope)**:
   - Each key’s value is an array of unique calls to other definitions **in the same file**, using the same dotted notation if applicable.
   - Only count a function/method/class call if parentheses are used (e.g. `something(...)`). Attribute or variable access without `(...)` does not count as a call.
   - If you see `MyClass(...)` and `MyClass` is defined in this file, treat it as a call to `"MyClass"`.
   - If the same function/method/class is called multiple times in a single scope, list it only once.
   - Ignore any import references, built-in functions, or external modules. We only care about calls to definitions that appear in this file.

4) **Valid JSON only**:
   - Output must be valid JSON, with no extra commentary or text.
   - Example (adjust to actual code):
     ```json
    {{
      "main": [
        "helper_function",
        "MyClass"
      ],
      "helper_function": [],
      "MyClass": [
        "MyClass.inner_method"
      ],
      "MyClass.inner_method": []
    }}
    ```

Now, here is the Python source code to analyze:
```
{code}
```
"""
    # temperature=0 and require_json=True are passed straight through to the
    # project's LLM helper; the reply is then decoded as JSON.
    raw_answer = get_llm_answers(
        analysis_prompt,
        model_name=model_name,
        temperature=0,
        require_json=True,
    )
    return json.loads(raw_answer)

def process_file(py_path: str, out_path: str):
    """Build the call graph for one source file and store it as JSON.

    Args:
        py_path: Path of the source file to read (decoded as UTF-8).
        out_path: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters written unescaped).
    """
    with open(py_path, "r", encoding="utf-8") as src:
        source_text = src.read()

    graph = extract_call_graph(source_text)

    with open(out_path, "w", encoding="utf-8") as dst:
        dst.write(json.dumps(graph, indent=4, ensure_ascii=False))

def batch_process_folder(folder_path: str, output_dir: str, num_files: int = 200):
    """Generate call-graph JSON files for numbered sources in a folder.

    Looks for ``0.py``, ``1.py``, ... ``{num_files - 1}.py`` inside
    ``folder_path`` and, for each one that exists, writes ``<n>.json`` into
    ``output_dir`` via ``process_file``. Inputs that are missing and outputs
    that already exist are skipped, so an interrupted run can be resumed.

    Args:
        folder_path: Directory containing the numbered ``.py`` files.
        output_dir: Directory for the JSON results; created if absent.
        num_files: How many numbered files to look for (default 200, the
            previously hard-coded count).

    Raises:
        Any exception raised by ``process_file`` for a file is re-raised
        here when that file's future is collected.
    """
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    os.makedirs(output_dir, exist_ok=True)

    def process_single_file(fname):
        full_path = os.path.join(folder_path, f"{fname}.py")
        if not os.path.exists(full_path):
            return
        out_path = os.path.join(output_dir, f"{fname}.json")
        # An existing output marks the file as already processed.
        if os.path.exists(out_path):
            return
        process_file(full_path, out_path)

    files = [str(i) for i in range(num_files)]

    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # Using tqdm as a context manager guarantees the bar is closed even
        # when future.result() re-raises a worker exception.
        with tqdm(total=len(files)) as pbar:
            futures = [
                executor.submit(process_single_file, fname) for fname in files
            ]
            for future in futures:
                future.result()
                pbar.update(1)

if __name__ == "__main__":
    # Script entry point: process the dataset folder and write the per-file
    # call-graph JSON files into the results directory.
    batch_process_folder(
        folder_path="../../dataset/python",
        output_dir="llm_cg_inside_file",
    )

# 100%|██████████| 200/200 [00:23<00:00,  8.57it/s]
