In [1]:
from multiprocessing import cpu_count
import os
import json

from llm import get_llm_answers

def extract_call_graph(code: str, model_name="gpt-4o") -> dict:
    """
    Ask the LLM to analyze the given ``code`` and return its call graph.

    The expected result is a dictionary of call relationships shaped like:
      {
        "functionName": [ "callee1", "callee2", ... ],
        "ClassName.methodName": [...],
        ...
      }

    Args:
        code: Source text to analyze; it is embedded verbatim in the prompt.
        model_name: Forwarded to ``get_llm_answers`` as ``model_name``.

    Returns:
        Whatever ``json.loads`` produces from the LLM answer. The prompt asks
        for a JSON object, but the shape is not validated here.

    Raises:
        json.JSONDecodeError: If the answer is not valid JSON (assumes
            ``get_llm_answers`` returns the raw answer text -- TODO confirm).
    """
    # NOTE: the prompt text below is runtime behavior; edit it deliberately.
    # Doubled braces ({{ }}) are literal braces inside the f-string.
    analysis_prompt = f"""You are a precise code analyzer.

Given the user-supplied source code, produce ONLY a JSON object describing the call graph. Follow these rules carefully:

1) **main node**:
   - Create a special key "main" for the top-level executable code in this file:
     a) Any statements not inside a function or class definition belong to "main", as do statements in an `if __name__ == "__main__":` block.
     b) Do NOT include `import` statements or function/class definitions in "main".
     c) "main" is strictly the “entry point” code, not the entire file.

2) **Keys (function, class, or method definitions, including nested ones)**:
   - For each definition in this file, create a key that reflects its full dotted path.
     - Top-level function: `"func_name"`.
     - Top-level class: `"ClassName"`.
     - Method in a class: `"ClassName.method_name"`.
     - Class or function nested inside another class: `"OuterClass.InnerClass"` (and methods as `"OuterClass.InnerClass.method"`).
     - Function nested inside another function: `"outer_function.inner_function"`, etc.
   - Do not skip nested definitions; each gets its own dotted key.

3) **Values (calls within each function/method/‘main’ scope)**:
   - Each key’s value is an array of unique “call” targets used in that scope, as they appear in code. 
   - A “call” is identified by the presence of parentheses immediately after a symbol, e.g. `foo(...)`. 
     - If it's only `foo.attr` without `(...)`, treat it as an attribute/variable reference, **not** a function call.
   - Include calls to:
     a) Other functions/classes/methods defined in this file (with the same dotted notation).
     b) Imported modules/classes/functions (with the fullest path you can infer).
     c) Built-in functions, using the notation `"<builtin>.function_name"` (e.g. `"<builtin>.print"`, `"<builtin>.len"`).
   - For local imports like `import backend.blocks.basic` followed by `basic.PrintToConsoleBlock(...)`, record it as `"backend.blocks.basic.PrintToConsoleBlock"`.
   - If you see `MyClass(...)` and `MyClass` is defined in this file, treat it as `"MyClass"`, ignoring attribute references like `MyClass().some_attr`.
   - If the same function/method is called multiple times in a single scope, list it only once.
   - Calls inside a nested function/class belong only to that nested scope’s call list, not the outer function or class.

4) **Valid JSON only**:
   - Output must be valid JSON, with no extra commentary or text.
   - Example of final structure (adjust to the code):
     ```json
    {{
       "main": [
         "<builtin>.print",
         "os.path.join",
         "MyClass"
       ],
       "MyClass": [
         "AnotherClass"
       ],
       "MyClass.some_method": [],
       "AnotherClass": []
    }}
     ```

5) The import statements are not be called.
Now, here is the source code to analyze:  
```
{code}
```

Attention, only output the function/class/method call, don't output other like the identifier name.
"""
    # temperature=0 is requested so repeated runs are as stable as the
    # backend allows; require_json asks the helper for JSON-only output.
    raw_answer = get_llm_answers(
        analysis_prompt,
        model_name=model_name,
        temperature=0,
        require_json=True,
    )
    return json.loads(raw_answer)

def process_file(py_path: str, out_path: str):
    """Extract the call graph of one source file and save it as JSON.

    Args:
        py_path: Path of the source file to read (decoded as UTF-8).
        out_path: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters written unescaped).
    """
    with open(py_path, "r", encoding="utf-8") as source_file:
        source_text = source_file.read()

    graph = extract_call_graph(source_text)

    # The output file is opened only after the graph was obtained, so a
    # failed extraction does not leave an output file behind.
    with open(out_path, "w", encoding="utf-8") as sink:
        sink.write(json.dumps(graph, indent=4, ensure_ascii=False))

def batch_process_folder(
    folder_path: str,
    output_dir: str,
    *,
    num_files: int = 200,
    extension: str = ".ts",
):
    """Extract call graphs for the numbered source files in a folder.

    Looks for files named ``0<extension>``, ``1<extension>``, ... up to
    ``num_files - 1`` inside ``folder_path`` and writes ``<n>.json`` for each
    one into ``output_dir``. Inputs that do not exist and outputs that already
    exist are skipped, so an interrupted run can simply be restarted.

    Args:
        folder_path: Directory containing the numbered input files.
        output_dir: Directory for the JSON results; created if missing.
        num_files: How many numbered files (starting at 0) to look for.
        extension: Input file suffix, including the leading dot.

    Raises:
        Exception: Any error raised while processing a file is re-raised
            here once that file's task completes.
    """
    os.makedirs(output_dir, exist_ok=True)
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from tqdm import tqdm

    def process_single_file(fname):
        full_path = os.path.join(folder_path, f"{fname}{extension}")
        if not os.path.exists(full_path):
            return
        out_path = os.path.join(output_dir, f"{fname}.json")
        # Skip work that a previous run already finished.
        if os.path.exists(out_path):
            return
        process_file(full_path, out_path)

    files = [str(index) for index in range(num_files)]

    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        futures = [executor.submit(process_single_file, fname) for fname in files]
        # The context manager closes the bar even when a task raises.
        with tqdm(total=len(futures)) as pbar:
            # Advance on completion, not submission order, so one slow file
            # does not stall the progress display.
            for future in as_completed(futures):
                future.result()  # re-raise any worker exception
                pbar.update(1)

# Script entry point: process the files under ../../dataset/ts and write the
# resulting JSON call graphs into ./llm_cg. Both paths are relative, so they
# resolve against the current working directory.
if __name__ == "__main__":
    batch_process_folder("../../dataset/ts", "llm_cg")

100%|██████████| 200/200 [01:03<00:00,  3.13it/s]
