In [3]:
from multiprocessing import cpu_count
import os
import json

from llm import get_llm_answers

def extract_call_graph(code: str, model_name="gpt-4o") -> dict:
    """
    Ask the LLM to analyze ``code`` and return its call graph.

    The prompt asks the model for a JSON object shaped like:
      {
        "functionName": [ "callee1", "callee2", ... ],
        "ClassName.methodName": [...],
        ...
      }
    and the parsed reply is returned as-is (no validation of its shape).

    Args:
        code: Source text to analyze; it is pasted verbatim at the end of
            the prompt.
        model_name: Model identifier forwarded to ``get_llm_answers``.

    Returns:
        Whatever ``json.loads`` produces from the model reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # One-shot example shown to the model: a small Cangjie program whose
    # expected call graph is spelled out further down in the prompt.
    sample_source = """package png
let intSize = 32 << ((!UInt64(0)) >> 63)
func abs(x: Int64): Int64 {
    let m = x >> (intSize - 1)
    (x ^ m) - m
}
func paeth(a: UInt8, b: UInt8, c: UInt8): UInt8 {
    var pc = Int64(c)
    var pa = Int64(b) - pc
    var pb = Int64(a) - pc
    pc = abs(pa + pb)
    pa = abs(pa)
    pb = abs(pb)
    if (pa <= pb && pa <= pc) {
        a
    } else if (pb <= pc) {
        b
    } else {
        c
    }
}
func filterPaeth(cdat: Array<UInt8>, pdat: Array<UInt8>, bytesPerPixel: Int64): Unit {
    var a: Int64
    var b: Int64
    var c: Int64
    var pa: Int64
    var pb: Int64
    var pc: Int64
    for (i in 0..bytesPerPixel) {
        a = 0
        c = 0
        for (j in i..cdat.size : bytesPerPixel) {
            b = Int64(pdat[j])
            pa = b - c
            pb = a - c
            pc = abs(pa + pb)
            pa = abs(pa)
            pb = abs(pb)
            if (pa <= pb && pa <= pc) {
                // No-op.
            } else if (pb <= pc) {
                a = b
            } else {
                a = c
            }
            a += Int64(cdat[j])
            a &= 0xff
            cdat[j] = UInt8(a)
            c = b
        }
    }
}
"""
    # Doubled braces ({{ / }}) are literal braces in the f-string; only the
    # example program and the caller's code are interpolated.
    prompt = f"""You are a Cangjie call graph generator. Analyze the following Cangjie code and output the call graph in the JSON format below. The keys are in the format:
"<className>.<methodName>" or "<functionName>" for functions outside classes.
The values are arrays of called methods or functions in the format:
"<className>.<methodName>" or "<functionName>."
Your job is to infer the call graph, including:
1. Class initializations (e.g., calling `MyClass()` to construct an instance).
2. Method calls within classes or instances.

**Important Notes:**
1. If there is no call graph, output an empty dictionary.
2. Key should be the function or class defined in the code, and value should be the function or class' name.

Here's an example of nested class:
```cj
{sample_source}
```

Then the call graph should be:
{{
    "paeth": [
        "Int64",
        "abs"
    ],
    "filterPaeth": [
        "Int64",
        "UInt8",
        "abs"
    ]
}}

**Input Cangjie Code:**
{code}
"""
    # temperature=0 and require_json=True are passed straight through to the
    # project helper; presumably it returns the reply as a JSON string.
    raw_reply = get_llm_answers(
        prompt,
        model_name=model_name,
        temperature=0,
        require_json=True,
    )
    return json.loads(raw_reply)

def process_file(py_path: str, out_path: str):
    """Generate the call graph for one source file and save it as JSON.

    Args:
        py_path: Path of the source file to read (decoded as UTF-8).
        out_path: Destination path; it is created or overwritten with the
            call graph, pretty-printed with a 4-space indent and without
            escaping non-ASCII characters.
    """
    with open(py_path, "r", encoding="utf-8") as source_file:
        source_text = source_file.read()

    graph = extract_call_graph(source_text)

    # The output file is only opened once the analysis has succeeded, so a
    # failed LLM call does not leave an empty result file behind.
    with open(out_path, "w", encoding="utf-8") as sink:
        json.dump(graph, sink, indent=4, ensure_ascii=False)

def batch_process_folder(folder_path: str, output_dir: str, num_files: int = 200):
    """Build call graphs for the numbered ``.cj`` files in a folder, in parallel.

    Inputs are expected to be named ``0.cj``, ``1.cj``, ... inside
    ``folder_path``. For each index, the result is written to
    ``<output_dir>/<index>.json``. An index is skipped when its input file is
    missing or when its output file already exists, so an interrupted run
    can be resumed without redoing finished work.

    Args:
        folder_path: Directory containing the numbered ``.cj`` inputs.
        output_dir: Directory for the ``.json`` results; created if absent.
        num_files: Number of indices to try, i.e. ``0 .. num_files - 1``.

    Raises:
        Exception: Any exception raised while processing a file is re-raised
            here; already-submitted tasks are still waited for when the
            executor shuts down.
    """
    os.makedirs(output_dir, exist_ok=True)
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from tqdm import tqdm

    def process_single_file(fname):
        # Skip indices with no input and inputs that already have a result.
        full_path = os.path.join(folder_path, f"{fname}.cj")
        if not os.path.exists(full_path):
            return
        out_path = os.path.join(output_dir, f"{fname}.json")
        if os.path.exists(out_path):
            return
        process_file(full_path, out_path)

    files = [str(i) for i in range(num_files)]

    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # tqdm as a context manager guarantees the bar is closed even when a
        # task raises and the exception propagates out of this function.
        with tqdm(total=len(files)) as pbar:
            futures = [
                executor.submit(process_single_file, fname) for fname in files
            ]
            # as_completed advances the bar as soon as any task finishes,
            # instead of stalling behind a slow task that was submitted early.
            for future in as_completed(futures):
                future.result()  # re-raises the worker's exception, if any
                pbar.update(1)

if __name__ == "__main__":
    # Script entry point: read the Cangjie dataset (path relative to the
    # current working directory) and write results into ./llm_cg.
    batch_process_folder(
        folder_path="../../dataset/cangjie",
        output_dir="llm_cg",
    )


100%|██████████| 200/200 [00:31<00:00,  6.27it/s]
