# In [9]:
from multiprocessing import cpu_count
import os
import json

from llm import get_llm_answers

def extract_call_graph(code: str, model_name: str = "gpt-4o") -> dict:
    """
    Ask an LLM to analyse ``code`` and return its call graph as a dict.

    The prompt requests a JSON object shaped like:
      {
        "functionName": [ "callee1", "callee2", ... ],
        "ClassName.methodName": [...],
        ...
      }
    i.e. each key is a definition and each value lists what it calls.

    Args:
        code: Python source text to analyse; it is inserted verbatim at the
            end of the prompt.
        model_name: Forwarded to ``get_llm_answers`` as ``model_name``.

    Returns:
        Whatever ``json.loads`` produces from the model's reply (expected to
        be a dict, but this function does not check the shape).

    Raises:
        json.JSONDecodeError: If the reply text is not valid JSON.
    """
    # Few-shot sample: a real-looking module whose expected call graph is
    # spelled out under "Example Output" in the prompt below.
    sample_source = """from backend.blocks.hubspot._auth import (
    HubSpotCredentials,
    HubSpotCredentialsField,
    HubSpotCredentialsInput,
)
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import SchemaField
from backend.util.request import requests


class HubSpotCompanyBlock(Block):
    class Input(BlockSchema):
        credentials: HubSpotCredentialsInput = HubSpotCredentialsField()
        operation: str = SchemaField(
            description="Operation to perform (create, update, get)", default="get"
        )
        company_data: dict = SchemaField(
            description="Company data for create/update operations", default={}
        )
        domain: str = SchemaField(
            description="Company domain for get/update operations", default=""
        )

    class Output(BlockSchema):
        company: dict = SchemaField(description="Company information")
        status: str = SchemaField(description="Operation status")

    def __init__(self):
        super().__init__(
            id="3ae02219-d540-47cd-9c78-3ad6c7d9820a",
            description="Manages HubSpot companies - create, update, and retrieve company information",
            categories={BlockCategory.CRM},
            input_schema=HubSpotCompanyBlock.Input,
            output_schema=HubSpotCompanyBlock.Output,
        )

    def run(
        self, input_data: Input, *, credentials: HubSpotCredentials, **kwargs
    ) -> BlockOutput:
        base_url = "https://api.hubapi.com/crm/v3/objects/companies"
        headers = {
            "Authorization": f"Bearer {credentials.api_key.get_secret_value()}",
            "Content-Type": "application/json",
        }

        if input_data.operation == "create":
            response = requests.post(
                base_url, headers=headers, json={"properties": input_data.company_data}
            )
            result = response.json()
            yield "company", result
            yield "status", "created"

        elif input_data.operation == "get":
            search_url = f"{base_url}/search"
            search_data = {
                "filterGroups": [
                    {
                        "filters": [
                            {
                                "propertyName": "domain",
                                "operator": "EQ",
                                "value": input_data.domain,
                            }
                        ]
                    }
                ]
            }
            response = requests.post(search_url, headers=headers, json=search_data)
            result = response.json()
            yield "company", result.get("results", [{}])[0]
            yield "status", "retrieved"

        elif input_data.operation == "update":
            # First get company ID by domain
            search_response = requests.post(
                f"{base_url}/search",
                headers=headers,
                json={
                    "filterGroups": [
                        {
                            "filters": [
                                {
                                    "propertyName": "domain",
                                    "operator": "EQ",
                                    "value": input_data.domain,
                                }
                            ]
                        }
                    ]
                },
            )
            company_id = search_response.json().get("results", [{}])[0].get("id")

            if company_id:
                response = requests.patch(
                    f"{base_url}/{company_id}",
                    headers=headers,
                    json={"properties": input_data.company_data},
                )
                result = response.json()
                yield "company", result
                yield "status", "updated"
            else:
                yield "company", {}
                yield "status", "company_not_found"

"""
    # The prompt is an f-string: `{{` / `}}` are escaped literal braces, while
    # `{sample_source}` and `{code}` are the only real substitutions.
    #
    # NOTE(review): the prompt text is left untouched on purpose (changing it
    # changes model output), but it has inconsistencies worth confirming:
    #   * "Key Points in Example" mentions `MyClass.some_method`,
    #     `global_function` and `len`, none of which appear in either example.
    #   * The second sample output contains `//` comments, which are not valid
    #     JSON even though the request is made with require_json=True.
    #   * The first sample output writes builtins as "<builtin>.super" while
    #     the second writes a bare "print".
    llm_prompt = f"""You are a Python call graph generator. Analyze the following Python code and output the call graph in the JSON format below. The keys represent every explicitly defined class, method, or function in the code, including nested ones. The values are arrays of directly called methods, initialized classes, or external functions, including builtin functions. However, if a method is called on an **instance of a class** (e.g., `animal.bark()` or `logger.info()`), it should **NOT** be included in the call graph.

**Rules:**
1. **Key Format**:
   - "<className>" for top-level classes.
   - "<className>.<nestedClassName>" for nested classes.
   - "<className>.<methodName>" for methods (including `__init__`).
   - "<className>.<nestedClassName>.<methodName>" for methods in nested classes.
   - "<functionName>" for standalone functions.
2. **Value Format**:
   - Include only the methods, classes, or functions directly called or initialized within the corresponding scope.
   - Builtin functions (e.g., `len`, `print`) should be included if they are explicitly called.
   - Do **NOT** include calls to methods made on instances of classes (e.g., `animal.bark()` or `logger.info()`).
3. **Special Cases**:
   - Classes or methods with no calls should still appear as keys with an empty list as their value.
   - For `main` execution, include all executable statements or function calls outside of any class or method, including `if __name__ == "__main__":`.

**Examples of Handling Builtins and Instances**:
- A direct call like `Animal.bark()` is included.
- A call on an instance like `animal.bark()` is excluded. That is to say, if a method is called on an instance of a class, it should NOT be included in the call graph.
- A builtin call like `len(some_list)` is included.
- A call on an instance like `logger.info()` is excluded. That is to say, if a method is called on an instance of a class, it should NOT be included in the call graph.

**Example:**
```python
{sample_source}
```

**Example Output:**
{{
    "main": [],
    "HubSpotCompanyBlock.Input": [
        "backend.blocks.hubspot._auth.HubSpotCredentialsField",
        "backend.data.model.SchemaField"
    ],
    "HubSpotCompanyBlock.Output": [
        "backend.data.model.SchemaField"
    ],
    "HubSpotCompanyBlock.__init__": [
        "<builtin>.super"
    ],
    "HubSpotCompanyBlock.run": [
        "backend.util.request.requests.patch",
        "backend.util.request.requests.post"
    ]
}}

Example of a call on an instance:
```python
import logging

logger = logging.getLogger(__name__)

class Animal:
    def bark(self):
        print("woof")
        logger.info("not a function call")  # Instance method, exclude this!

    @staticmethod
    def eat():
        print("eat")

animal = Animal()
animal.bark()  # Instance call, exclude this!
Animal.eat()   # Class-level call, include this!
```

**Output:**
{{
    "main": [
        "Animal.eat",
        // Exclude "animal.bark" since it's an instance call
        "logging.getLogger"
    ],
    "Animal.bark": [
        "print"
        // Exclude "logger.info" since it's an instance call
    ],
    "Animal.eat": [
        "print"
    ]
}}

**Key Points in Example**:
1. **Builtin Functions**: `print` and `len` are included in the output if they are directly called in the code.
2. **Class Methods**: `MyClass.some_method` includes `global_function` and `print`, but does not include any methods called on instances.
3. **Instance Calls Excluded**: Methods called on instances, such as animal.bark() or logger.info(), are excluded.
4. **Nested Structures**: Nested classes and methods are represented as separate keys.

**Input Python Code:**
{code}
"""
    # temperature=0 and require_json=True are passed straight through to the
    # project helper; presumably they request deterministic, JSON-only
    # output — confirm against llm.get_llm_answers.
    raw_reply = get_llm_answers(
        llm_prompt,
        model_name=model_name,
        temperature=0,
        require_json=True,
    )

    # Decode the model's textual reply into Python objects.
    return json.loads(raw_reply)

def process_file(py_path: str, out_path: str) -> None:
    """Build the call graph for one Python file and save it as JSON.

    Args:
        py_path: Path of the Python source file to read (decoded as UTF-8).
        out_path: Path of the JSON file to write; overwritten if it exists.
    """
    with open(py_path, "r", encoding="utf-8") as source_file:
        source_text = source_file.read()

    # The source is fully read and closed before the LLM call is made.
    graph = extract_call_graph(source_text)

    # ensure_ascii=False keeps non-ASCII identifiers readable in the output.
    with open(out_path, "w", encoding="utf-8") as sink:
        json.dump(graph, sink, indent=4, ensure_ascii=False)

def batch_process_folder(folder_path: str, output_dir: str, num_files: int = 200, max_workers=None):
    """Generate call-graph JSON files for the numbered sources in a folder.

    Looks for ``0.py``, ``1.py``, ... ``<num_files - 1>.py`` inside
    ``folder_path`` and writes ``<n>.json`` into ``output_dir`` for each one.
    Missing inputs are skipped, and so are inputs whose output file already
    exists, which makes an interrupted run resumable.

    Args:
        folder_path: Directory containing the numbered ``.py`` files.
        output_dir: Directory for the ``.json`` results; created if absent.
        num_files: How many numbered files to look for (default 200, the
            value that used to be hard-coded).
        max_workers: Size of the thread pool. ``None`` (default) keeps the
            previous behaviour of using ``cpu_count()``. The work is
            I/O-bound, so a larger value may be appropriate.

    Raises:
        Exception: Any error raised while processing a file is re-raised
            here. Tasks already submitted still run to completion first,
            because the executor waits for them on exit.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Imported locally, as in the original, so the module can be imported
    # without tqdm installed.
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from tqdm import tqdm

    def process_single_file(fname):
        """Process ``<fname>.py`` unless it is missing or already done."""
        full_path = os.path.join(folder_path, f"{fname}.py")
        if not os.path.exists(full_path):
            return
        out_path = os.path.join(output_dir, f"{fname}.json")
        if os.path.exists(out_path):
            # Existing output marks the file as done; do not call the LLM again.
            return
        process_file(full_path, out_path)

    stems = [str(index) for index in range(num_files)]
    workers = cpu_count() if max_workers is None else max_workers

    # Both context managers guarantee cleanup: the bar is closed and the pool
    # is shut down even when a worker raises.
    with ThreadPoolExecutor(max_workers=workers) as executor, tqdm(total=len(stems)) as pbar:
        futures = [executor.submit(process_single_file, stem) for stem in stems]
        # as_completed makes the bar advance as work actually finishes,
        # instead of stalling behind one slow early file.
        for future in as_completed(futures):
            future.result()  # re-raise any exception from the worker
            pbar.update(1)

if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, then build
    # call graphs for the dataset folder into it.
    os.makedirs("./llm_cg", exist_ok=True)
    batch_process_folder(folder_path="../../dataset/python", output_dir="llm_cg")


# Output: 100%|██████████| 200/200 [00:35<00:00,  5.59it/s]
