In [2]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.text_splitter import CodeSplitter
from llama_index.packs.code_hierarchy import CodeHierarchyNodeParser

import os
from IPython.display import Markdown, display
from dotenv import load_dotenv
load_dotenv()


def print_python(python_text):
    """This function prints python text in ipynb nicely formatted."""
    display(Markdown("```python\n" + python_text + "```"))

### Utility functions

In [7]:
from pathlib import Path
import re
import hashlib

def _skip_file(path: Path) -> bool:
    # skip lock files
    path = path.name
    if path.endswith("lock") or path == "package-lock.json" or path == "yarn.lock":
        return True
    # skip tests and legacy directories
    if path in ["legacy", "test"] and self.skip_tests:
        return True
    # skip hidden files
    if path.startswith("."):
        return True
    # skip images
    if path.endswith(".png") or path.endswith(".jpg"):
        return True
    return False

def _remove_non_ascii(text):
    # Define the regular expression pattern to match ascii characters
    pattern = re.compile(r"[^\x00-\x7F]+")
    # Replace ascii characters with an empty string
    cleaned_text = pattern.sub("", text)
    return cleaned_text

def _skip_directory(directory: Path) -> bool:
    # skip hidden directories
    if directory.name.startswith("."):
        return True
    return directory == "__pycache__" or directory == "node_modules"



## --- BASE PARSER
def generate_node_id(path: str, company_id: str):
    # Concatenate path and signature
    combined_string = f"{company_id}:{path}"
    hash_object = hashlib.md5()
    hash_object.update(combined_string.encode("utf-8"))
    # Get the hexadecimal representation of the hash
    node_id = hash_object.hexdigest()
    return node_id

def is_package(path: str) -> bool:
    return os.path.exists(os.path.join(path, "__init__.py"))

## Format node functions

In [5]:
import os

from llama_index.core.schema import BaseNode



class format_nodes:

    @staticmethod
    def format_plain_code_block_node(node: BaseNode, scope: dict, function_calls: list[str], file_node_id: str) -> dict:
        name = scope["name"]
        signature = scope["signature"]

        processed_node = {
            "type": "CODE_BLOCK",
            "attributes": {
                "name": name,
                "signature": signature,
                "text": node.text,
                "function_calls": function_calls,
                "file_node_id": file_node_id,
            },
        }

        return processed_node

    @staticmethod
    def format_function_node(node: BaseNode, scope: dict, function_calls: list[str], file_node_id: str) -> dict:
        name = scope["name"]
        signature = scope["signature"]

        processed_node = {
            "type": "FUNCTION",
            "attributes": {
                "name": name,
                "signature": signature,
                "text": node.text,
                "function_calls": function_calls,
                "file_node_id": file_node_id,
            },
        }

        return processed_node

    @staticmethod
    def format_class_node( node: BaseNode, scope: dict, file_node_id: str, inheritances: list[str], function_calls: list[str] ) -> dict:
        name = scope["name"]
        signature = scope["signature"]

        processed_node = {
            "type": "CLASS",
            "attributes": {
                "name": name,
                "signature": signature,
                "text": node.text,
                "file_node_id": file_node_id,
                "inheritances": inheritances,
                "function_calls": function_calls,
            },
        }

        return processed_node

    @staticmethod
    def format_file_node(node: BaseNode, no_extension_path: str, function_calls: list[str]) -> dict:
        processed_node = {
            "type": "FILE",
            "attributes": {
                "text": node.text,
                "function_calls": function_calls,
                "name": os.path.basename(no_extension_path),
            },
        }

        return processed_node

    @staticmethod
    def format_directory_node(path: str, package: bool, level: int) -> dict:
        processed_node = {
            "attributes": {
                "path": path + "/",
                "name": os.path.basename(path),
                "level": level,
            },
            "type": "PACKAGE" if package else "FOLDER",
        }

        return processed_node

In [6]:
from llama_index.core import SimpleDirectoryReader
from core.utils import GlobalGraphInfo


class GraphConstructor:
    global_graph_info: GlobalGraphInfo
    root: str
    skip_tests: bool
    parsers: Parsers
    max_workers: int = 50


    def __init__(self, entity_id: str, root: str, max_workers: Optional[int] = None):
        self.global_graph_info = GlobalGraphInfo(entity_id=entity_id)
        self.parsers = Parsers(self.global_graph_info, root)
        self.root = root
        self.skip_tests = True
        if max_workers is not None:
            self.max_workers = max_workers
            

    def build_graph(self):
        # process every node to create the graph structure
        print("Building graph...")
        start_time = time.time()

        nodes, relationships, imports = self._scan_directory(self.root)

        # relate imports between file nodes
        relationships.extend(self._relate_imports(imports))
        # relate functions calls
        relationships.extend(self._relate_constructor_calls(nodes, imports))
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds")
        return nodes, relationships




    
    """
        helpers
    """
    def _scan_directory(self, path: str, parent_id: Optional[str] = None, level: int = 0, visited: Optional[Set[str]] = None,
    ) -> Tuple[List[Dict], List[Dict], Dict]:

        if visited is None:
            visited = set()

        nodes: List[Dict] = []
        relationships: List[Dict] = []
        imports: Dict = {}

        if not os.path.exists(path):
            raise FileNotFoundError(f"Directory {path} not found")
        if path.endswith("tests") or path.endswith("test"):
            return nodes, relationships, imports

        if path in visited:
            return nodes, relationships, imports
        visited.add(path)

        # Check if the directory is a package, logic for python
        package = is_package(path)

        core_directory_node = format_nodes.format_directory_node(path, package, level)
        directory_node_id = generate_node_id(path, self.global_graph_info.entity_id)
        directory_node = {
            **core_directory_node,
            "attributes": {**core_directory_node["attributes"], "node_id": directory_node_id},
        }
        print (directory_node)
        nodes.append(directory_node)

        if parent_id is not None:
            # relationship only exists when we are recursing (DFS)
            relationships.append(
                {
                    "sourceId": parent_id,
                    "targetId": directory_node_id,
                    "type": "CONTAINS",
                }
            )
        try:
            entries = list(os.scandir(path))
        except PermissionError:
            print(f"Permission denied: {path}")
            return nodes, relationships, imports

        with ThreadPoolExecutor(max_workers=min(self.max_workers, os.cpu_count() or 1)) as executor:
            # Submit all entries to the executor
            future_to_entry = {executor.submit(process_entry, entry): entry for entry in entries}
            for future in as_completed(future_to_entry):
                try:
                    entry_nodes, entry_relationships, entry_imports, entry_visited = future.result()
                    nodes.extend(entry_nodes)
                    relationships.extend(entry_relationships)
                    imports.update(entry_imports)
                    visited.update(entry_visited)
                except Exception as exc:
                    entry = future_to_entry[future]
                    print(f"Generated an exception: {entry.path} -> {exc}")
                    traceback.print_exc()

        return nodes, relationships, imports

    
    def process_entry(entry) -> Tuple[List[Dict], List[Dict], Dict, Set[str]]:
        local_nodes: List[Dict] = []
        local_relationships: List[Dict] = []
        local_imports: Dict = {}
        local_visited: Set[str] = set()

        if self._skip_file(entry.name):
            return local_nodes, local_relationships, local_imports, local_visited

        if entry.is_file():
            parser: BaseParser | None = self.parsers.get_parser(entry.name)
            # If the file is a supported language, parse it
            if parser:
                entry_name = entry.name.split(parser.extension)[0]
                try:
                    processed_nodes, relations, file_imports = parser.parse_file(
                        entry.path,
                        self.root,
                        global_graph_info=self.global_graph_info,
                        level=level,
                    )
                except Exception:
                    print(f"Error parsing file {entry.path}")
                    print(traceback.format_exc())
                    return local_nodes, local_relationships, local_imports, local_visited

                if processed_nodes:
                    file_root_node_id = processed_nodes[0]["attributes"]["node_id"]
                    local_nodes.extend(processed_nodes)
                    local_relationships.extend(relations)
                    local_relationships.append(
                        {
                            "sourceId": directory_node_id,
                            "targetId": file_root_node_id,
                            "type": "CONTAINS",
                        }
                    )
                    local_imports.update(file_imports)

                    global_import_key = (directory_path + entry_name).replace("/", ".")
                    self.global_graph_info.imports[global_import_key] = {
                        "id": file_root_node_id,
                        "type": "FILE",
                        "node": processed_nodes[0],
                    }
                else:
                    self.global_graph_info.import_aliases.update(file_imports)
            else:
                try:
                    with open(entry.path, "r", encoding="utf-8") as file:
                        text = file.read()
                except UnicodeDecodeError:
                    print(f"Error reading file {entry.path}")
                    return local_nodes, local_relationships, local_imports, local_visited

                path = str(entry.path).replace("/", ".")
                file_node = {
                    "type": "FILE",
                    "attributes": {
                        "path": path,
                        "file_path": path,
                        "name": entry.name,
                        "node_id": BaseParser.generate_node_id(path, self.global_graph_info.entity_id),
                        "text": text,
                    },
                }
                local_nodes.append(file_node)
                local_relationships.append(
                    {
                        "sourceId": directory_node_id,
                        "targetId": file_node["attributes"]["node_id"],
                        "type": "CONTAINS",
                    }
                )
        elif entry.is_dir():
            if self._skip_directory(entry.name):
                return local_nodes, local_relationships, local_imports, local_visited

            sub_nodes, sub_relationships, sub_imports = self._scan_directory(
                entry.path, directory_node_id, level + 1, visited
            )
            local_nodes.extend(sub_nodes)
            local_relationships.extend(sub_relationships)
            local_imports.update(sub_imports)

        return local_nodes, local_relationships, local_imports, local_visited

IndentationError: expected an indented block (4139504019.py, line 49)