In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.text_splitter import CodeSplitter
from llama_index.packs.code_hierarchy import CodeHierarchyNodeParser

from pathlib import Path
import os
from IPython.display import Markdown, display
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the agent cells — confirm.
load_dotenv()


def print_python(python_text):
    """Render ``python_text`` as a fenced Python code block in the notebook.

    Args:
        python_text: Python source code to display.

    A Markdown closing fence has to sit on its own line. A newline is appended
    when ``python_text`` does not already end with one, so the fence is never
    glued to the last line of code.
    """
    body = python_text if python_text.endswith("\n") else python_text + "\n"
    display(Markdown("```python\n" + body + "```"))


def print_tail(text, n=1000):
    """Display the last ``n`` lines of ``text`` as a plain fenced block in the notebook.

    Args:
        text: The text whose tail is shown; it is split on ``"\\n"``.
        n: Maximum number of trailing lines to display. Zero or a negative
            value displays no lines.
    """
    lines = text.split("\n")
    # ``lines[-0:]`` is the whole list, so a non-positive ``n`` needs explicit handling.
    last_n = lines[-n:] if n > 0 else []
    display(Markdown("```\n" + "\n".join(last_n) + "\n```"))


### Utility functions

In [2]:
import re
from pathlib import Path

def _skip_file(path: Path) -> bool:
    # skip lock files
    path = path.name
    if path.endswith("lock") or path == "package-lock.json" or path == "yarn.lock":
        return True
    # skip tests and legacy directories
    if path in ["legacy", "test"] and self.skip_tests:
        return True
    # skip hidden files
    if path.startswith("."):
        return True
    # skip images
    if path.endswith(".png") or path.endswith(".jpg"):
        return True
    return False

def _remove_non_ascii(text):
    # Define the regular expression pattern to match ascii characters
    pattern = re.compile(r"[^\x00-\x7F]+")
    # Replace ascii characters with an empty string
    cleaned_text = pattern.sub("", text)
    return cleaned_text

def _skip_directory(directory: Path) -> bool:
    # skip hidden directories
    if directory.name.startswith("."):
        return True
    return directory == "__pycache__" or directory == "node_modules"

### Directory reader (LlamaIndex)
Note: the reader throws an error when the directory contains empty files.

In [6]:
def simple_directory_reader(path: str):
    """Load the ``.py`` files under ``path`` and return the non-empty documents.

    The directory is read recursively, excluding ``.venv``, ``.vscode`` and
    notebook files. Each document gets a ``filepath`` metadata entry. Documents
    with empty text are dropped, and non-ASCII characters are stripped from the
    rest (the original author attributes both to a llama_index issue).

    Args:
        path: Directory to read.

    Returns:
        A list of the kept documents, or an empty list when a ``ValueError``
        whose message contains "No files found" is raised.

    Raises:
        ValueError: Any other ``ValueError`` is re-raised unchanged.
    """
    try:
        reader = SimpleDirectoryReader(
            input_dir=path,
            recursive=True,
            required_exts=[".py"],
            exclude=[".venv/**", ".vscode/**", "**/*.ipynb"],
            file_metadata=lambda x: {"filepath": x},
        )
        loaded = reader.load_data()
        print(f"Number of documents loaded: {len(loaded)}")

        kept = []
        for document in loaded:
            if len(document.text) == 0:
                # Empty files are dropped.
                continue
            document.set_content(_remove_non_ascii(document.text))
            kept.append(document)
            print(f"...... File path: {document.metadata.get('filepath')}")
        return kept
    except ValueError as err:
        if "No files found" not in str(err):
            # Only the "no files" case is treated as an empty result.
            raise
        return []

# Load the notebook's parent directory; `documents` is consumed by the CodeSplitter cell.
documents = simple_directory_reader('../')

Number of documents loaded: 14
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../app.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../code_chunker.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../code_graph.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../core/graph.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../database/snippet_database.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../database/vector_store.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../embedding/context_wrapper.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../embedding/embedding.py
...... File path: /Users/shivanshj/repo-personal-projects/codebase2vec/examples/../embedding/llm_adapter.py
...... File path: /Users/shivanshj/repo-

`CodeSplitter` works on the full directory

In [4]:

# Split every loaded document with CodeSplitter and preview a few of the resulting nodes.
try:   
    splitter = CodeSplitter(
        language="python",
        max_chars=1500,
        chunk_lines=20,
    )
    split_nodes = splitter.get_nodes_from_documents(documents)
    print(f"Successfully split into {len(split_nodes)} nodes")
    
    # Print first few nodes to verify content
    # NOTE: the slice starts at index 1 while enumerate() counts from 0,
    # so "Node 0" in the printed preview is split_nodes[1].
    for i, node in enumerate(split_nodes[1:4]):
        print(f"\n------ Node {i} preview:")
        print(f"Text length: {len(node.text)}")
        print(f"Content preview: {node.text[:100]}...")

except Exception as e:
    # Any failure is reported as text so the rest of the notebook can still run.
    print(f"Error: {type(e)}: {str(e)}")

Successfully split into 92 nodes

------ Node 0 preview:
Text length: 21
Content preview: class CodebaseLoader:...

------ Node 1 preview:
Text length: 1294
Content preview: def __init__(self, local_dir=None, github_repo=None):
        self.local_dir = local_dir
        sel...

------ Node 2 preview:
Text length: 1138
Content preview: def __load_local_codebase(self, directory) -> list[Snippet]:
        snippets = []
        for filen...


----
### CodeHierarchy doesn't work on the full directory when `recursive` is True

That's because it doesn't handle empty files well, so we have to filter out or modify empty files ourselves.

In [None]:
from llama_index.core.schema import NodeWithScore


def load_and_split_code(path: Path) -> list[NodeWithScore]:
    """Read the Python files under ``path`` and split them into code-hierarchy nodes.

    Args:
        path: Directory to load.

    Returns:
        The nodes returned by ``CodeHierarchyNodeParser.get_nodes_from_documents``,
        or an empty list when the directory is skipped or yields no documents.
    """
    print("Path: ", str(path))

    should_skip = path.is_dir() and _skip_directory(path)
    if should_skip:
        print("skipping")
        return []

    # Step 1: collect the documents found in the directory.
    docs = simple_directory_reader(str(path))
    if len(docs) == 0:
        return []
    print(f"Number of documents loaded in {path}: {len(docs)}")

    # Step 2: split the documents into hierarchy nodes.
    parser = CodeHierarchyNodeParser(
        language="python",
        code_splitter=CodeSplitter(language="python", max_chars=1000, chunk_lines=10),
    )
    print("...... code_hierarchy found")
    nodes = parser.get_nodes_from_documents(docs)
    print(len(nodes))

    # Sub-directories are not traversed here: simple_directory_reader
    # already reads the directory with recursive=True.
    return nodes


# Build hierarchy nodes for the notebook's parent directory (resolved to an absolute path).
# This rebinds `split_nodes`, replacing the CodeSplitter result from the earlier cell.
split_nodes = load_and_split_code(Path('../').resolve())

### Quick Demo of agent pack

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.packs.code_hierarchy import CodeHierarchyAgentPack
# Build the agent pack on top of the hierarchy nodes, using gpt-4 at temperature 0.2.
llm = OpenAI(model="gpt-4", temperature=0.2)
pack = CodeHierarchyAgentPack(split_nodes=split_nodes, llm=llm)

import pandas as pd
# NOTE(review): pandas is not used anywhere else in the visible cells; the reason for
# raising max_rows here is unclear — confirm or remove.
pd.options.display.max_rows = 4000
# Run the pack on one question and show at most the last 1000 lines (print_tail's default).
print_tail(
    pack.run(
        "How does the Codebaseloader class from work? Provide specific implementation details."
    )
)

# Technical Exploration

In [61]:
# Show the first node: file-level code with class/function bodies replaced by node_id references.
print_python(split_nodes[0].text)

```python
import os
import streamlit as st
import code_chunker
from embedding.llm_adapter import LLMAdapter
from github_interface import load_github_codebase
from embedding.embedding import CodeEmbedding  # Import the CodeEmbedding class
from dotenv import load_dotenv

from embedding.context_wrapper import Summarizer
from database.vector_store import VectorStore, VectorNode
from database.snippet_database import SnippetDatabase, Snippet
load_dotenv()

class CodebaseLoader:
    # Code replaced for brevity. See node_id ffad57b5-710b-489d-827b-ba0e3585377a
    






def main():
    # Code replaced for brevity. See node_id 9751f071-8a46-4560-9900-c34fb8fd7234






if __name__ == "__main__":
    main()```

In [62]:
# Compare a class-level node (method bodies stubbed out) with a node holding a full method body.
print_python(split_nodes[1].text)
print ('------')
print_python(split_nodes[3].text)

```python
class CodebaseLoader:
    def __init__(self, local_dir=None, github_repo=None):
        # Code replaced for brevity. See node_id eb9526c8-f26a-45e2-b4c6-d6ba98594788

    def load_codebase(self) -> list[Snippet]:
        # Code replaced for brevity. See node_id 7be30beb-bf49-4f6f-809e-dbd69310830c

    def __test(self, txt):
        return txt
    
    def extract_dir_structure(self, snippets: list[Snippet]):
        # Code replaced for brevity. See node_id 365a6daa-c6bc-4a6b-b9da-b916fb564398

    def __load_local_codebase(self, directory) -> list[Snippet]:
        # Code replaced for brevity. See node_id 571c79ff-06ff-42a3-b87c-cd3f47fffcb4

    @staticmethod
    def is_valid_file(filepath):
        # Code replaced for brevity. See node_id 28e2997e-6b7c-4293-b866-35ec06e4de9a```

------


```python
def load_codebase(self) -> list[Snippet]:
        if self.db.repo_exists(self.repo_id):
            print ("CodebaseLoader :  repo exists in relational DB")
            return self.db.load_snippets(self.repo_id)
        
        if self.github_repo:
            self.snippets = load_github_codebase(self.github_repo)
        elif self.local_dir:
            self.snippets = self.__load_local_codebase(self.local_dir)
        self.db.save_repo_dir_structure(self.repo_id, self.extract_dir_structure(self.snippets))
        return self.__test(self.snippets)```

In [63]:
# Derive the hierarchy from the flat node list; `code_structure` is the text outline printed here.
hierarchy, code_structure = CodeHierarchyNodeParser.get_code_hierarchy_from_nodes(split_nodes)
print("Code Hierarchy:")
print(code_structure)

Code Hierarchy:
- 
  - Users
    - shivanshj
      - repo-personal-projects
        - codebase2vec
          - app
            - CodebaseLoader
              - __init__
              - load_codebase
              - extract_dir_structure
              - __load_local_codebase
              - is_valid_file
            - main
              - make_embeddings
              - top_matches_from_vector_store
              - generate_code
          - code_chunker
            - chunk_code
            - Chunk
              - __post_init__
              - extract
              - extract_lines
              - __add__
              - __len__
            - _get_line_number_from_char_index
            - TextChunker
              - split_text
              - _chunk_node
              - __coalesce_chunks
            - BlockAwareCodeSplitter
              - __init__
              - split_text
              - _extract_blocks
              - _get_block_type
              - _get_block_name
              - _ov

# Load the query engine
Nodes can be looked up by class or function name.

In [13]:
from llama_index.packs.code_hierarchy import CodeHierarchyKeywordQueryEngine

# Keyword query engine over the hierarchy nodes; here it is queried with a class name.
query_engine = CodeHierarchyKeywordQueryEngine(
    nodes=split_nodes,
)
print_python(query_engine.query("CodebaseLoader").response)

```python
class CodebaseLoader:
    def __init__(self, local_dir=None, github_repo=None):
        # Code replaced for brevity. See node_id b35ea862-ac60-4a63-b441-3249dda0ed68

    def load_codebase(self) -> list[Snippet]:
        # Code replaced for brevity. See node_id 2211354a-788c-4191-9168-a18c3842bd4b

    def __test(self, txt):
        return txt
    
    def extract_dir_structure(self, snippets: list[Snippet]):
        # Code replaced for brevity. See node_id 7e84f9b2-1b22-41e3-98b1-70534a4876cd

    def __load_local_codebase(self, directory) -> list[Snippet]:
        # Code replaced for brevity. See node_id 28adfb79-d64c-46f5-b1aa-5c26c761b292

    @staticmethod
    def is_valid_file(filepath):
        # Code replaced for brevity. See node_id fc245a91-a80d-41d0-8b9c-8ba0bcfe41cf```

## Abstract agent

In [14]:
from llama_index.core.tools import QueryEngineTool

# Wrap the query engine as a tool named "code_lookup" so an agent can call it.
tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="code_lookup",
    description="Useful for looking up information about the code hierarchy codebase.",
)

# Render the tool instructions; the agent cell reuses the same text as its system prompt.
display(Markdown("Description: " + query_engine.get_tool_instructions()))

Description: Search the tool by any element in this list to get more information about that element.
If you see 'Code replaced for brevity' then a uuid, you may also search the tool with that uuid to see the full code.
You may need to use the tool multiple times to fully answer the user message.
The list is:
- 
  - Users
    - shivanshj
      - repo-personal-projects
        - codebase2vec
          - app
            - CodebaseLoader
              - __init__
              - load_codebase
              - extract_dir_structure
              - __load_local_codebase
              - is_valid_file
            - main
              - make_embeddings
              - top_matches_from_vector_store
              - generate_code
          - code_chunker
            - chunk_code
            - Chunk
              - __post_init__
              - extract
              - extract_lines
              - __add__
              - __len__
            - _get_line_number_from_char_index
            - TextChunker
              - split_text
              - _chunk_node
              - __coalesce_chunks
            - BlockAwareCodeSplitter
              - __init__
              - split_text
              - _extract_blocks
              - _get_block_type
              - _get_block_name
              - _overlaps
            - TestBlockAwareCodeSplitter
              - test_split_text
              - test_get_block_type
              - test_get_block_name
              - test_overlaps
          - code_graph
            - Relation
            - Node
          - database
            - snippet_database
              - SnippetDatabase
                - __init__
                - load_snippets
                - get_repo_dir_structure
                - make_repo_id
                - save_snippet
                - save_repo_dir_structure
                - repo_exists
              - TestSnippetDatabase
                - setUp
                - test_make_repo_id_github_url
                - test_make_repo_id_local_path
                - test_make_repo_id_unrecognized
            - vector_store
              - VectorNode
                - __init__
              - VectorStore
                - __init__
                - _connect_with_retry
                - does_embedding_exist
                - add_vectors
                - search
                - get_vectors_by_id
                - delete_nodes
                - _get_collection
                - _create_collection
                - delete_all_nodes
              - TestVectorStore
                - setUp
                - test_vector_connection
                - test_vector_search_by_id
                - tearDown
          - embedding
            - context_wrapper
              - Summarizer
                - context_of_snippet
                - generate_abstract_with_api
            - embedding
              - CodeEmbedding
                - __init__
                - generate_embeddings
                - find_k_nearest_neighbors
                - __cosine_similarity
              - EmbeddingStrategy
                - generate_embeddings
              - SentenceTransformerStrategy
                - generate_embeddings
              - LiteLLMStrategy
                - generate_embeddings
              - TokenizerStrategy
                - __init__
                - generate_embeddings
                - average_pool
            - llm_adapter
              - LLMAdapter
                - __init__
                - chat_completion
          - github_interface
            - fetch_github_repo_contents
            - load_github_codebase
          - openapi_understand
            - OpenAPIEmbedding
            - OpenAPISpecHandler
              - __init__
              - _load_openapi_spec
              - generate_embeddings
              - _create_endpoint_embeddings
              - find_endpoint_with_query
              - _get_endpoint_description
            - is_openapi_spec



In [15]:
from llama_index.llms.openai import OpenAI
from llama_index.packs.code_hierarchy import CodeHierarchyAgentPack
from llama_index.agent.openai import OpenAIAgent

# gpt-4 at temperature 0.1 for the tool-calling agent (this rebinds `llm` from the earlier demo cell).
llm = OpenAI(model="gpt-4", temperature=0.1)

# The tool instructions are passed as the system prompt.
agent = OpenAIAgent.from_tools(
    [tool], llm=llm, system_prompt=query_engine.get_tool_instructions(), verbose=True
)

In [16]:
# Ask the agent one question; the agent was built with verbose=True, so its tool calls are printed.
response = agent.chat(
    "How does the codebaseloader function work? Provide specific implementation details."
)

Added user message to memory: How does the codebaseloader function work? Provide specific implementation details.
=== Calling Function ===
Calling function: code_lookup with args: {
  "input": "CodebaseLoader"
}
Got output: class CodebaseLoader:
    def __init__(self, local_dir=None, github_repo=None):
        # Code replaced for brevity. See node_id b35ea862-ac60-4a63-b441-3249dda0ed68

    def load_codebase(self) -> list[Snippet]:
        # Code replaced for brevity. See node_id 2211354a-788c-4191-9168-a18c3842bd4b

    def __test(self, txt):
        return txt
    
    def extract_dir_structure(self, snippets: list[Snippet]):
        # Code replaced for brevity. See node_id 7e84f9b2-1b22-41e3-98b1-70534a4876cd

    def __load_local_codebase(self, directory) -> list[Snippet]:
        # Code replaced for brevity. See node_id 28adfb79-d64c-46f5-b1aa-5c26c761b292

    @staticmethod
    def is_valid_file(filepath):
        # Code replaced for brevity. See node_id fc245a91-a80d-41d0-8b9c-

In [73]:
# Print the agent's final answer as plain text.
print(str(response))

The `CodebaseLoader` class is responsible for loading a codebase from a given repository. Here's how it works:

1. **Initialization (`__init__`)**: The constructor takes two optional arguments: `local_dir` and `github_repo`. These represent the local directory and the GitHub repository from which the codebase is to be loaded. The specific implementation details of the constructor have been replaced for brevity.

2. **Loading the Codebase (`load_codebase`)**: This method loads the codebase from the repository. It first checks if the repository exists in the database. If it does, it loads the snippets from the database. If it doesn't, it checks if the repository is a GitHub repository or a local directory and loads the codebase accordingly. It then saves the directory structure of the repository to the database and returns the loaded snippets.

3. **Testing (`__test`)**: This method is used to test the loaded snippets. The specific implementation details have been replaced for brevity.



<div style="
    color: red;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 15px;
    margin: 10px 0;">

### Conclusion:
</div>

1. LlamaIndex doesn't work on empty files, so they have to be filtered out before parsing.

2. It can't provide abstracts of a function within a function.
    a. Example:
    ```python
    def func_A():
        return func_B()
    ```