In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from src.vectorizer.codebase_vectorizer import CodebaseVectorizer
from src.embedding.ollama_embedder import OllamaEmbedder
from src.vector_store.chroma_store import ChromaStore
from langchain_community.vectorstores import Chroma
from tqdm import tqdm

In [2]:
# Notebook-wide configuration (every tunable value lives in this cell)
CODEBASE_PATH = "./sample_codebase/tiovx"      # passed to the vectorizer as codebase_path
VECTOR_STORE_PATH = "./db/tiovx_vectors"       # passed to Chroma as persist_directory
MODEL = "unclemusclez/jina-embeddings-v2-base-code:f16"  # passed to the embedder as model_name
BATCH_SIZE = 32                                # shared by the vectorizer and the embedding loop

# File suffixes passed to the vectorizer as code_extensions, grouped by kind.
CODE_EXTENSIONS = {
    # source code
    '.py', '.js', '.jsx', '.ts', '.tsx', '.cpp', '.c', '.h',
    # build files and scripts
    '.mak', '.mk', '.cmake', '.sh',
    # documentation and structured data
    '.txt', '.md', '.json', '.yaml', '.yml', '.xml', '.html',
}


In [3]:
# Embedding client for the model named in MODEL.
# NOTE(review): the URL is presumably a locally running Ollama server and the
# timeout is presumably in seconds -- confirm against OllamaEmbedder.
embedder = OllamaEmbedder(url="http://localhost:11434", model_name=MODEL, timeout=120)


# Splitter that cuts each file into overlapping chunks before embedding.
# The separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)

In [4]:
# Wire the vectorizer to the notebook configuration, embedder and splitter.
vectorizer = CodebaseVectorizer(
    codebase_path=CODEBASE_PATH,
    embedder=embedder,
    text_splitter=text_splitter,
    code_extensions=CODE_EXTENSIONS,
    batch_size=BATCH_SIZE,
)

# vectorize_codebase() returns two values, unpacked here as the chunk texts and
# their metadata; both are fed to the batch-embedding step further down.
texts, metadatas = vectorizer.vectorize_codebase()

Starting vectorization of sample_codebase/tiovx
Found 889 code files
Created 17635 code chunks
Created 17635 code chunks


In [5]:
# Chroma store persisted under VECTOR_STORE_PATH, using `embedder` as its
# embedding function.
# NOTE(review): the captured cell output echoes this line, which looks like a
# warning -- possibly the deprecation of the langchain_community Chroma class.
# Confirm and consider migrating to the maintained package.
vector_store = Chroma(persist_directory=VECTOR_STORE_PATH, embedding_function=embedder)

  vector_store = Chroma(


In [6]:
def embed_documents_in_batches(texts, metadatas, batch_size=16, max_workers=4, store=None):
    """Add texts and their metadata to a vector store in fixed-size batches.

    Every batch is handed to ``add_texts`` on the target store, one batch at a
    time, with a tqdm progress bar labelled "Processing Batches".

    Args:
        texts: Sequence of strings to add; it is sliced into consecutive
            batches of ``batch_size`` items.
        metadatas: Sequence parallel to ``texts`` (one entry per text), or
            ``None`` to add the texts without metadata.
        batch_size: Number of texts per ``add_texts`` call. Must be >= 1.
        max_workers: Unused. Batches are processed sequentially; the parameter
            is kept only so existing calls that pass it keep working.
        store: Object exposing ``add_texts(texts=..., metadatas=...)``.
            Defaults to the notebook-level ``vector_store``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``metadatas`` is
            given and its length differs from ``texts``.

    NOTE(review): each call adds the texts again; re-running this against an
    already populated persisted store may create duplicate entries -- confirm
    how the store assigns ids.
    """
    # A negative step would make range() empty and silently embed nothing, so
    # reject bad sizes up front instead of relying on range().
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Misaligned inputs would attach metadata to the wrong chunks.
    if metadatas is not None and len(metadatas) != len(texts):
        raise ValueError(
            f"texts and metadatas must have the same length "
            f"({len(texts)} != {len(metadatas)})"
        )

    # Fall back to the notebook-level store only after validation has passed.
    target = vector_store if store is None else store

    for start in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
        stop = start + batch_size
        metadatas_batch = None if metadatas is None else metadatas[start:stop]
        target.add_texts(texts=texts[start:stop], metadatas=metadatas_batch)

# Push every chunk produced by the vectorizer into the vector store.
# max_workers is accepted by the function but not used by it.
embed_documents_in_batches(
    texts,
    metadatas,
    batch_size=BATCH_SIZE,
    max_workers=4,
)

Processing Batches: 100%|██████████| 552/552 [09:30<00:00,  1.03s/it]


In [7]:
def query_codebase(vector_store, query, top_k=5):
    """Run a similarity search and print each hit.

    Calls ``vector_store.similarity_search_with_score(query, k=top_k)`` and, for
    every ``(document, score)`` pair returned, prints a 1-based result number,
    the score, the document's ``page_content`` and its ``metadata``, followed
    by a blank line.

    Args:
        vector_store: Object exposing ``similarity_search_with_score``.
        query: Query passed through unchanged to the store.
        top_k: Number of results requested from the store.

    Returns:
        None. The results are only printed.
    """
    hits = vector_store.similarity_search_with_score(query, k=top_k)
    for rank, hit in enumerate(hits, start=1):
        document, hit_score = hit
        report = (
            f"Result {rank}:",
            f"Score: {hit_score}",
            f"Content: {document.page_content}",
            # The trailing newline leaves a blank line between results.
            f"Metadata: {document.metadata}\n",
        )
        print("\n".join(report))


In [14]:
# Search the indexed codebase for a symbol name and print the closest chunks.
# NOTE(review): the query string ends with a space; it is kept as written, but
# it looks unintentional -- confirm.
query, top_k = "vxEnableGraphStreaming ", 5

query_codebase(vector_store, query, top_k=top_k)

Result 1:
Score: 0.43948811292648315
Content: return status;
}

VX_API_ENTRY vx_status VX_API_CALL vxEnableGraphStreaming(vx_graph graph, vx_node trigger_node)
{
    vx_status status = (vx_status)VX_ERROR_INVALID_PARAMETERS;

    if(ownIsValidSpecificReference(vxCastRefFromGraph(graph), (vx_enum)VX_TYPE_GRAPH) != (vx_bool)vx_false_e)
    {
        graph->is_streaming_enabled = (vx_bool)vx_true_e;
        graph->is_pipelining_enabled = (vx_bool)vx_true_e;

        status = (vx_status)VX_SUCCESS;

        if(ownIsValidSpecificReference(vxCastRefFromNode(trigger_node), (vx_enum)VX_TYPE_NODE) != (vx_bool)vx_false_e)
        {
            int32_t i;

            for (i = 0; i < (int32_t)graph->num_nodes; i++)
            {
                if (graph->nodes[i] == trigger_node)
                {
                    graph->trigger_node_index = (uint32_t)i;
                    graph->trigger_node_set = (vx_bool)vx_true_e;
                    break;
                }
            }
Metadata: {'chu