# Document Loader

In [1]:
from langchain_community.document_loaders import GitLoader

In [2]:
def load_documents(path, branch):
    loader = GitLoader(
        repo_path=path,
        branch=branch
    )
    data = loader.load()
    return data

In [37]:
data = load_documents('/Users/pandoks/Projects/spotlight', 'setup')

In [4]:
print(len(data))
data_document = data[3].dict()
print(data_document)

8
{'page_content': 'from typing import List, TypedDict\nfrom langchain_community.embeddings.ollama import OllamaEmbeddings\nfrom langchain_openai import OpenAIEmbeddings\n\n\nclass OllamaEmbedDocumentsConfig(TypedDict):\n    texts: List[str]\n    model: str\n\n\ndef embed_documents(config: OllamaEmbedDocumentsConfig) -> List[List[float]]:\n    # embedder = OllamaEmbeddings(model=config["model"])\n    embedder = OpenAIEmbeddings()\n    embeddings = embedder.embed_documents(config["texts"])\n    return embeddings\n', 'metadata': {'source': 'retrieval/embeddings/langchain/ollama.py', 'file_path': 'retrieval/embeddings/langchain/ollama.py', 'file_name': 'ollama.py', 'file_type': '.py'}, 'type': 'Document'}


# Document Splitter

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
def split_text(documents, language, chunk_size, chunk_overlap):
    splitter = RecursiveCharacterTextSplitter.from_language(language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = splitter.split_documents(documents)
    return docs

In [7]:
split = split_text(data, "python", 1000, 100)

In [8]:
print(len(split))
print(split)

15
[Document(page_content='.nvim*', metadata={'source': '.gitignore', 'file_path': '.gitignore', 'file_name': '.gitignore', 'file_type': ''}), Document(page_content='import sys\nimport chromadb\nimport argparse\nimport uuid\nimport json\nfrom chromadb.utils import embedding_functions\n\nchromadb_client = None\nembedding_function = None\ncollection = None', metadata={'source': 'embeddings.py', 'file_path': 'embeddings.py', 'file_name': 'embeddings.py', 'file_type': '.py'}), Document(page_content='def main():\n    parser = argparse.ArgumentParser()\n    subparsers = parser.add_subparsers(dest="command")\n\n    store_parser = subparsers.add_parser("store")\n    store_parser.add_argument("--collection-name", type=str, required=True)\n    store_parser.add_argument("--model", type=str, required=True)\n    store_parser.add_argument("--db-location", type=str)\n    store_parser.add_argument("--file-location", type=str, required=True)\n\n    retrieve_parser = subparsers.add_parser("retrieve")\n 

In [9]:
texts = [doc.page_content for doc in split]
metadatas = [doc.metadata for doc in split]

# Embeddings

In [10]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

In [11]:
def embed_documents(texts, model):
    embedder = OllamaEmbeddings(model=model)
    embeddings = embedder.embed_documents(texts)
    return embeddings

In [12]:
# embeddings = embed_documents(texts, "llama3")

# Vector Store

In [13]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from hashlib import sha256

In [14]:
def setup_database(collection_name, embedding_function, persist_directory):
    database = Chroma(
        collection_name=collection_name,
        embedding_function=embedding_function,
        persist_directory=persist_directory
    )
    return database

In [15]:
database = setup_database("spotlight", OllamaEmbeddings(model="llama3-chatqa"), "/Users/pandoks/Projects/spotlight/.test")

In [16]:
def add(db, documents):
    documents_with_hashes = []
    for document in documents:
        document = document.dict()
        page_content = document["page_content"]
        hashed_page_content = sha256(page_content.encode("utf-8")).hexdigest()
        print(hashed_page_content)

        new_metadata = document["metadata"]
        new_metadata["hash"] = hashed_page_content

        query_filter = {
            "$and":
                [
                    {'source': new_metadata["source"]},
                    {'file_path': new_metadata["file_path"]},
                    {'file_name': new_metadata["file_name"]},
                    {'file_type': new_metadata["file_type"]},
                    {'hash': new_metadata["hash"]}
                ]
        }
        existing_documents = db.get(where=query_filter)
        print(existing_documents)
        if not len(existing_documents["ids"]):
            documents_with_hashes.append(Document(page_content, metadata=new_metadata))
        
    if not len(documents_with_hashes):
        return
    return db.add_documents(documents_with_hashes)

In [17]:
def delete(db, metadatas):
    for metadata in metadatas:
        query_filter_list = []
        for key, value in metadata.items():
            query_filter_list.append({key: value})
        query_filter = None
        if len(query_filter_list) == 1:
            query_filter = query_filter_list[0]
        else:
            query_filter = {
                "$and": query_filter_list
            }
        to_be_deleted_documents = db.get(where=query_filter)
        to_be_deleted_ids = to_be_deleted_documents["ids"]
        if not len(to_be_deleted_ids):
            return
        db.delete(ids=to_be_deleted_ids)
        return to_be_deleted_ids

In [18]:
def update(db, documents):
    metadatas = []
    for document in documents:
        metadata = document.dict()["metadata"]
        if metadata not in metadatas:
            metadatas.append(metadata)
    print(metadatas)
    delete(db, metadatas)
    return add(db, documents)

In [19]:
added_ids = add(database, split)

c9205effb86dc516823576184de6289b22fe2bcabec9821c8d289f424055dbea
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
a29b5858d0a2da42baeaa6182db28e72d576e5c60b1999aaf0e4d8a6068f42a7
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
b28bd2ae72559fa2e98c6839ac3d417e20303a39eb4aa6eba7565f7798e693b4
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
a2e4f8d5fd117c5ce705cb657cc4d14e9fb1a64449e621ccfc574077a9cc6a82
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
4370a904b403a602bb9eaef38c8028841566c864505a21eaf244134f431e28e1
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
07a29444784e21dd8a3f02c84d1f97e945b304f046dc3df2716147237d4d68b9
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
b6ddd2532873eedba34182a94240915cb10cab3e6fe61c

In [171]:
deleted_ids = delete(database, [{'file_type': '.py'}])

In [38]:
updated_ids = update(database, split)

[{'source': '.gitignore', 'file_path': '.gitignore', 'file_name': '.gitignore', 'file_type': ''}, {'source': 'embeddings.py', 'file_path': 'embeddings.py', 'file_name': 'embeddings.py', 'file_type': '.py'}, {'source': 'lua/spotlight/init.lua', 'file_path': 'lua/spotlight/init.lua', 'file_name': 'init.lua', 'file_type': '.lua'}, {'source': 'retrieval/embeddings/langchain/ollama.py', 'file_path': 'retrieval/embeddings/langchain/ollama.py', 'file_name': 'ollama.py', 'file_type': '.py'}, {'source': 'retrieval/loaders/langchain/git.py', 'file_path': 'retrieval/loaders/langchain/git.py', 'file_name': 'git.py', 'file_type': '.py'}, {'source': 'retrieval/splitters/langchain/code.py', 'file_path': 'retrieval/splitters/langchain/code.py', 'file_name': 'code.py', 'file_type': '.py'}, {'source': 'retrieval/stores/langchain/chroma.py', 'file_path': 'retrieval/stores/langchain/chroma.py', 'file_name': 'chroma.py', 'file_type': '.py'}]
c9205effb86dc516823576184de6289b22fe2bcabec9821c8d289f424055dbea


In [20]:
print(added_ids)

['c47ed45d-94ba-44f1-aee0-2dc5405c3ee2', '428abb56-e076-479e-9077-48fcd620cbad', '3a872450-26cb-45b9-8aa6-0555cdbc8b3a', '43c10404-4f01-425a-bbd3-c0ebf10cd008', 'd46c894c-37d3-478f-94e2-e3fa0fef0c58', 'ff9fe069-81e2-4ed0-90f2-3ed24494def1', 'eb78cc0e-2d54-4f0f-ab24-4056b11e7c1f', 'f93f67f3-05a4-4929-ae67-c60d37b0c3a6', '9db637c2-3668-41bf-ae00-e62ea2f97d9e', 'f1bd77a4-f5be-4c86-bc39-540feae5cf04', '8cf1a154-5b4b-49a1-a67b-23962f2fd48b', '3600e513-4df5-4080-b545-52f0b33aee5c', '705d0754-4e39-4883-bc60-946e9a7b2166', 'dfe75591-9d13-4b55-a537-2313f7dd4133', '3e107c0a-8e07-438f-bc1d-58c8fd52c31b']


In [173]:
print(deleted_ids)

['02950803-2033-47c9-9486-d353c44b0279', '04f52ff9-d13b-4004-aa37-2a542b4fbe36', '0895302d-ac9a-4cf1-a568-76eaf72dacc8', '09e031e7-e387-4568-8a59-99c907ff1ba4', '0a9a76d6-3337-4c78-81fa-1a4f54463931', '0f5b3525-3256-4ae9-8420-132c30a5e701', '0f8ecf4e-2da3-444b-a478-2ce9ef010ff1', '0f9da5b5-69c0-4e57-8e36-5e719588459b', '104a698d-0b9e-47af-9f8b-214d4f8db7ab', '11ab000d-5710-40cc-aa2a-5e39a0a96b35', '122ffc8c-e574-498d-a385-5dccc14e416c', '14ace4f9-bfea-43c8-a514-7fff5d4987a8', '17b5628f-796d-4ef1-9682-1cb54cc524d9', '1cb6e9fc-bd70-4b97-aebf-c46c0ea2982c', '1d3bec19-1eb7-48c6-94a7-bda255a59431', '1d636be2-4636-4ac8-9329-1b4bc8b1cafc', '1d857642-abad-48d8-9d18-c6c5802a3d44', '1de7550c-26ac-4cef-ba3c-dea3cef488a3', '1ec2c6ed-e6ee-444b-8546-b6dbb40e7041', '1ede520a-d0ee-48ca-bb2d-d4910902e90e', '1f622186-5b7b-4d9e-a243-edb244620a31', '20ec73b8-38c0-4cbe-a1a1-fbc2b0a9fafd', '231c42a2-b621-4f6d-bf3f-107a5336958f', '263e4b90-a35d-482c-84d6-003c2b2ff7f8', '269f8529-761a-45d6-b891-cfd8ae2913b7',

In [22]:
print(len(database.get()["ids"]))

0


In [39]:
storage = database.get()
metadatas = storage["metadatas"]
seenfiles = set()
for metadata in metadatas:
    filepath = metadata["file_path"]
    if filepath not in seenfiles:
        print(filepath)
        seenfiles.add(filepath)

.gitignore
lua/spotlight/init.lua
retrieval/embeddings/langchain/ollama.py
embeddings.py
retrieval/stores/langchain/chroma.py
retrieval/splitters/langchain/code.py
retrieval/loaders/langchain/git.py


# Retrieval

In [50]:
prompt = '''
"ollama"
'''

### Vector store-backed retriever

In [61]:
retriever = database.as_retriever(search_type="mmr")

In [62]:
documents = retriever.invoke(prompt)

### Native database query

In [59]:
documents = database.similarity_search(prompt)

### Parent Document Retriever

In [51]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [70]:
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=database,
    docstore=store,
    child_splitter=RecursiveCharacterTextSplitter.from_language("python", chunk_size=50, chunk_overlap=10),
    parent_splitter=RecursiveCharacterTextSplitter.from_language("python", chunk_size=200, chunk_overlap=10)
)

In [None]:
retriever.add_documents(data, ids=None)

In [54]:
list(store.yield_keys())

['3a615de4-fa9c-4117-92c1-e7af1df233b6',
 '7fb24242-156a-42b8-9905-9630e5237549',
 '490c9452-6b32-4a8c-833a-8f2473e22473',
 'ca21c008-c9f3-443e-93e8-a2786564c81b',
 '51e157b8-5497-4d48-b321-28671581735b',
 '645f402b-7627-4c99-bc85-ac3e0502fabb',
 '6b8f98ec-ea02-4a4e-b7b3-4ed958b26d1e',
 '13a278d3-b35c-419f-8430-e0b21e5ed97f']

In [None]:
documents = database.similarity_search(prompt)
print(documents[0].page_content)
retrieved_documents = retriever.invoke(prompt)
print(retrieved_documents)

### Custom Retriever

In [64]:
from langchain.retrievers import EnsembleRetriever

### Print Results

In [69]:
for document in documents:
    document = document.dict()
    print(document["metadata"]["file_path"])

embeddings.py
retrieval/stores/langchain/chroma.py
embeddings.py
lua/spotlight/init.lua
